In [21]:
import pandas as pd
from collections import Counter
from tqdm import tqdm

In [22]:
SEED = 41

In [23]:
df = pd.read_csv('icdc\\train.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1700 entries, 0 to 1699
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ben     1700 non-null   object
 1   guj     1700 non-null   object
 2   hin     1700 non-null   object
 3   kan     1700 non-null   object
 4   mal     1700 non-null   object
 5   ori     1700 non-null   object
 6   pan     1700 non-null   object
 7   tam     1700 non-null   object
 8   tel     1700 non-null   object
 9   urd     1700 non-null   object
 10  eng     1700 non-null   object
dtypes: object(11)
memory usage: 146.2+ KB


In [24]:
# Build one big lower-cased string from every cell of the training frame; it is
# only used to count characters for the vocabulary.
# The listed symbols are deleted first. '-', '*' and '^' are in the list because
# the notebook reserves them as PAD_NULL / PAD_START / PAD_END markers.
_CORPUS_NOISE = ('–', '$', '&', '[', ']', '“', '”', '=', '৷', '`', 'ؑ', '}', '-', '*', '^')

# Collect the cleaned rows and join once at the end instead of growing the
# string with `+=` on every iteration.
_row_texts = []
for i in tqdm(range(len(df))):
    _row_text = ''.join(df.iloc[i]).lower()
    for _ch in _CORPUS_NOISE:
        _row_text = _row_text.replace(_ch, '')
    _row_texts.append(_row_text)
allTexts = ''.join(_row_texts)

  0%|          | 0/1700 [00:00<?, ?it/s]

100%|██████████| 1700/1700 [00:01<00:00, 1632.05it/s]


In [25]:
hinglish_res = Counter(allTexts)
# sorted(list(dict(hinglish_res).items()), key = lambda x: x[1], reverse=True)
charsVocab = list(dict(hinglish_res).items())

In [26]:
PAD_NULL = '-'
PAD_START = '*'
PAD_END = '^'

vocab = [PAD_NULL, PAD_START, PAD_END]+[i[0] for i in charsVocab]

IDX_PAD_NULL = vocab.index(PAD_NULL)

len(vocab), IDX_PAD_NULL

(63, 0)

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau#, StepLR, ExponentialLR
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader, Subset, random_split


import numpy as np
import random
import os

In [28]:
def add_extraToken(texts, startToken=True, endToken=True):
    """Wrap every text with the start and/or end marker character.

    Args:
        texts: list of strings.
        startToken: prepend PAD_START to each text when truthy.
        endToken: append PAD_END to each text when truthy.

    Returns:
        A new list of wrapped strings, or the very same `texts` object when
        neither marker is requested.
    """
    if not (startToken or endToken):
        # Nothing to add: hand the caller's list back untouched.
        return texts
    prefix = PAD_START if startToken else ''
    suffix = PAD_END if endToken else ''
    return [prefix + text + suffix for text in texts]

def remove_extraToken(texts:list[str])->list[str]:
    """Lower-case each text and delete noise symbols and the reserved markers.

    Despite the name this is the general text cleaner: besides PAD_START,
    PAD_END and PAD_NULL it also strips a fixed set of punctuation characters.

    Args:
        texts: list of strings to clean.

    Returns:
        A new list with one cleaned string per input string.
    """
    # Removal order matches the original chain: punctuation first, markers last.
    unwanted = ('–', '$', '&', '[', ']', '“', '”', '=', '৷', '`', 'ؑ', '}',
                PAD_START, PAD_END, PAD_NULL)
    cleaned = []
    for text in texts:
        lowered = text.lower()
        for token in unwanted:
            lowered = lowered.replace(token, '')
        cleaned.append(lowered)
    return cleaned

def preprocesser(texts: list[str], prePadding=False, startToken=True, endToken=True, batch_first=False):
    """Encode raw strings as a padded tensor of vocabulary indices.

    Each text is cleaned with `remove_extraToken`, optionally wrapped with the
    start/end markers, and mapped character by character through `vocab`.
    Characters that are not in `vocab` are dropped silently.

    Args:
        texts: strings to encode (must be non-empty; an empty list raises,
            as it did before).
        prePadding: pad with IDX_PAD_NULL on the left when True, on the right
            when False.
        startToken: forwarded to `add_extraToken`.
        endToken: forwarded to `add_extraToken`.
        batch_first: return shape (batch, seq) when True, else (seq, batch).

    Returns:
        int64 tensor holding every sequence padded to the longest one.
    """
    texts = add_extraToken(remove_extraToken(texts), startToken, endToken)

    # One dict lookup per character instead of two linear scans of `vocab`
    # (`c in vocab` + `vocab.index(c)`). setdefault keeps the first index of a
    # symbol, exactly what list.index would return.
    char_to_idx = {}
    for idx, ch in enumerate(vocab):
        char_to_idx.setdefault(ch, idx)
    text_ints = [[char_to_idx[c] for c in text if c in char_to_idx] for text in texts]

    if prePadding:
        # Left-pad explicitly with the pad index so every row has equal length.
        max_length = max(len(seq) for seq in text_ints)
        padded_seqs = torch.tensor(
            [[IDX_PAD_NULL] * (max_length - len(seq)) + seq for seq in text_ints],
            dtype=torch.int64,
        )
    else:
        padded_seqs = pad_sequence(
            [torch.LongTensor(seq) for seq in text_ints],
            batch_first=True,
            padding_value=IDX_PAD_NULL,
        )

    return padded_seqs if batch_first else padded_seqs.T


preprocesser(['hiir', 'laksfffh'], startToken=True, endToken=False)

tensor([[ 1,  1],
        [14, 19],
        [ 4, 13],
        [ 4, 17],
        [12,  3],
        [ 0, 36],
        [ 0, 36],
        [ 0, 36],
        [ 0, 14]])

In [29]:
class CustomDataset(Dataset):
    """Dataset of pre-built, length-sorted batches from the module-level `df`.

    Every column of `df` is one class: the column position is the label and
    each cell is one sample. Samples are sorted by cleaned length before being
    cut into batches, so each batch holds texts of similar length. Each item of
    this dataset is therefore a whole batch, not a single sample; wrap it in a
    DataLoader with `batch_size=1`.
    """

    def __init__(self, batch_size=64):
        """Collect all samples from `df`, sort by length and pre-collate batches.

        Args:
            batch_size: number of samples per pre-built batch (the last batch
                may be smaller).
        """
        samples = []
        for label, col in enumerate(df.columns):
            originals = df[col].tolist()
            # Reuse the shared cleaner instead of a hand-copied replace chain;
            # the cleaned text is only used as the sort key and model input.
            cleaned = remove_extraToken(originals)
            samples.extend(zip(cleaned, [label] * len(originals), originals))

        # list.sort is stable, so samples of equal length keep column/row order.
        samples.sort(key=lambda sample: len(sample[0]))

        self.batched = [
            self.custom_collate_fn(samples[start:start + batch_size])
            for start in range(0, len(samples), batch_size)
        ]

    def custom_collate_fn(self, batch):
        """Turn a list of (cleaned_text, label, original_text) into model inputs.

        Returns:
            (index tensor from `preprocesser`, float32 one-hot label matrix,
            list of the original texts).
        """
        x = [sample[0] for sample in batch]
        y = [sample[1] for sample in batch]
        real = [sample[2] for sample in batch]
        # One class per column of `df` (11 in the training file).
        one_hot = F.one_hot(torch.tensor(y), num_classes=len(df.columns)).to(torch.float32)
        return preprocesser(x), one_hot, real

    def __len__(self):
        """Number of pre-built batches."""
        return len(self.batched)

    def __getitem__(self, idx):
        """Return the pre-built batch at position `idx`."""
        return self.batched[idx]

# Smoke test: build the dataset and pull one batch out of it.
# Batching happens inside CustomDataset (batch_size=64), so the DataLoader uses
# batch_size=1 and only shuffles the order of the pre-built batches.
custom_dataset = CustomDataset(batch_size=64)  # every item is already a full batch
data_loader = DataLoader(custom_dataset, batch_size=1, shuffle=True)

# Seed before iterating so the shuffled order is repeatable.
torch.manual_seed(SEED)
# Take the first batch only; `sequences` and `labels` stay defined at module level.
for batch in data_loader:
    sequences, labels, _ = batch
    # Drop the leading size-1 dimension added by the DataLoader (in place).
    sequences.squeeze_(0)
    labels.squeeze_(0)
    break

In [30]:
class Encoder(nn.Module):
    """Character-level LSTM classifier: embedding -> LSTM -> linear read-out.

    Args:
        embedding_dim: size of each character embedding.
        hidden_size: LSTM hidden state size.
        num_layers: number of stacked LSTM layers.
        vocab_size: number of rows in the embedding table.
        p: dropout probability, used after the embedding and between LSTM layers.
        num_classes: number of output scores per sample.
    """

    def __init__(self, embedding_dim, hidden_size, num_layers, vocab_size, p=0, num_classes=11):
        super().__init__()
        # Submodule creation order and attribute names are kept as they were:
        # they determine seeded initialisation and the state_dict keys.
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers,
            dropout=p,
            bidirectional=False,
        )
        self.fc = nn.Linear(hidden_size, num_classes)
        self.num_layers = num_layers
        self.hidden_size = hidden_size

    def forward(self, x):
        """Map index tensor `x` of shape (seq_len, batch) to (batch, num_classes) scores."""
        embedded = self.dropout(self.embedding(x))  # (seq_len, batch, embedding_dim)
        step_outputs, _state = self.lstm(embedded)  # (seq_len, batch, hidden_size)
        # Classify from the output at the final time step.
        # NOTE(review): for right-padded batches that step is a padding position
        # for the shorter sequences - confirm this is intended.
        last_step = step_outputs[-1]
        return self.fc(last_step)


# Create an LSTM model
# model = Encoder(50, 128, 2, vocab_size=len(vocab)).to(DEVICE)
# x = sequences
# y = labels
# print(x.shape)
# model(x).shape

In [31]:
# hyperparameters
LR = 0.001
EMBEDDING_SIZE = 50
HIDDEN_SIZE = 128
NUM_LAYERS = 2
P = 0.5
BATCH_SIZE = 64
EPOCHS = 100
TRAIN_SIZE = .8
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [32]:
import time, math

def time_since(since):
    """Format the wall-clock time elapsed since `since` as '<minutes>m <seconds>s'.

    Args:
        since: a timestamp previously obtained from `time.time()`.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    # int() truncates the fractional seconds, as '%d' formatting does.
    return f'{int(minutes)}m {int(seconds)}s'

def accuracy(model, data_loader):
    """Return the percentage (0-100) of correctly classified samples.

    Args:
        model: classifier called as `model(sequences)`, returning one row of
            class scores per sample.
        data_loader: yields `(sequences, one_hot_labels, _)` with a leading
            size-1 dimension on both tensors (removed here with squeeze(0)).

    Returns:
        `100 * correct / total` as a float.

    Raises:
        ZeroDivisionError: if the loader yields no samples.

    Side effect: the model is left in training mode on return, regardless of
    the mode it was in when called.
    """
    model.eval()

    correct = 0
    total = 0

    # Inference only: no_grad avoids building autograd graphs for every batch.
    with torch.no_grad():
        for (sequences, labels, _) in data_loader:
            sequences = sequences.squeeze(0).to(DEVICE)
            # One-hot rows -> class indices.
            labels = labels.squeeze(0).to(DEVICE).argmax(dim=1)
            predicted = model(sequences).argmax(dim=1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    model.train()
    return 100 * correct / total

In [33]:
# Build the dataset of pre-collated batches (BATCH_SIZE samples each).
custom_dataset = CustomDataset(BATCH_SIZE)

# The split is over whole batches, not individual samples. Use the TRAIN_SIZE
# hyperparameter instead of repeating the literal 0.8.
train_size = int(TRAIN_SIZE * len(custom_dataset))
test_size = len(custom_dataset) - train_size

# A dedicated seeded generator makes the train/test split repeatable.
train_dataset, test_dataset = random_split(
    custom_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SEED),
)

# batch_size=1 because every dataset item already is a full batch.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

model = Encoder(EMBEDDING_SIZE, HIDDEN_SIZE, NUM_LAYERS, vocab_size=len(vocab), p=P, num_classes=11).to(device=DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# The scheduler is stepped with the validation loss in the training loop.
scheduler = ReduceLROnPlateau(optimizer, patience=10)
criterion = nn.CrossEntropyLoss()

In [16]:
# Load previously saved weights into the model.
# map_location remaps the stored tensors onto the device in use, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
# (The file is called "gru" but holds the weights of the LSTM Encoder.)
model.load_state_dict(torch.load('models_icdc\\gru.model.pth', map_location=DEVICE))

<All keys matched successfully>

In [34]:
# Train for EPOCHS epochs, measuring the loss on the held-out batches after each one.
model.train()
start = time.time()
for epoch in range(EPOCHS):
    total_loss = 0
    # Training phase: one optimizer step per pre-built batch.
    model.train()
    for (sequences, labels, _) in train_loader:
        # squeeze(0) removes the size-1 dimension added by the DataLoader.
        sequences = sequences.squeeze(0).to(DEVICE)
        labels = labels.squeeze(0).to(DEVICE)
        
        output = model(sequences)
        
        # `labels` are the float one-hot rows built in CustomDataset.custom_collate_fn.
        loss = criterion(output, labels)
        total_loss += loss.item()
    
        model.zero_grad()
        loss.backward()
        optimizer.step()
    
    # Validation phase: eval mode, no gradients.
    valid_loss = 0
    model.eval()
    with torch.no_grad():
        for (sequences, labels, _) in test_loader:
            sequences = sequences.squeeze(0).to(DEVICE)
            labels = labels.squeeze(0).to(DEVICE)
        
            output = model(sequences)
        
            loss = criterion(output, labels)
            valid_loss += loss.item()
        
    # Both losses are sums over batches; the printed test loss is rescaled by
    # len(train_loader)/len(test_loader) so the two numbers are comparable.
    # `epoch` is printed zero-based.
    print('[{}] Train Epoch: [{}/{}] \tLoss: {:.2f} Test Loss: {:.2f}'.format(
            time_since(start), epoch, EPOCHS,
            total_loss, valid_loss*len(train_loader)/len(test_loader)))
    
    # The scheduler sees the unscaled summed validation loss.
    scheduler.step(valid_loss)

[0m 3s] Train Epoch: [0/100] 	Loss: 418.15 Test Loss: 307.27
[0m 5s] Train Epoch: [1/100] 	Loss: 273.46 Test Loss: 212.90
[0m 7s] Train Epoch: [2/100] 	Loss: 230.30 Test Loss: 184.81
[0m 9s] Train Epoch: [3/100] 	Loss: 196.22 Test Loss: 153.47
[0m 11s] Train Epoch: [4/100] 	Loss: 166.34 Test Loss: 123.70
[0m 13s] Train Epoch: [5/100] 	Loss: 149.79 Test Loss: 113.85
[0m 15s] Train Epoch: [6/100] 	Loss: 133.85 Test Loss: 101.53
[0m 16s] Train Epoch: [7/100] 	Loss: 122.61 Test Loss: 92.15
[0m 18s] Train Epoch: [8/100] 	Loss: 110.83 Test Loss: 90.33
[0m 20s] Train Epoch: [9/100] 	Loss: 104.45 Test Loss: 77.36
[0m 21s] Train Epoch: [10/100] 	Loss: 94.91 Test Loss: 72.82
[0m 23s] Train Epoch: [11/100] 	Loss: 87.04 Test Loss: 72.14
[0m 25s] Train Epoch: [12/100] 	Loss: 77.93 Test Loss: 62.44
[0m 26s] Train Epoch: [13/100] 	Loss: 74.27 Test Loss: 67.58
[0m 28s] Train Epoch: [14/100] 	Loss: 69.72 Test Loss: 60.27
[0m 30s] Train Epoch: [15/100] 	Loss: 64.76 Test Loss: 58.71
[0m 31s] Train Epoch:

In [81]:
# Save the model
torch.save(model.state_dict(), 'models_icdc\\gru.model.pth')

In [36]:
print('Train Accuracy: {:.2f}%'.format(accuracy(model, train_loader)))
print('Test Accuracy: {:.2f}%'.format(accuracy(model, test_loader)))

Train Accuracy: 99.96%
Test Accuracy: 96.56%


# OTHER BEST MODELS

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB # best
from sklearn.svm import SVC # best
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pickle

In [19]:
# Rebuild plain-text train/test sets for the scikit-learn models from the same
# loaders the neural model uses, so every model sees the same split.
X_train, y_train = [], []
for (_, labels, real) in train_loader:
    # Each entry of `real` is indexed with [0]; presumably the DataLoader
    # (batch_size=1) wraps every original string in a 1-element container - verify.
    X_train += [i[0] for i in real]
    # One-hot rows -> integer class ids.
    y_train += labels.squeeze(0).argmax(dim=1).numpy().tolist()
    
X_test, y_test = [], []
for (_, labels, real) in test_loader:
    X_test += [i[0] for i in real]
    y_test += labels.squeeze(0).argmax(dim=1).numpy().tolist()

# Fit TF-IDF on the training texts only, then apply it to the test texts.
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [38]:
# Load a previously saved TF-IDF vectorizer; this replaces any `vectorizer`
# already defined in the session.
# NOTE(security): pickle.load can execute arbitrary code from the file - only
# load artifacts written by this notebook.
with open('models_icdc\\vectorizer.states.pkl','rb') as f: 
    vectorizer = pickle.load(f)

In [None]:
# save
with open('models_icdc\\vectorizer.states.pkl','wb') as f: pickle.dump(vectorizer,f)

# LogisticRegression

In [49]:
# Load the saved logistic-regression model and score it on the held-out features.
# NOTE(security): pickle.load can execute arbitrary code - only load trusted files.
with open('models_icdc\\lr.model.pkl', 'rb') as f:
    lr_classifier = pickle.load(f)
pred_lr = lr_classifier.predict(X_test_vect)
# Reuse the predictions above instead of running predict() a second time.
print("Accuracy:", accuracy_score(y_test, pred_lr))

Accuracy: 0.9679555084745762


In [None]:
# Train logistic regression on the TF-IDF features, save it, and report test accuracy.
lr_classifier = LogisticRegression(max_iter=300)
lr_classifier.fit(X_train_vect, y_train)

with open('models_icdc\\lr.model.pkl','wb') as f:
    pickle.dump(lr_classifier, f)

pred_lr = lr_classifier.predict(X_test_vect)
print("Accuracy:", accuracy_score(y_test, pred_lr))

# naive_bayes

In [51]:
# Load the saved naive-Bayes model and score it on the held-out features.
# NOTE(security): pickle.load can execute arbitrary code - only load trusted files.
with open('models_icdc\\nb.model.pkl', 'rb') as f:
    nb_classifier = pickle.load(f)
pred_nb = nb_classifier.predict(X_test_vect)
# Reuse the predictions above instead of running predict() a second time.
print("Accuracy:", accuracy_score(y_test, pred_nb))

Accuracy: 0.9769597457627118


In [95]:
# Train multinomial naive Bayes on the TF-IDF features, save it, and report test accuracy.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vect, y_train)

with open('models_icdc\\nb.model.pkl','wb') as f:
    pickle.dump(nb_classifier, f)

pred_nb = nb_classifier.predict(X_test_vect)
print("Accuracy:", accuracy_score(y_test, pred_nb))

Accuracy: 0.9769597457627118


# RandomForestClassifier

In [52]:
# Load the saved random-forest model and score it on the held-out features.
# NOTE(security): pickle.load can execute arbitrary code - only load trusted files.
with open('models_icdc\\rf.model.pkl', 'rb') as f:
    rf_classifier = pickle.load(f)
pred_rf = rf_classifier.predict(X_test_vect)
# Reuse the predictions above instead of running predict() a second time.
print("RandomForestClassifier F1 score: ", f1_score(y_test, pred_rf, average='weighted'))

RandomForestClassifier F1 score:  0.9245796017973961


In [97]:
# Train a random forest on the TF-IDF features, save it, and report weighted F1.
# random_state pins tree construction to the notebook's SEED so that
# retraining reproduces the same model.
rf_classifier = RandomForestClassifier(random_state=SEED)
rf_classifier.fit(X_train_vect, y_train)
pred_rf = rf_classifier.predict(X_test_vect)
with open('models_icdc\\rf.model.pkl','wb') as f: pickle.dump(rf_classifier,f)
print("RandomForestClassifier F1 score: ", f1_score(y_test, pred_rf, average='weighted'))

RandomForestClassifier F1 score:  0.9245796017973961


# XGBClassifier

In [102]:
from xgboost import XGBClassifier

In [53]:
# Load the saved XGBoost model and score it on the held-out features.
# NOTE(security): pickle.load can execute arbitrary code - only load trusted files.
with open('models_icdc\\xgb.model.pkl', 'rb') as f:
    xgb_classifier = pickle.load(f)
pred_xgb = xgb_classifier.predict(X_test_vect)
# Reuse the predictions above instead of running predict() a second time.
print("XGBClassifier F1 score: ", f1_score(y_test, pred_xgb, average='weighted'))

XGBClassifier F1 score:  0.9020493021687092


In [104]:
# Train XGBoost on the TF-IDF features, save it, and report weighted F1.
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train_vect, y_train)

with open('models_icdc\\xgb.model.pkl','wb') as f:
    pickle.dump(xgb_classifier, f)

pred_xgb = xgb_classifier.predict(X_test_vect)
print("XGBClassifier F1 score: ", f1_score(y_test, pred_xgb, average='weighted'))

XGBClassifier F1 score:  0.9020493021687092


# SVM

In [54]:
# Load the saved SVM model and score it on the held-out features.
# NOTE(security): pickle.load can execute arbitrary code - only load trusted files.
with open('models_icdc\\svm.model.pkl', 'rb') as f:
    svm_model = pickle.load(f)
pred_SVM = svm_model.predict(X_test_vect)
# Reuse the predictions above instead of running predict() a second time.
print("Accuracy:", accuracy_score(y_test, pred_SVM))

Accuracy: 0.9690148305084746


In [106]:
# Train a linear SVM on the TF-IDF features, save it, and report test accuracy.
# probability=True enables predict_proba, which the ensemble needs; its
# calibration shuffles data internally, so random_state is pinned to SEED
# to make retraining repeatable.
svm_model = SVC(kernel='linear', probability=True, random_state=SEED)
svm_model.fit(X_train_vect, y_train)
pred_SVM = svm_model.predict(X_test_vect)
with open('models_icdc\\svm.model.pkl','wb') as f: pickle.dump(svm_model,f)
print("Accuracy:", accuracy_score(y_test, pred_SVM))

Accuracy: 0.9690148305084746


# DecisionTreeClassifier

In [55]:
# Load the saved decision-tree model and score it on the held-out features.
# NOTE(security): pickle.load can execute arbitrary code - only load trusted files.
with open('models_icdc\\dtc.model.pkl', 'rb') as f:
    DTC = pickle.load(f)
pred_DTC = DTC.predict(X_test_vect)
# Reuse the predictions above instead of running predict() a second time.
print("Accuracy:", accuracy_score(y_test, pred_DTC))

Accuracy: 0.848781779661017


In [108]:
# Train a decision tree on the TF-IDF features, save it, and report test accuracy.
# random_state pins the tree's tie-breaking to SEED so retraining is repeatable.
DTC = DecisionTreeClassifier(random_state=SEED)
DTC.fit(X_train_vect, y_train)
pred_DTC = DTC.predict(X_test_vect)
with open('models_icdc\\dtc.model.pkl','wb') as f: pickle.dump(DTC,f)
print("Accuracy:", accuracy_score(y_test, pred_DTC))

Accuracy: 0.848781779661017


# scores

In [56]:
# Collect true and predicted class ids of the neural model on the test batches.
# The variables are named *_gru, but `model` is the Encoder defined above,
# which is built on nn.LSTM.
model.eval()
test_gru = []
pred_gru = []
with torch.no_grad():    
    for (sequences, labels, _) in test_loader: # held-out batches
        sequences = sequences.squeeze(0).to(DEVICE)
        # One-hot rows -> class ids.
        labels = labels.squeeze(0).to(DEVICE).argmax(dim=1)
        # Forward pass
        predicted = model(sequences).argmax(dim=1)
        test_gru.append(labels.cpu().numpy())
        pred_gru.append(predicted.cpu().numpy())
model.train()
# Flatten the per-batch arrays into one array each.
test_gru = np.concatenate(test_gru, axis = 0)
pred_gru = np.concatenate(pred_gru, axis = 0)


def model_predict(X):
    """Run the neural model on raw texts and return its raw class scores.

    Args:
        X: list of strings; encoded with `preprocesser` using its defaults.

    Returns:
        NumPy array of unnormalised scores, one row per text.

    Side effect: the model is switched back to training mode before returning.
    """
    model.eval()
    encoded = preprocesser(X).to(DEVICE)
    with torch.no_grad():
        scores = model(encoded)
    outputs = scores.cpu().numpy()
    model.train()
    return outputs

In [57]:
# Side-by-side metrics for every single model on the held-out split.
# The "GRU" rows use test_gru/pred_gru, i.e. the neural Encoder's predictions.
print("Logistic Regression F1 score: ", f1_score(y_test, pred_lr,average='weighted'))
print("Naive Bayes F1 score: ", f1_score(y_test, pred_nb,average='weighted'))
print("SVM F1 score: ", f1_score(y_test, pred_SVM,average='weighted'))
print("Decission Tree Classifier F1 score: ",f1_score(y_test, pred_DTC,average='weighted'))
print("GRU F1 score: ", f1_score(test_gru, pred_gru,average='weighted'))
print()
print("Logistic Regression Accuracy: ", accuracy_score(y_test, pred_lr))
print("Naive Bayes Accuracy: ", accuracy_score(y_test, pred_nb))
print("SVM Accuracy: ", accuracy_score(y_test, pred_SVM))
print("Decission Tree Classifier Accuracy: ",accuracy_score(y_test, pred_DTC))
print("GRU Accuracy: ", accuracy_score(test_gru, pred_gru))
print()
# NOTE(review): MSE is computed on integer class ids, so its size depends on
# the arbitrary class numbering - confirm it is meant as a metric here.
print("Logistic Regression MSE: ", mean_squared_error(y_test, pred_lr))
print("Naive Bayes MSE: ", mean_squared_error(y_test, pred_nb))
print("SVM MSE: ", mean_squared_error(y_test, pred_SVM))
print("Decission Tree Classifier MSE: ",mean_squared_error(y_test, pred_DTC))
print("GRU MSE: ", mean_squared_error(test_gru, pred_gru))

Logistic Regression F1 score:  0.9680349579067693
Naive Bayes F1 score:  0.9767645507998631
SVM F1 score:  0.9691697667143868
Decission Tree Classifier F1 score:  0.8520085722918403
GRU F1 score:  0.9656513890863466

Logistic Regression Accuracy:  0.9679555084745762
Naive Bayes Accuracy:  0.9769597457627118
SVM Accuracy:  0.9690148305084746
Decission Tree Classifier Accuracy:  0.848781779661017
GRU Accuracy:  0.965572033898305

Logistic Regression MSE:  0.816207627118644
Naive Bayes MSE:  0.5238347457627118
SVM MSE:  0.7915783898305084
Decission Tree Classifier MSE:  4.112023305084746
GRU MSE:  0.8442796610169492


In [123]:
def prob(arr:np.ndarray, gap_adjuster:int=3)->np.ndarray:
    """Turn raw scores into non-negative weights that sum to 1 along the last axis.

    Each row (or the whole vector for 1-D input) is min-max scaled to [0, 1],
    raised to the power `gap_adjuster` to widen the gap between the top score
    and the rest, and divided by its sum.

    Args:
        arr: 1-D score vector or array whose last axis indexes classes.
        gap_adjuster: exponent applied after min-max scaling; 1 disables it.

    Returns:
        Array of the same shape as `arr`. A row whose scores are all equal
        yields a uniform distribution instead of NaN from a 0/0 division.
    """
    lowest = arr.min(axis=-1, keepdims=True)
    span = arr.max(axis=-1, keepdims=True) - lowest
    # Rows with max == min carry no ranking information; avoid dividing by zero.
    flat = span == 0
    safe_span = np.where(flat, np.ones_like(span), span)
    scaled = (arr - lowest) / safe_span
    # Flat rows become all ones so they normalise to a uniform distribution.
    scaled = np.where(flat, np.ones_like(scaled), scaled)
    if gap_adjuster != 1:
        scaled = scaled ** gap_adjuster
    return scaled / scaled.sum(axis=-1, keepdims=True)

In [59]:
["{:.4f} {:.4f} {:.4f}".format(i1, i2, i3) for i1, i2, i3 in zip(
        lr_classifier.predict_proba(X_test_vect[:1])[0], 
        nb_classifier.predict_proba(X_test_vect[:1])[0],
        svm_model.predict_proba(X_test_vect[:1])[0]
    )]

['0.0020 0.0058 0.0000',
 '0.0018 0.0057 0.0000',
 '0.9682 0.8851 1.0000',
 '0.0022 0.0036 0.0000',
 '0.0028 0.0047 0.0000',
 '0.0028 0.0064 0.0000',
 '0.0013 0.0054 0.0000',
 '0.0026 0.0048 0.0000',
 '0.0043 0.0067 0.0000',
 '0.0093 0.0609 0.0000',
 '0.0025 0.0108 0.0000']

In [60]:
model.eval()
with torch.no_grad():
    tmp = model(sequences).cpu().numpy()

In [61]:
["{:.4f} {:.4f}".format(i1, i2) for i1, i2 in zip(prob(tmp[0]).tolist(), tmp[0])]

['0.0000 -5.5980',
 '0.0027 -2.5017',
 '0.0001 -4.5728',
 '0.0104 -0.7229',
 '0.0281 1.1952',
 '0.0012 -3.2496',
 '0.0006 -3.6695',
 '0.9169 16.1029',
 '0.0086 -1.0236',
 '0.0061 -1.5134',
 '0.0253 0.9629']

In [109]:
def emsemble_infer_v1(texts:str|list[str], printable=False):
    """Ensemble v1: LR + naive Bayes + SVM probabilities plus normalised neural scores.

    Args:
        texts: one string or a list of strings.
        printable: return language codes instead of class indices.

    Returns:
        Array of class indices, or a list of language codes when `printable`.
    """
    if isinstance(texts, str): texts = [texts]
    # Vectorise once and share the features across the three classifiers.
    features = vectorizer.transform(texts)
    output = (
        lr_classifier.predict_proba(features) +
        nb_classifier.predict_proba(features) +
        svm_model.predict_proba(features) +
        prob(model_predict(texts))
    ).argmax(axis=1)
    if printable:
        # Same order as the columns of the training frame (class index = column position).
        class_names = ['ben', 'guj', 'hin', 'kan', 'mal', 'ori', 'pan', 'tam', 'tel', 'urd', 'eng']
        return [class_names[i] for i in output.tolist()]
    return output
    
    
emsemble_infer_v1('alute masala makhie, fetano basena chubie nie dubo tele bhaja yatakshan na bhalo kare bhaja hachche, tiri kara has maharashtrer ei suswadu o janapriya khavarer pad.', 
                  printable=True)

['ben']

In [110]:
# Score ensemble v1 on the held-out texts, 64 texts per call, then report metrics.
pred_emsemble_v1 = np.concatenate(
    [emsemble_infer_v1(X_test[start:start + 64]) for start in tqdm(range(0, len(X_test), 64))],
    axis=0,
)

print("F1 score: ", f1_score(y_test, pred_emsemble_v1, average='weighted'))
print("Accuracy: ", accuracy_score(y_test, pred_emsemble_v1))
print("MSE: ", mean_squared_error(y_test, pred_emsemble_v1))

100%|██████████| 59/59 [00:06<00:00,  8.87it/s]

F1 score:  0.9870164293079388
Accuracy:  0.9870233050847458
MSE:  0.4059851694915254





In [86]:
def emsemble_infer_v2(texts:str|list[str], printable=False):
    """Ensemble v2: re-normalised LR and naive Bayes probabilities plus sharpened neural scores.

    LR and NB outputs go through `prob(..., gap_adjuster=1)` (min-max scaling
    only); the neural scores use `gap_adjuster=6`. The SVM is not part of
    this variant.

    Args:
        texts: one string or a list of strings.
        printable: return language codes instead of class indices.

    Returns:
        Array of class indices, or a list of language codes when `printable`.
    """
    if isinstance(texts, str): texts = [texts]
    # Vectorise once and share the features across both classifiers.
    features = vectorizer.transform(texts)
    output = (
        prob(lr_classifier.predict_proba(features), gap_adjuster=1) +
        prob(nb_classifier.predict_proba(features), gap_adjuster=1) +
        prob(model_predict(texts), gap_adjuster=6)
    ).argmax(axis=1)
    if printable:
        # Same order as the columns of the training frame (class index = column position).
        class_names = ['ben', 'guj', 'hin', 'kan', 'mal', 'ori', 'pan', 'tam', 'tel', 'urd', 'eng']
        return [class_names[i] for i in output.tolist()]
    return output

In [87]:
# Score ensemble v2 on the held-out texts, 64 texts per call, then report metrics.
pred_emsemble_v2 = np.concatenate(
    [emsemble_infer_v2(X_test[start:start + 64]) for start in tqdm(range(0, len(X_test), 64))],
    axis=0,
)

print("F1 score: ", f1_score(y_test, pred_emsemble_v2, average='weighted'))
print("Accuracy: ", accuracy_score(y_test, pred_emsemble_v2))
print("MSE: ", mean_squared_error(y_test, pred_emsemble_v2))

100%|██████████| 59/59 [00:01<00:00, 39.53it/s]

F1 score:  0.9896687963030091
Accuracy:  0.9896716101694916
MSE:  0.3061440677966102





In [228]:
emsemble_infer_v2(["m mase kono ullekhayogya tapapravaher dasha anubhav kara yyani.", 'tum kya kar rahe ho yaar?', 'can you do somethig for me?'], printable=True)

['ben', 'hin', 'eng']

In [116]:
def emsemble_infer_v3_last(texts:str|list[str], printable=False, proba=False):
    """Ensemble v3: raw LR and naive Bayes probabilities plus min-max normalised neural scores.

    Args:
        texts: one string or a list of strings.
        printable: return language codes instead of class indices.
        proba: return the summed per-class scores (not renormalised, so a row
            can sum to more than 1); takes precedence over `printable`.

    Returns:
        Score matrix when `proba`, else a list of language codes when
        `printable`, else an array of class indices.
    """
    if isinstance(texts, str): texts = [texts]
    # Vectorise once and share the features across both classifiers.
    features = vectorizer.transform(texts)
    output = (
        lr_classifier.predict_proba(features) +
        nb_classifier.predict_proba(features) +
        prob(model_predict(texts), gap_adjuster=1)
    )
    if proba:
        return output
    winners = output.argmax(axis=1)
    if printable:
        # Same order as the columns of the training frame (class index = column position).
        class_names = ['ben', 'guj', 'hin', 'kan', 'mal', 'ori', 'pan', 'tam', 'tel', 'urd', 'eng']
        return [class_names[i] for i in winners.tolist()]
    return winners
    
    
emsemble_infer_v3_last('alute masala makhie, fetano basena chubie nie dubo tele bhaja yatakshan na bhalo kare bhaja hachche, tiri kara has maharashtrer ei suswadu o janapriya khavarer pad.', 
                  printable=True)

['ben']

In [112]:
# Score ensemble v3 on the held-out texts, 64 texts per call, then report metrics.
pred_emsemble_v3 = np.concatenate(
    [emsemble_infer_v3_last(X_test[start:start + 64]) for start in tqdm(range(0, len(X_test), 64))],
    axis=0,
)

print("F1 score: ", f1_score(y_test, pred_emsemble_v3, average='weighted'))
print("Accuracy: ", accuracy_score(y_test, pred_emsemble_v3))
print("MSE: ", mean_squared_error(y_test, pred_emsemble_v3))

100%|██████████| 59/59 [00:01<00:00, 39.53it/s]

F1 score:  0.9907243664756155
Accuracy:  0.9907309322033898
MSE:  0.3003177966101695





In [113]:
def get_class(idx):
    """Return the language code for class index `idx` (list indexing rules apply)."""
    class_names = [
        'ben', 'guj', 'hin', 'kan', 'mal', 'ori',
        'pan', 'tam', 'tel', 'urd', 'eng',
    ]
    return class_names[idx]

In [114]:
# List every held-out text that ensemble v3 got wrong, with true and predicted language.
for inp, real, pred in zip(X_test, y_test, pred_emsemble_v3):
    if real == pred:
        continue
    print(f"Error: `{get_class(real)}` but model gives `{get_class(pred)}`")
    print(inp)
    print()

Error: `urd` but model gives `hin`
tum mushkil se padhaai karte ho.

Error: `hin` but model gives `urd`
mujhe pichhali class main bataai gai kuch nadiyaan yaad hai.

Error: `ori` but model gives `guj`
tame kouthiki yaythil?

Error: `urd` but model gives `hin`
aaj ke liye itanaa hi.

Error: `pan` but model gives `urd`
awchha, kallh 14 april hai hain naame?

Error: `tel` but model gives `ori`
i madhya evaina bike ridlaki valelava?

Error: `mal` but model gives `tel`
mashru broun riso chican kuino birianio polayullav.

Error: `hin` but model gives `urd`
aap jis dish kii baat kar rahi hai, use peetha yaa kholaa peetha kehete hai.

Error: `hin` but model gives `urd`
compozit package main is sab ke saath jaigarh or royal senotafa shaamil hai.

Error: `mal` but model gives `tel`
sindagi na milengi dobara, del chahta hai ocaillae entu road trippa?

Error: `mal` but model gives `kan`
ill, avarentin buddhimutant?

Error: `guj` but model gives `tam`
baas, tyano anand man.

Error: `guj` but model 

In [122]:
[(i, j) for i, j in zip(emsemble_infer_v3_last("Naanu indhu Kannadadalli neevu yava sahaya vanavu bekaadaru neeDabahudu. Neenu nannaNNu yavudu keLidaru naanu nimge sahaya maadutteene.", proba=True)[0], ['ben', 'guj', 'hin', 'kan', 'mal', 'ori', 'pan', 'tam', 'tel', 'urd', 'eng'])]

[(0.08851273813616918, 'ben'),
 (0.15902469111992074, 'guj'),
 (0.11512846096708404, 'hin'),
 (1.5802366012509674, 'kan'),
 (0.20267497443752558, 'mal'),
 (0.17614189506499442, 'ori'),
 (0.05903100059979048, 'pan'),
 (0.21048618463086474, 'tam'),
 (0.18335848900092241, 'tel'),
 (0.10543904256506387, 'urd'),
 (0.11996591850140624, 'eng')]

### DONE

In [126]:
# Final side-by-side metrics for every single model on the held-out split.
# The "LSTM" rows use test_gru/pred_gru, i.e. the neural Encoder's predictions.
print("Logistic Regression F1 score: ", f1_score(y_test, pred_lr,average='weighted'))
print("Naive Bayes F1 score: ", f1_score(y_test, pred_nb,average='weighted'))
print("SVM F1 score: ", f1_score(y_test, pred_SVM,average='weighted'))
print("Decission Tree Classifier F1 score: ",f1_score(y_test, pred_DTC,average='weighted'))
print("XGBClassifier F1 score: ", f1_score(y_test, pred_xgb, average='weighted'))
print("LSTM F1 score: ", f1_score(test_gru, pred_gru,average='weighted'))
print()
print("Logistic Regression Accuracy: ", accuracy_score(y_test, pred_lr))
print("Naive Bayes Accuracy: ", accuracy_score(y_test, pred_nb))
print("SVM Accuracy: ", accuracy_score(y_test, pred_SVM))
print("Decission Tree Classifier Accuracy: ",accuracy_score(y_test, pred_DTC))
# This line reports accuracy; it was previously mislabelled "F1 score".
print("XGBClassifier Accuracy: ", accuracy_score(y_test, pred_xgb))
print("LSTM Accuracy: ", accuracy_score(test_gru, pred_gru))
print()
# NOTE(review): MSE is computed on integer class ids, so its size depends on
# the arbitrary class numbering - confirm it is meant as a metric here.
print("Logistic Regression MSE: ", mean_squared_error(y_test, pred_lr))
print("Naive Bayes MSE: ", mean_squared_error(y_test, pred_nb))
print("SVM MSE: ", mean_squared_error(y_test, pred_SVM))
print("Decission Tree Classifier MSE: ",mean_squared_error(y_test, pred_DTC))
print("XGBClassifier MSE: ", mean_squared_error(y_test, pred_xgb))
print("LSTM MSE: ", mean_squared_error(test_gru, pred_gru))

Logistic Regression F1 score:  0.9680349579067693
Naive Bayes F1 score:  0.9767645507998631
SVM F1 score:  0.9691697667143868
Decission Tree Classifier F1 score:  0.8520085722918403
XGBClassifier F1 score:  0.9020493021687092
LSTM F1 score:  0.9656513890863466

Logistic Regression Accuracy:  0.9679555084745762
Naive Bayes Accuracy:  0.9769597457627118
SVM Accuracy:  0.9690148305084746
Decission Tree Classifier Accuracy:  0.848781779661017
XGBClassifier F1 score:  0.897510593220339
LSTM Accuracy:  0.965572033898305

Logistic Regression MSE:  0.816207627118644
Naive Bayes MSE:  0.5238347457627118
SVM MSE:  0.7915783898305084
Decission Tree Classifier MSE:  4.112023305084746
XGBClassifier MSE:  2.220074152542373
LSTM MSE:  0.8442796610169492


In [131]:
# Summary of the three ensembles on the held-out split.
# "NV" in the printed labels stands for the naive-Bayes classifier.
print("v1 = (SVM + LR + NV + LSTM)")
print("emsemble_v1 F1 score: ", f1_score(y_test, pred_emsemble_v1, average='weighted'))
print("emsemble_v1 Accuracy: ", accuracy_score(y_test, pred_emsemble_v1))
print("emsemble_v1 MSE: ", mean_squared_error(y_test, pred_emsemble_v1))
# v2 and v3 combine the same three models; they differ only in how each
# model's scores are normalised before summing (see the two functions).
print("v2 = (LR + NV + LSTM)")
print("emsemble_v2 F1 score: ", f1_score(y_test, pred_emsemble_v2, average='weighted'))
print("emsemble_v2 Accuracy: ", accuracy_score(y_test, pred_emsemble_v2))
print("emsemble_v2 MSE: ", mean_squared_error(y_test, pred_emsemble_v2))
print("v3 = (LR + NV + LSTM)")
print("emsemble_v3 F1 score: ", f1_score(y_test, pred_emsemble_v3,average='weighted'))
print("emsemble_v3 Accuracy: ", accuracy_score(y_test, pred_emsemble_v3))
print("emsemble_v3 MSE: ", mean_squared_error(y_test, pred_emsemble_v3))

v1 = (SVM + LR + NV + LSTM)
emsemble_v1 F1 score:  0.9870164293079388
emsemble_v1 Accuracy:  0.9870233050847458
emsemble_v1 MSE:  0.4059851694915254
v2 = (LR + NV + LSTM)
emsemble_v2 F1 score:  0.9896687963030091
emsemble_v2 Accuracy:  0.9896716101694916
emsemble_v2 MSE:  0.3061440677966102
v3 = (LR + NV + LSTM)
emsemble_v3 F1 score:  0.9907243664756155
emsemble_v3 Accuracy:  0.9907309322033898
emsemble_v3 MSE:  0.3003177966101695
