In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB # best
from sklearn.linear_model import LogisticRegression
import numpy as np
import random
import os
from tqdm import tqdm

In [2]:
# Both paths are raw strings so the Windows-style backslashes stay literal;
# a plain string containing "\e" is an invalid escape sequence that newer
# Python versions warn about.
df_test = pd.read_csv(r'icdc\english translation\english-translation.csv')
df = pd.read_csv(r'icdc\train.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1700 entries, 0 to 1699
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ben     1700 non-null   object
 1   guj     1700 non-null   object
 2   hin     1700 non-null   object
 3   kan     1700 non-null   object
 4   mal     1700 non-null   object
 5   ori     1700 non-null   object
 6   pan     1700 non-null   object
 7   tam     1700 non-null   object
 8   tel     1700 non-null   object
 9   urd     1700 non-null   object
 10  eng     1700 non-null   object
dtypes: object(11)
memory usage: 146.2+ KB


In [3]:
# Language codes (columns of the training frame) used as classification targets.
# A code's position in this list is used as its integer class label.
LANGS = ['ben', 'hin', 'pan', 'tam', 'tel']

In [4]:
# Reshape the wide frame (one column per language) into a long frame with one
# comment per row, labelled by the position of its language code in LANGS.
# Columns are visited in the frame's own order and kept only if listed in LANGS.
dfs = [
    pd.DataFrame({
        'Comment': df[lang_code],
        'Language_Index': LANGS.index(lang_code),
        'Language': lang_code,
    })
    for lang_code in df.columns
    if lang_code in LANGS
]

result_df = pd.concat(dfs, ignore_index=True)
print(len(dfs))

X = result_df['Comment']
Y = result_df['Language_Index']

# Hold out 20% of the pooled comments; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=26,
)

5


In [5]:
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error

In [6]:
# TF-IDF features with the vectorizer's default settings.
# The vectorizer is fitted on the training split only; the held-out split is
# transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [7]:
# Logistic-regression baseline on the TF-IDF features (solver capped at 300 iterations).
lr_classifier = LogisticRegression(max_iter=300)
lr_classifier.fit(X_train_vect, y_train)
pred_lr = lr_classifier.predict(X_test_vect)

# Score the predictions already stored in `pred_lr` instead of running
# predict() on the held-out matrix a second time.
print("Accuracy:", accuracy_score(y_test, pred_lr))

Accuracy: 0.981764705882353


In [8]:
# Multinomial Naive Bayes on the same TF-IDF features. fit() returns the
# estimator itself, so construction and training are chained.
nb_classifier = MultinomialNB().fit(X_train_vect, y_train)
pred_nb = nb_classifier.predict(X_test_vect)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=pred_nb))

Accuracy: 0.9905882352941177


In [9]:
# Min-max rescale scores along the last axis, sharpen them, and renormalize to sum to 1.
def prob(arr:np.ndarray, gap_adjuster:int=3)->np.ndarray:
    """Turn raw scores into a distribution along the last axis.

    Each row (or the whole vector, for 1-D input) is min-max scaled to
    [0, 1], raised to the power ``gap_adjuster`` to widen the gap between
    high and low scores, and divided by its sum so it adds up to 1.

    Args:
        arr: 1-D vector of scores or 2-D array with one row of scores per sample.
        gap_adjuster: Exponent applied after min-max scaling; 1 skips the step.

    Returns:
        Array with the same shape as ``arr`` whose last axis sums to 1.
        A row whose values are all equal carries no preference, so it is
        returned as a uniform distribution instead of NaN from 0/0.
    """
    arr = np.asarray(arr)
    # keepdims lets one code path broadcast correctly for both 1-D and 2-D input.
    lowest = arr.min(axis=-1, keepdims=True)
    span = arr.max(axis=-1, keepdims=True) - lowest
    is_flat = span == 0
    # Substitute a dummy divisor of 1 on flat rows to avoid a 0/0 division.
    scaled = (arr - lowest) / np.where(is_flat, 1, span)
    if gap_adjuster != 1:
        scaled = scaled ** gap_adjuster
    # Flat rows become all-ones here and therefore uniform after normalization.
    scaled = np.where(is_flat, 1.0, scaled)
    return scaled / scaled.sum(axis=-1, keepdims=True)
    

def emsemble_infer_v2(texts:str|list[str], printable=False):
    if isinstance(texts, str): texts = [texts]
    output = (
        prob(lr_classifier.predict_proba(vectorizer.transform(texts)), gap_adjuster=1) + 
        prob(nb_classifier.predict_proba(vectorizer.transform(texts)), gap_adjuster=1) #+
    ).argmax(axis=1)
    if printable:
        return [LANGS[i] for i in output.tolist()]
    else:
        return output
    
# Run the ensemble over the held-out split in batches of 64 and stitch the
# per-batch label arrays back into one array.
pred_emsemble_v2 = np.concatenate(
    [
        emsemble_infer_v2(X_test[start:start + 64])
        for start in tqdm(range(0, len(X_test), 64))
    ],
    axis=0,
)

# NOTE(review): the labels are positions in LANGS, i.e. nominal classes, so
# the MSE value depends on the arbitrary label order -- confirm it is wanted.
print("F1 score: ", f1_score(y_true=y_test, y_pred=pred_emsemble_v2, average='weighted'))
print("Accuracy: ", accuracy_score(y_true=y_test, y_pred=pred_emsemble_v2))
print("MSE: ", mean_squared_error(y_true=y_test, y_pred=pred_emsemble_v2))

100%|██████████| 27/27 [00:00<00:00, 84.59it/s]

F1 score:  0.9894222540839781
Accuracy:  0.9894117647058823
MSE:  0.05411764705882353





In [10]:
# Pull the 'text' column of the test frame out as a plain Python list so it
# can be sliced into batches for inference.
df_translitration=df_test['text'].tolist()

In [11]:
# Classify the test texts in batches of 64. This rebinds `pred_emsemble_v2`,
# which from here on holds one class index per test text.
pred_emsemble_v2 = np.concatenate(
    [
        emsemble_infer_v2(df_translitration[start:start + 64])
        for start in tqdm(range(0, len(df_translitration), 64))
    ],
    axis=0,
)


100%|██████████| 13/13 [00:00<00:00, 164.56it/s]


In [12]:
# Display the test frame as loaded, before any prediction columns are added.
df_test

Unnamed: 0,text
0,seta to khubi bhaal have!
1,m mase kono ullekhayogya tapapravaher dasha an...
2,ei sabkatai darun lagno shunate.
3,"tar prabandh, ya ki na tar agami bayer ekati u..."
4,"bartaman mammla, njity taar ekhtiar parityag k..."
...,...
795,"1997lo sanyo ""pioessiepi"" palimar tantelu chip..."
796,nenu i pradeshal gurinchi ippatike chala chadi...
797,"gurtunda, edadi kindat me kolig biknu kalcharu."
798,bharatvaesha gopp vaividhya-vyatyasabharit bhumi.


In [13]:
# Display the label order: a class index is a position in this list.
LANGS

['ben', 'hin', 'pan', 'tam', 'tel']

In [14]:
# Lookup table from integer class index to language code, in LANGS order.
label_mapping = dict(enumerate(LANGS))

# `pred_emsemble_v2` holds one integer class index per test text; translate
# each index to its language code.
labels = [label_mapping[class_idx] for class_idx in pred_emsemble_v2]

print(labels)

['ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben', 'ben'

In [15]:
# Show how many labels were produced before attaching them to the test frame.
len(labels)

800

In [16]:
# Attach the predicted language code to each test row (adds a column to df_test in place).
df_test['Language'] = labels

In [17]:
# Attach the predicted integer class index as well.
# NOTE(review): this column is spelled 'Language_index', while the training
# frame uses 'Language_Index' -- confirm the casing difference is intended.
df_test['Language_index']=pred_emsemble_v2

In [18]:
# Display the test frame again, now with the two prediction columns.
df_test

Unnamed: 0,text,Language,Language_index
0,seta to khubi bhaal have!,ben,0
1,m mase kono ullekhayogya tapapravaher dasha an...,ben,0
2,ei sabkatai darun lagno shunate.,ben,0
3,"tar prabandh, ya ki na tar agami bayer ekati u...",ben,0
4,"bartaman mammla, njity taar ekhtiar parityag k...",ben,0
...,...,...,...
795,"1997lo sanyo ""pioessiepi"" palimar tantelu chip...",tel,4
796,nenu i pradeshal gurinchi ippatike chala chadi...,tel,4
797,"gurtunda, edadi kindat me kolig biknu kalcharu.",tel,4
798,bharatvaesha gopp vaividhya-vyatyasabharit bhumi.,tel,4
