# Gaussian Mixture Model (EM Algorithm) for Language Classification

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset path used below
# lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Imports

In [None]:
from sklearn.mixture import GaussianMixture as GM
from sklearn import metrics
import matplotlib.pyplot as plt

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Dataset root on the mounted Drive. The listing cell below treats every entry
# in this directory as one language folder.
path = '/content/drive/MyDrive/OP/'

In [None]:
# Integer class id -> language label. The keys are also passed as the `labels`
# argument of the confusion matrices further down.
# NOTE(review): ids appear to follow the sorted directory order (asm, ben, ...) - confirm.
lang_dict = {0:'Assam', 1:'Bengali', 2:'English', 3:'Gujarati', 4:'Hindi', 
             5:'Kannada', 6:'Malayalam', 7:'Marati', 8:'Odissa', 9:'Punjabi', 10:'Tamil', 11:'Telugu'}

In [None]:
# List the entries under the dataset root in sorted order, so that a language's
# position in the list is stable, then build the full path of each entry.
languages = os.listdir(path)
languages.sort()
print(languages)
languages_path = [path + name for name in languages]

['asm', 'ben', 'eng', 'guj', 'hin', 'kan', 'mal', 'mar', 'odi', 'pun', 'tam', 'tel']


### Data Processing (Reading data language wise)

In [None]:
def read_data(path):
    """Load the feature CSV files of one language directory.

    Parameters
    ----------
    path : str
        Directory that contains the sub-folders ``PB_train_CSV``,
        ``PB_test_CSV`` and ``YT_test_CSV``.

    Returns
    -------
    tuple of (list, list, list)
        ``(pb_train, pb_test, yt_test)``. Each list holds one ``numpy.ndarray``
        per CSV file, in sorted file-name order.

    Raises
    ------
    FileNotFoundError
        If one of the three sub-folders does not exist.
    """
    subdirs = ('PB_train_CSV', 'PB_test_CSV', 'YT_test_CSV')
    splits = []
    for subdir in subdirs:
        folder = os.path.join(path, subdir)
        arrays = []
        # os.listdir returns entries in arbitrary order; sort so that the
        # file order (and everything derived from it) is reproducible.
        for name in sorted(os.listdir(folder)):
            file_path = os.path.join(folder, name)
            # Skip stray sub-directories (e.g. notebook checkpoint folders),
            # which cannot be parsed as CSV.
            if os.path.isdir(file_path):
                continue
            frame = pd.read_csv(file_path, delimiter=',', header=None, encoding='utf-16')
            arrays.append(np.array(frame))
        splits.append(arrays)

    return splits[0], splits[1], splits[2]

In [None]:
def concat(data, n_features=39):
    """Stack a sequence of 2-D arrays row-wise into one array.

    Parameters
    ----------
    data : iterable of array-like
        Arrays with ``n_features`` columns each.
    n_features : int, optional
        Column count of the inputs (default 39, the width used so far).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(total_rows, n_features)``. An empty ``data`` yields
        an array of shape ``(0, n_features)``.

    Raises
    ------
    ValueError
        If an input does not have ``n_features`` columns.
    """
    # A leading zero-row float block keeps the result dtype and the
    # empty-input result identical to the incremental version, while the
    # single concatenate call avoids re-copying the accumulated rows per item.
    blocks = [np.empty([0, n_features])]
    blocks.extend(np.array(item) for item in data)
    return np.concatenate(blocks, axis=0)

### Getting data for all classes

In [None]:
# Read the 'asm' splits (entry 0 of the sorted listing), then pool the training
# files into a single array; the two test sets stay as per-file lists.
asm_train, asm_pbtest, asm_yttest = read_data(languages_path[0])
asm_train = concat(asm_train)

In [None]:
# Read the 'ben' splits (entry 1 of the sorted listing), then pool the training
# files into a single array; the two test sets stay as per-file lists.
ben_train, ben_pbtest, ben_yttest = read_data(languages_path[1])
ben_train = concat(ben_train)

In [None]:
# Read the 'eng' splits (entry 2 of the sorted listing), then pool the training
# files into a single array; the two test sets stay as per-file lists.
eng_train, eng_pbtest, eng_yttest = read_data(languages_path[2])
eng_train = concat(eng_train)

In [None]:
# Read the 'guj' splits (entry 3 of the sorted listing), then pool the training
# files into a single array; the two test sets stay as per-file lists.
guj_train, guj_pbtest, guj_yttest = read_data(languages_path[3])
guj_train = concat(guj_train)

In [None]:
# Read the 'hin' splits (entry 4 of the sorted listing), then pool the training
# files into a single array; the two test sets stay as per-file lists.
hin_train, hin_pbtest, hin_yttest = read_data(languages_path[4])
hin_train = concat(hin_train)

In [None]:
# Read the 'kan' splits (entry 5 of the sorted listing), then pool the training
# files into a single array; the two test sets stay as per-file lists.
kan_train, kan_pbtest, kan_yttest = read_data(languages_path[5])
kan_train = concat(kan_train)

In [None]:
# Read the 'mal' splits (entry 6 of the sorted listing), then pool the training
# files into a single array; the two test sets stay as per-file lists.
mal_train, mal_pbtest, mal_yttest = read_data(languages_path[6])
mal_train = concat(mal_train)

In [None]:
# Read the 'mar' splits (entry 7 of the sorted listing), then pool the training
# files into a single array; the two test sets stay as per-file lists.
mar_train, mar_pbtest, mar_yttest = read_data(languages_path[7])
mar_train = concat(mar_train)

In [None]:
# Read the 'odi' splits (entry 8 of the sorted listing), then pool the training
# files into a single array; the two test sets stay as per-file lists.
odi_train, odi_pbtest, odi_yttest = read_data(languages_path[8])
odi_train = concat(odi_train)

In [None]:
# Read the 'pun' splits (entry 9 of the sorted listing), then pool the training
# files into a single array; the two test sets stay as per-file lists.
pun_train, pun_pbtest, pun_yttest = read_data(languages_path[9])
pun_train = concat(pun_train)

In [None]:
# Read the 'tam' splits (entry 10 of the sorted listing), then pool the training
# files into a single array; the two test sets stay as per-file lists.
tam_train, tam_pbtest, tam_yttest = read_data(languages_path[10])
tam_train = concat(tam_train)

In [None]:
# Read the 'tel' splits (entry 11 of the sorted listing), then pool the training
# files into a single array; the two test sets stay as per-file lists.
tel_train, tel_pbtest, tel_yttest = read_data(languages_path[11])
tel_train = concat(tel_train)

### Building GMM Classifier

#### Class1: Assam

In [None]:
# Fit a 12-component, full-covariance GMM on the pooled 'asm' training data.
# random_state is pinned so the k-means initialisation, and therefore the
# fitted model and all downstream scores, is reproducible across runs.
asm = GM(n_components=12, init_params='kmeans', covariance_type='full', random_state=0)
asm.fit(asm_train)
print('WEIGHTS:', asm.weights_)
# print('MEAN:', asm.means_)
# print('COVARIANCE:', asm.covariances_)

WEIGHTS: [0.07414468 0.08044657 0.09314097 0.08007877 0.0871309  0.06500468
 0.09176783 0.10665795 0.08775891 0.05726516 0.09197712 0.08462646]


#### Class2: Bengali

In [None]:
# Fit a 12-component, full-covariance GMM on the pooled 'ben' training data.
# random_state is pinned so the k-means initialisation, and therefore the
# fitted model and all downstream scores, is reproducible across runs.
ben = GM(n_components=12, init_params='kmeans', covariance_type='full', random_state=0)
ben.fit(ben_train)
print('WEIGHTS:', ben.weights_)

WEIGHTS: [0.07470776 0.0268025  0.12767083 0.06273096 0.0909247  0.08215092
 0.14557509 0.08303867 0.01916183 0.12031551 0.0640438  0.10287742]


#### Class3: English

In [None]:
# Fit a 12-component, full-covariance GMM on the pooled 'eng' training data.
# random_state is pinned so the k-means initialisation, and therefore the
# fitted model and all downstream scores, is reproducible across runs.
eng = GM(n_components=12, init_params='kmeans', covariance_type='full', random_state=0)
eng.fit(eng_train)
print('WEIGHTS:', eng.weights_)

WEIGHTS: [0.0958022  0.0813357  0.06855723 0.08067781 0.08642012 0.09522262
 0.09060009 0.0615536  0.07794277 0.10389014 0.07274649 0.08525122]


#### Class4: Gujarati

In [None]:
# Fit a 12-component, full-covariance GMM on the pooled 'guj' training data.
# random_state is pinned so the k-means initialisation, and therefore the
# fitted model and all downstream scores, is reproducible across runs.
guj = GM(n_components=12, init_params='kmeans', covariance_type='full', random_state=0)
guj.fit(guj_train)
print('WEIGHTS:', guj.weights_)

WEIGHTS: [0.07832254 0.07465855 0.106818   0.11483145 0.073702   0.09226537
 0.09062995 0.09044096 0.07032203 0.08109111 0.0428053  0.08411274]


#### Class5: Hindi

In [None]:
# Fit a 12-component, full-covariance GMM on the pooled 'hin' training data.
# random_state is pinned so the k-means initialisation, and therefore the
# fitted model and all downstream scores, is reproducible across runs.
hin = GM(n_components=12, init_params='kmeans', covariance_type='full', random_state=0)
hin.fit(hin_train)
print('WEIGHTS:', hin.weights_)

WEIGHTS: [0.0867731  0.05606412 0.06756456 0.0866793  0.07457238 0.06830072
 0.0955266  0.08269377 0.09883419 0.10250816 0.08067697 0.09980614]


#### Class6: Kannada

In [None]:
# Fit a 12-component, full-covariance GMM on the pooled 'kan' training data.
# random_state is pinned so the k-means initialisation, and therefore the
# fitted model and all downstream scores, is reproducible across runs.
kan = GM(n_components=12, init_params='kmeans', covariance_type='full', random_state=0)
kan.fit(kan_train)
print('WEIGHTS:', kan.weights_)


WEIGHTS: [0.08537152 0.02765679 0.04613874 0.07492482 0.0864914  0.11103914
 0.0899901  0.0400843  0.08668181 0.11241869 0.1530498  0.08615289]


#### Class7: Malayalam

In [None]:
# Fit a 12-component, full-covariance GMM on the pooled 'mal' training data.
# random_state is pinned so the k-means initialisation, and therefore the
# fitted model and all downstream scores, is reproducible across runs.
mal = GM(n_components=12, init_params='kmeans', covariance_type='full', random_state=0)
mal.fit(mal_train)
print('WEIGHTS:', mal.weights_)

WEIGHTS: [0.09961828 0.12695812 0.05129596 0.09963835 0.07541835 0.07790185
 0.08770995 0.08959321 0.08642869 0.03261362 0.07241812 0.10040552]


#### Class8: Marathi

In [None]:
# Fit a 12-component, full-covariance GMM on the pooled 'mar' training data.
# random_state is pinned so the k-means initialisation, and therefore the
# fitted model and all downstream scores, is reproducible across runs.
mar = GM(n_components=12, init_params='kmeans', covariance_type='full', random_state=0)
mar.fit(mar_train)
print('WEIGHTS:', mar.weights_)

WEIGHTS: [0.07228055 0.06142795 0.07032327 0.03337415 0.07410289 0.09869111
 0.18200693 0.0840578  0.02543011 0.10504195 0.10283658 0.09042669]


#### Class9: Odissa

In [None]:
# Fit a 12-component, full-covariance GMM on the pooled 'odi' training data.
# random_state is pinned so the k-means initialisation, and therefore the
# fitted model and all downstream scores, is reproducible across runs.
odi = GM(n_components=12, init_params='kmeans', covariance_type='full', random_state=0)
odi.fit(odi_train)
print('WEIGHTS:', odi.weights_)

WEIGHTS: [0.09462599 0.04857123 0.10642863 0.08587826 0.08186376 0.02653596
 0.08653043 0.10169366 0.09229379 0.06069399 0.10017221 0.1147121 ]


#### Class10: Punjabi

In [None]:
# Fit a 12-component, full-covariance GMM on the pooled 'pun' training data.
# random_state is pinned so the k-means initialisation, and therefore the
# fitted model and all downstream scores, is reproducible across runs.
pun = GM(n_components=12, init_params='kmeans', covariance_type='full', random_state=0)
pun.fit(pun_train)
print('WEIGHTS:', pun.weights_)

WEIGHTS: [0.06520374 0.09675824 0.10835317 0.10622424 0.06118187 0.06531003
 0.06542408 0.09258382 0.09191911 0.07700941 0.06224559 0.10778669]


#### Class11: Tamil

In [None]:
# Fit a 12-component, full-covariance GMM on the pooled 'tam' training data.
# random_state is pinned so the k-means initialisation, and therefore the
# fitted model and all downstream scores, is reproducible across runs.
tam = GM(n_components=12, init_params='kmeans', covariance_type='full', random_state=0)
tam.fit(tam_train)
print('WEIGHTS:', tam.weights_)

WEIGHTS: [0.07677315 0.10781537 0.14515359 0.08641939 0.11094667 0.07812263
 0.06303361 0.05398088 0.05667461 0.05222958 0.09314521 0.0757053 ]


#### Class12: Telugu

In [None]:
# Fit a 12-component, full-covariance GMM on the pooled 'tel' training data.
# random_state is pinned so the k-means initialisation, and therefore the
# fitted model and all downstream scores, is reproducible across runs.
tel = GM(n_components=12, init_params='kmeans', covariance_type='full', random_state=0)
tel.fit(tel_train)
print('WEIGHTS:', tel.weights_)

WEIGHTS: [0.07675864 0.04518806 0.06185677 0.13373048 0.11974036 0.07285423
 0.08737165 0.07432711 0.08725677 0.06414296 0.0879655  0.08880749]


### Testing

In [None]:
def detect(val):
    """Translate an integer class id (0-11) into its language label.

    Raises ``KeyError`` when ``val`` is not one of the known ids.
    """
    labels = ('Assam', 'Bengali', 'English', 'Gujarati', 'Hindi', 'Kannada',
              'Malayalam', 'Marati', 'Odissa', 'Punjabi', 'Tamil', 'Telugu')
    # Position in the tuple is the class id.
    id_to_label = dict(enumerate(labels))
    return id_to_label[val]

In [None]:
def get_scores(lang, test):
    """Return the mean of ``lang.score_samples(test)``.

    ``lang`` is any object exposing ``score_samples`` (here the fitted
    mixture models); ``test`` is passed to it unchanged.
    """
    per_row_scores = lang.score_samples(test)
    return np.mean(per_row_scores)

In [None]:
def get_gt(c, n):
    """Build a ground-truth list: the label ``c`` repeated ``n`` times."""
    truth = []
    for _ in range(n):
        truth.append(c)
    return truth

In [None]:
def system_predict(dataset):
    """Predict a class id for every item of ``dataset``.

    Each item is scored with ``get_scores`` against the twelve module-level
    models; the index of the highest score is the predicted id. The model
    order below therefore defines the ids (asm=0 ... tel=11).

    Returns a list with one ``np.argmax`` result per item.
    """
    predictions = []
    for sample in dataset:
        scores = [get_scores(model, sample)
                  for model in (asm, ben, eng, guj, hin, kan,
                                mal, mar, odi, pun, tam, tel)]
        predictions.append(np.argmax(scores))
    return predictions

#### PB Test Data

In [None]:
# Predict a class id for every file of each language's PB test set.
predicted_PB_asm = system_predict(asm_pbtest)
predicted_PB_ben = system_predict(ben_pbtest)
predicted_PB_eng = system_predict(eng_pbtest)
predicted_PB_guj = system_predict(guj_pbtest)
predicted_PB_hin = system_predict(hin_pbtest)
predicted_PB_kan = system_predict(kan_pbtest)
predicted_PB_mal = system_predict(mal_pbtest)
predicted_PB_mar = system_predict(mar_pbtest)
predicted_PB_odi = system_predict(odi_pbtest)
predicted_PB_pun = system_predict(pun_pbtest)
predicted_PB_tam = system_predict(tam_pbtest)
predicted_PB_tel = system_predict(tel_pbtest)

In [None]:
# Ground truth per language: the language's own class id, repeated once per
# prediction so both lists have the same length.
gt_PB_asm = get_gt(0, len(predicted_PB_asm))
gt_PB_ben = get_gt(1, len(predicted_PB_ben))
gt_PB_eng = get_gt(2, len(predicted_PB_eng))
gt_PB_guj = get_gt(3, len(predicted_PB_guj))
gt_PB_hin = get_gt(4, len(predicted_PB_hin))
gt_PB_kan = get_gt(5, len(predicted_PB_kan))
gt_PB_mal = get_gt(6, len(predicted_PB_mal))
gt_PB_mar = get_gt(7, len(predicted_PB_mar))
gt_PB_odi = get_gt(8, len(predicted_PB_odi))
gt_PB_pun = get_gt(9, len(predicted_PB_pun))
gt_PB_tam = get_gt(10, len(predicted_PB_tam))
gt_PB_tel = get_gt(11, len(predicted_PB_tel))

In [None]:
# Join the per-language lists in the same language order on both lines, so
# prediction k and ground truth k refer to the same test file.
predicted_PB = predicted_PB_asm + predicted_PB_ben + predicted_PB_eng + predicted_PB_guj + predicted_PB_hin + predicted_PB_kan + predicted_PB_mal + predicted_PB_mar + predicted_PB_odi + predicted_PB_pun + predicted_PB_tam + predicted_PB_tel
gt_PB = gt_PB_asm + gt_PB_ben + gt_PB_eng + gt_PB_guj + gt_PB_hin + gt_PB_kan + gt_PB_mal + gt_PB_mar + gt_PB_odi + gt_PB_pun + gt_PB_tam + gt_PB_tel

In [None]:
# PB confusion matrix: rows are ground-truth ids, columns are predicted ids,
# both ordered by the keys of lang_dict.
confusion_matrix = metrics.confusion_matrix(gt_PB, predicted_PB, labels=list(lang_dict.keys()))
print(confusion_matrix)

[[340   0   7   0   0   0   0   0   3   0   5   4]
 [  4 169   0   0   2   1   0   0   3   0   0   0]
 [  5   0 103   0   4   1   0   0   3   0   0   0]
 [  1   0   0 175   0   0   0   0   0   0   1   2]
 [  1   2  10   3 155   0   0   4   0   1   0   3]
 [  1   1  10   4   0 170   1   0   7   0   1   2]
 [  8   1   4   2   1   0 165   0  11   0   0   4]
 [ 19   0   0   0   0   0   6  89   2   0   0   1]
 [  1   2   1   0   1   0   2   1 187   0   0   4]
 [  0   0   0  59   5   0   0   0   0  60   0   0]
 [  0   3   0  16   0   1   1   0   4   0  97   3]
 [  2   1   1   4   2   0   3   0   3   0   1 177]]


In [None]:
# classification_report takes (y_true, y_pred). Passing them reversed swaps
# precision with recall and reports support per predicted class instead of
# per true class, so the ground truth must come first.
report_PB = metrics.classification_report(gt_PB, predicted_PB)  # precision, recall, f1-score, support
print(report_PB)

              precision    recall  f1-score   support

           0       0.95      0.89      0.92       382
           1       0.94      0.94      0.94       179
           2       0.89      0.76      0.82       136
           3       0.98      0.67      0.79       263
           4       0.87      0.91      0.89       170
           5       0.86      0.98      0.92       173
           6       0.84      0.93      0.88       178
           7       0.76      0.95      0.84        94
           8       0.94      0.84      0.89       223
           9       0.48      0.98      0.65        61
          10       0.78      0.92      0.84       105
          11       0.91      0.89      0.90       200

    accuracy                           0.87      2164
   macro avg       0.85      0.89      0.86      2164
weighted avg       0.89      0.87      0.87      2164



In [None]:
from sklearn.metrics import accuracy_score

# Overall accuracy on the PB test set, as a percentage.
accuracy_score(gt_PB, predicted_PB)*100

87.1996303142329

#### YT Test Data

In [None]:
# Predict a class id for every file of each language's YT test set.
predicted_YT_asm = system_predict(asm_yttest)
predicted_YT_ben = system_predict(ben_yttest)
predicted_YT_eng = system_predict(eng_yttest)
predicted_YT_guj = system_predict(guj_yttest)
predicted_YT_hin = system_predict(hin_yttest)
predicted_YT_kan = system_predict(kan_yttest)
predicted_YT_mal = system_predict(mal_yttest)
predicted_YT_mar = system_predict(mar_yttest)
predicted_YT_odi = system_predict(odi_yttest)
predicted_YT_pun = system_predict(pun_yttest)
predicted_YT_tam = system_predict(tam_yttest)
predicted_YT_tel = system_predict(tel_yttest)

In [None]:
# Ground truth per language: the language's own class id, repeated once per
# prediction so both lists have the same length.
gt_YT_asm = get_gt(0, len(predicted_YT_asm))
gt_YT_ben = get_gt(1, len(predicted_YT_ben))
gt_YT_eng = get_gt(2, len(predicted_YT_eng))
gt_YT_guj = get_gt(3, len(predicted_YT_guj))
gt_YT_hin = get_gt(4, len(predicted_YT_hin))
gt_YT_kan = get_gt(5, len(predicted_YT_kan))
gt_YT_mal = get_gt(6, len(predicted_YT_mal))
gt_YT_mar = get_gt(7, len(predicted_YT_mar))
gt_YT_odi = get_gt(8, len(predicted_YT_odi))
gt_YT_pun = get_gt(9, len(predicted_YT_pun))
gt_YT_tam = get_gt(10, len(predicted_YT_tam))
gt_YT_tel = get_gt(11, len(predicted_YT_tel))

In [None]:
# gt

In [None]:
# Join the per-language lists in the same language order on both lines, so
# prediction k and ground truth k refer to the same test file.
predicted_YT = predicted_YT_asm + predicted_YT_ben + predicted_YT_eng + predicted_YT_guj + predicted_YT_hin + predicted_YT_kan + predicted_YT_mal + predicted_YT_mar + predicted_YT_odi + predicted_YT_pun + predicted_YT_tam + predicted_YT_tel
gt_YT = gt_YT_asm + gt_YT_ben + gt_YT_eng + gt_YT_guj + gt_YT_hin + gt_YT_kan + gt_YT_mal + gt_YT_mar + gt_YT_odi + gt_YT_pun + gt_YT_tam + gt_YT_tel

In [None]:
# YT confusion matrix: rows are ground-truth ids, columns are predicted ids,
# both ordered by the keys of lang_dict. Reuses the name `confusion_matrix`,
# overwriting the PB matrix.
confusion_matrix = metrics.confusion_matrix(gt_YT, predicted_YT, labels=list(lang_dict.keys()))
print(confusion_matrix)

[[29 18 22  0  0  2 37  0 70  0  0  2]
 [ 4 11 41  0  2 18 36 15 43  0  0 10]
 [ 4  4 15  0  0 38 27  0 37  0  0  1]
 [ 2  0 22  0 23 20 73 19 10  0  4  8]
 [ 1  7  1  0  0 26 68  0 62  0  0 16]
 [20 13 34  0  6 16 30 13 49  0  0  0]
 [ 3 36  7 25 13 19  8 19 22  3 18  7]
 [ 5  1 11  0  0  1 47  6 31  0  0 17]
 [29 22 28  0  0  0  7 31 55  0  0  8]
 [16  0  4  0  3  0 17 12 68  0  0  1]
 [ 4  7  2  3 25  0 13  0 39  5 18  1]
 [ 2 13  3 15  4 16 33 27 40  0 15 10]]


In [None]:
# classification_report takes (y_true, y_pred). Passing them reversed swaps
# precision with recall and reports support per predicted class instead of
# per true class, so the ground truth must come first.
report_YT = metrics.classification_report(gt_YT, predicted_YT)  # precision, recall, f1-score, support
print(report_YT)

              precision    recall  f1-score   support

           0       0.16      0.24      0.19       119
           1       0.06      0.08      0.07       132
           2       0.12      0.08      0.09       190
           3       0.00      0.00      0.00        43
           4       0.00      0.00      0.00        76
           5       0.09      0.10      0.09       156
           6       0.04      0.02      0.03       396
           7       0.05      0.04      0.05       142
           8       0.31      0.10      0.16       526
           9       0.00      0.00      0.00         8
          10       0.15      0.33      0.21        55
          11       0.06      0.12      0.08        81

    accuracy                           0.09      1924
   macro avg       0.09      0.09      0.08      1924
weighted avg       0.14      0.09      0.09      1924



In [None]:
from sklearn.metrics import accuracy_score
# Overall accuracy on the YT test set, as a percentage.
accuracy_score(gt_YT, predicted_YT)*100

8.731808731808732

**Per-language accuracy on the Prasar Bharati (PB) test set**

In [None]:
# Per-language accuracy on the PB test set. Every ground-truth list holds a
# single class id, so each value is the fraction of that language's files
# that were classified correctly. The 'ben' line was missing from this list.
print("Accuracy of asm",accuracy_score(gt_PB_asm, predicted_PB_asm))
print("Accuracy of ben",accuracy_score(gt_PB_ben, predicted_PB_ben))
print("Accuracy of eng",accuracy_score(gt_PB_eng, predicted_PB_eng))
print("Accuracy of guj",accuracy_score(gt_PB_guj, predicted_PB_guj))
print("Accuracy of hin",accuracy_score(gt_PB_hin, predicted_PB_hin))
print("Accuracy of kan",accuracy_score(gt_PB_kan, predicted_PB_kan))
print("Accuracy of mal",accuracy_score(gt_PB_mal, predicted_PB_mal))
print("Accuracy of mar",accuracy_score(gt_PB_mar, predicted_PB_mar))
print("Accuracy of odi",accuracy_score(gt_PB_odi, predicted_PB_odi))
print("Accuracy of pun",accuracy_score(gt_PB_pun, predicted_PB_pun))
print("Accuracy of tam",accuracy_score(gt_PB_tam, predicted_PB_tam))
print("Accuracy of tel",accuracy_score(gt_PB_tel, predicted_PB_tel))

Accuracy of asm 0.947075208913649
Accuracy of eng 0.8879310344827587
Accuracy of guj 0.9776536312849162
Accuracy of hin 0.8659217877094972
Accuracy of kan 0.8629441624365483
Accuracy of mal 0.8418367346938775
Accuracy of mar 0.7606837606837606
Accuracy of odi 0.9396984924623115
Accuracy of pun 0.4838709677419355
Accuracy of tam 0.776
Accuracy of tel 0.9123711340206185


**Per-language accuracy on the YouTube (YT) test set**

In [None]:
# Per-language accuracy on the YT test set. Every ground-truth list holds a
# single class id, so each value is the fraction of that language's files
# that were classified correctly.
print("Accuracy of asm",accuracy_score(gt_YT_asm, predicted_YT_asm))
print("Accuracy of ben",accuracy_score(gt_YT_ben, predicted_YT_ben))
print("Accuracy of eng",accuracy_score(gt_YT_eng, predicted_YT_eng))
print("Accuracy of guj",accuracy_score(gt_YT_guj, predicted_YT_guj))
print("Accuracy of hin",accuracy_score(gt_YT_hin, predicted_YT_hin))
print("Accuracy of kan",accuracy_score(gt_YT_kan, predicted_YT_kan))
print("Accuracy of mal",accuracy_score(gt_YT_mal, predicted_YT_mal))
print("Accuracy of mar",accuracy_score(gt_YT_mar, predicted_YT_mar))
print("Accuracy of odi",accuracy_score(gt_YT_odi, predicted_YT_odi))
print("Accuracy of pun",accuracy_score(gt_YT_pun, predicted_YT_pun))
print("Accuracy of tam",accuracy_score(gt_YT_tam, predicted_YT_tam))
print("Accuracy of tel",accuracy_score(gt_YT_tel, predicted_YT_tel))


Accuracy of asm 0.16111111111111112
Accuracy of ben 0.06111111111111111
Accuracy of eng 0.11904761904761904
Accuracy of guj 0.0
Accuracy of hin 0.0
Accuracy of kan 0.08839779005524862
Accuracy of mal 0.044444444444444446
Accuracy of mar 0.05042016806722689
Accuracy of odi 0.3055555555555556
Accuracy of pun 0.0
Accuracy of tam 0.15384615384615385
Accuracy of tel 0.056179775280898875
