In [1]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


#Using ANN 


*   Trained on 8 languages
*   Before running download the sentences.csv from https://downloads.tatoeba.org/exports/

In [2]:
#Importing the dataset
import pandas as pd

df = pd.read_csv('/content/gdrive/My Drive/Data/sentences.csv',
                            sep='\t', 
                            encoding='utf8', 
                            index_col=0,
                            names=['lang','text'])




In [3]:
filt = [True if 40<=len(s)<=500 else False for s in df['text']]
df = df[filt]


In [4]:
# We will train this on only 8 languages - 
lang_filter = ['deu', 'eng', 'fra', 'ita', 'por', 'spa', 'ara', 'ben']

def clean_data2(data,langlist):
  data = data.loc[data['lang'].isin(langlist)]
  return data

df_new = clean_data2(df,lang_filter)

In [5]:
df_new

Unnamed: 0,lang,text
81,deu,Heute ist der 18. Juni und das ist der Geburts...
82,deu,"Herzlichen Glückwunsch zum Geburtstag, Muiriel!"
89,deu,"Ich weiß einfach nicht, was ich sagen soll."
94,deu,Aus irgendeinem Grund hat das Mikrofon gerade ...
96,deu,Die Ausbildung in dieser Welt enttäuscht mich.
...,...,...
9779378,eng,There's nothing that can't be bought with money.
9779379,eng,"I got up earlier than usual, so I could catch ..."
9779381,eng,People don't read novels as much as they used to.
9779392,por,Os garotos estavam completamente cobertos de l...


In [6]:
#Trimming the dataset 
n = 10000
df_red = df_new.groupby('lang').apply(lambda x: x.sample(min(n,len(x)))).reset_index(drop=True)

In [7]:
from sklearn.model_selection import train_test_split

train, test =  train_test_split(df_red, test_size=0.20, random_state=42)

In [8]:
valid, test = train_test_split(test, test_size=0.7, random_state=30)

In [9]:
print(train.shape)
print(valid.shape)
print(test.shape)
train.to_csv('/content/gdrive/My Drive/Data/train.csv')
valid.to_csv('/content/gdrive/My Drive/Data/valid.csv')
test.to_csv('/content/gdrive/My Drive/Data/test.csv')


(51940, 2)
(3895, 2)
(9091, 2)


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

def bigram(text,n_feat):
    vectorizer = CountVectorizer(analyzer='char',
                            ngram_range=(2,2),max_features=n_feat)
    
    X = vectorizer.fit_transform(text)
    feature_names = vectorizer.get_feature_names()    
    return feature_names

features = {}
features_set = set()

for l in lang_filter:
    corpus = train[train.lang==l]['text']
    bigrams = bigram(corpus,250)
    features[l] = bigrams
    features_set.update(bigrams)
    
print(features_set)

# Vocab created to feed into the Count Vectorizer for training data
vocab = dict()
for i,f in enumerate(features_set):
    vocab[f]=i

print(vocab)


{' t', 'عم', ' p', 'ro', 'بي', 'ُ ', 'ia', 'oi', 'ذي', "u'", 'tu', 'né', ' ر', 'mi', 'sn', 'اح', 'a?', 'ld', ' ন', 'se', 'fu', ' ل', ' à', 'op', 'ময', 'تق', ' e', 'াস', 'لع', 'কট', 'hé', 'سب', 'ct', ' c', 'wi', 'মধ', 'ly', ' ঘ', 'বা', 'لم', 'ই।', 'جم', 'in', 'e,', 'عد', 'তি', 'হল', "'é", ' ذ', ' ج', ' ص', 'لا', 'রো', 'হ ', 'ci', 'no', 'হয', 'ست', 'نس', 'tã', 'ত ', ' ح', 'y ', 's.', 'اه', 'دا', 'রি', 'rà', 'sc', 'a.', ' ফ', 'na', 'zu', 'ué', 'rt', 'াল', 've', 'قر', 'ep', 'ev', 'أو', 'اض', ' গ', 'পন', 'ب ', 'كي', 'za', ' ম', 'لر', 'hl', 'ام', 'kl', 'el', 'mu', 'lh', 'অপ', 'ম।', 'ol', 'ez', 'qu', 'تط', 'ذا', 'ó ', 'و ', 'ে ', 'েষ', 'o?', 'ze', 'হা', 'as', 'be', 'না', 'وي', 'az', 'ab', 'ap', 'اج', 'وا', 'ي.', 'ال', '"i', 'এখ', 'দি', 'ea', "e'", 'াঁ', 'lu', 'ুগ', 'ّة', 'াত', 'বর', 'ن.', 'if', 'াদ', 'gr', 'স ', ' ق', '? ', 'ষা', ' b', 'gh', 'از', 'د ', 'ুম', 'lm', 'ن ', 'nk', 'u ', 'ju', 'eh', 'أخ', ' ش', '। ', 'sp', 'كا', 'عة', 'حي', ' উ', 'طي', 'ße', 'س ', ' ছ', 'n ', 'ex', ' q', 'جد', 'n.

In [11]:
vectorizer = CountVectorizer(analyzer='char',
                             ngram_range=(2, 2),
                            vocabulary=vocab)

data_x = train['text']   
X = vectorizer.fit_transform(data_x)
feature_names = vectorizer.get_feature_names()

train_features = pd.DataFrame(data=X.toarray(),columns=feature_names)

In [12]:
print(train_features)
'''
The columns in train_feat tells you about list of bigrams and the rows represent the sentences 
The zeros and ones indicate the presence or absence of the particular bigram in the sentence
'''

        t  عم   p  ro  بي  ُ   ia  oi  ذي  ...   ع  iz  dn  يت  ّ   ক্  'a  nu  حق
0       0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0
1       0   0   2   0   0   0   0   1   0  ...   0   0   0   0   0   0   1   0   0
2       0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0
3       1   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0
4       1   0   1   1   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0
...    ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ..  ..  ..  ..  ..  ..  ..  ..
51935   0   0   0   1   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0
51936   0   0   0   0   0   0   1   0   0  ...   0   0   0   0   0   0   0   0   0
51937   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   1   0   0   0   0
51938   2   0   1   1   0   0   0   1   0  ...   0   0   0   0   0   0   0   0   0
51939   0   0   1   1   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0

[51

'\nThe columns in train_feat tells you about list of bigrams and the rows represent the sentences \nThe zeros and ones indicate the presence or absence of the particular bigram in the sentence\n'

In [13]:
def normalized(train_features, feat):
  tr_min = train_features.min()
  tr_max = train_features.max()
  feat = (feat - tr_min)/(tr_max-tr_min)
  return feat

train_features_norm = normalized(train_features,train_features)

#Add target variable 
train_features_norm['lang_target'] = list(train['lang'])

train_features_norm

Unnamed: 0,t,عم,p,ro,بي,ُ,ia,oi,ذي,u',tu,né,ر,mi,sn,اح,a?,ld,ন,se,fu,ل,à,op,ময,تق,e,াস,لع,কট,hé,سب,ct,c,wi,মধ,ly,ঘ,বা,لم,...,কে,è,به,om,oe,ة.,لإ,j',vu,aç,لك,له,ن,cr,আ,an,ay,ok,োন,l,حد,ক,lc,pi,নে,v,mé,র্,fü,وق,ع,iz,dn,يت,ّ,ক্,'a,nu,حق,lang_target
0,0.000000,0.0,0.000000,0.000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.00,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.153846,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,ita
1,0.000000,0.0,0.071429,0.000,0.0,0.0,0.000000,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.00,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.00,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,fra
2,0.000000,0.0,0.000000,0.000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.00,0.00,0.0,0.0,0.0,0.0,0.153846,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.076923,0.00,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,spa
3,0.076923,0.0,0.000000,0.000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222222,0.00,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.2,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,deu
4,0.076923,0.0,0.035714,0.125,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.25,0.00,0.0,0.0,0.0,0.0,0.153846,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.230769,0.00,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,fra
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51935,0.000000,0.0,0.000000,0.125,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.00,0.00,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,spa
51936,0.000000,0.0,0.000000,0.000,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.000000,0.00,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.076923,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,ita
51937,0.000000,0.0,0.000000,0.000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.00,0.25,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.111111,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.00,0.0,0.0,ara
51938,0.153846,0.0,0.035714,0.125,0.0,0.0,0.000000,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.00,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.076923,0.25,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,eng


In [14]:
# Data Processing for the test and validation set

valid_text = valid['text']   
X = vectorizer.fit_transform(valid_text)

valid_features = pd.DataFrame(data=X.toarray(),columns=feature_names)
valid_features = normalized(train_features,valid_features)
valid_features['lang_target'] = list(valid['lang'])


test_text = test['text']   
X = vectorizer.fit_transform(test_text)

test_features = pd.DataFrame(data=X.toarray(),columns=feature_names)
test_features = normalized(train_features, test_features)
test_features['lang'] = list(test['lang'])

In [15]:
print(test_features.shape)
print(valid_features.shape)

(9091, 978)
(3895, 978)


In [16]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

lang_filter = ['deu', 'eng', 'fra', 'ita', 'por', 'spa', 'ara', 'ben']

encoder = LabelEncoder()
encoder.fit(lang_filter)

X_train = train_features_norm.drop('lang_target',axis = 1)
Y_train = np_utils.to_categorical(encoder.transform(train_features_norm['lang_target']))

X_val = valid_features.drop('lang_target',axis = 1)
Y_val = np_utils.to_categorical(encoder.transform(valid_features['lang_target']))

X_test = test_features.drop('lang',axis = 1)
Y_test = np_utils.to_categorical(encoder.transform(test_features['lang']))

print(X_train.shape,Y_train.shape)
print(X_val.shape,Y_val.shape)
print(X_test.shape,Y_test.shape)





(51940, 977) (51940, 8)
(3895, 977) (3895, 8)
(9091, 977) (9091, 8)


In [17]:
input_dim = X_train.shape[1]
print('Input Dim into Layer : ',input_dim)

Input Dim into Layer :  977


In [18]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
import keras

def fit_model(input_dim ,nodes,epochs,batch_size):
    model = Sequential()
    model.add(Dense(nodes[0], input_dim=input_dim, activation='relu'))
    model.add(Dense(nodes[1], activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(nodes[2], activation='relu'))
    model.add(Dense(8, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size)
    
    train_acc = model.evaluate(X_train, Y_train)
    val_acc = model.evaluate(X_val, Y_val)
    
    return round(train_acc[1]*100,2),round(val_acc[1]*100,2)

nodes = [[100,100,50],[200,200,100],[300,200,100],[500,500,250]]
epochs = [1,2,3,4]
batch_size = [10,100,1000]

results = []
i = 0

for n in nodes:
    print("MODEL: ", i)
    for e in epochs:
        for b in batch_size:
            result = {}
            
            result['model'] = i
            result['nodes'] = n
            result['epochs'] = e
            result['batch_size'] = b
            result['train'], result['valid'] = fit_model(input_dim,n,e,b)
            
            results.append(result)
            i+= 1

results_final = pd.DataFrame(results)

MODEL:  0
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
MODEL:  12
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
MODEL:  24
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
MODEL:  36
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3

In [19]:
print(results_final[results_final.valid == results_final.valid.max()])
print(results_final[results_final.valid>99.2])

    model            nodes  epochs  batch_size  train  valid
15     15  [200, 200, 100]       2          10   99.7  99.41
    model            nodes  epochs  batch_size  train  valid
6       6   [100, 100, 50]       3          10  99.78  99.23
15     15  [200, 200, 100]       2          10  99.70  99.41
19     19  [200, 200, 100]       3         100  99.72  99.33
28     28  [300, 200, 100]       2         100  99.74  99.26
33     33  [300, 200, 100]       4          10  99.88  99.26
44     44  [500, 500, 250]       3        1000  99.74  99.26


In [20]:
#Final Model
model = Sequential()
model.add(Dense(200, input_dim=input_dim, activation='relu'))
model.add(Dense(200, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(8, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, Y_train, epochs=2, batch_size=100)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f07bc76af98>

In [21]:
from sklearn.metrics import accuracy_score

predicted = model.predict_classes(X_test)
Y_pred = encoder.inverse_transform(predicted)
Y_test = test_features['lang']
accuracy = accuracy_score(Y_test,Y_pred)
print(accuracy)




0.993180068199318


Accuracy of best performing ANN model : 99.31 %