In [None]:
from google.colab import drive

drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [None]:
#Importing the dataset
import pandas as pd

df = pd.read_csv('/content/gdrive/My Drive/Data/sentences.csv',
                            sep='\t', 
                            encoding='utf8', 
                            index_col=0,
                            names=['lang','text'])

In [None]:
filt = [True if 40<=len(s)<=500 else False for s in df['text']]
df = df[filt]

# We will train this on only 8 languages - 
lang_filter = ['deu', 'eng', 'fra', 'ita', 'por', 'spa', 'ara', 'ben']

def clean_data2(data,langlist):
  '''
  To filter the sentences only from lang_filter
  '''
  data = data.loc[data['lang'].isin(langlist)]
  return data

df_new = clean_data2(df,lang_filter)

#Trimming the dataset 
n = 10000
df_red = df_new.groupby('lang').apply(lambda x: x.sample(min(n,len(x)))).reset_index(drop=True)

In [None]:
from sklearn.model_selection import train_test_split

train, test =  train_test_split(df_red, test_size=0.20, random_state=42)
valid, test = train_test_split(test, test_size=0.15, random_state=30)

print('Train shape: ',train.shape)
print('Valid shape: ',valid.shape)
print('Test shape: ',test.shape)

Train shape:  (51940, 2)
Valid shape:  (11038, 2)
Test shape:  (1948, 2)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Finding the bigrams and creating a feature set to creat the training vocabulary

def bigram(text,n_feat):
    vectorizer = CountVectorizer(analyzer='char',
                            ngram_range=(2,2),max_features=n_feat)
    
    X = vectorizer.fit_transform(text)
    feature_names = vectorizer.get_feature_names()    
    return feature_names

features = {}
features_set = set()

for l in lang_filter:
    corpus = train[train.lang==l]['text']
    bigrams = bigram(corpus,250)
    features[l] = bigrams
    features_set.update(bigrams)

# Vocab created to feed into the Count Vectorizer for training data
vocab = dict()
for i,feat in enumerate(features_set):
    vocab[feat]=i
print('Vocab')
print(vocab)

Vocab
{'রব': 0, 'y.': 1, 'w ': 2, 'i,': 3, 'c ': 4, 'إل': 5, 'ed': 6, 'ট ': 7, 'েষ': 8, 'ঙ্': 9, 'hu': 10, ' ছ': 11, 'un': 12, ' r': 13, 'بة': 14, 'رو': 15, 'يو': 16, 'ul': 17, 'لي': 18, 'ej': 19, 's.': 20, 'هي': 21, 'wu': 22, 'كث': 23, 'عي': 24, 'ré': 25, 'oy': 26, '، ': 27, 'া।': 28, ' à': 29, 'ন?': 30, 'مت': 31, 'ُ ': 32, ' m': 33, 'h,': 34, 'فر': 35, 'لو': 36, 'লো': 37, 'ei': 38, 'له': 39, 'ér': 40, 'চা': 41, 'ow': 42, 'oa': 43, 'কি': 44, 'z ': 45, 'sc': 46, 'োম': 47, 'াট': 48, 'াঁ': 49, 'rt': 50, 'ا.': 51, 'كي': 52, 'খা': 53, 'ণ ': 54, 'াছ': 55, ' ك': 56, ' u': 57, 'wh': 58, "'s": 59, 'nh': 60, 'عم': 61, 'ছা': 62, 'rò': 63, 'ال': 64, 'تر': 65, 'ন ': 66, 'ep': 67, 'hö': 68, 'ls': 69, ' ন': 70, 'fa': 71, 'wo': 72, 'nt': 73, 'نّ': 74, 'de': 75, 'tà': 76, 'س ': 77, 'মা': 78, 'eb': 79, '"i': 80, 'tu': 81, 'll': 82, '্ষ': 83, 'am': 84, 'كر': 85, 'ju': 86, ' ঘ': 87, 'má': 88, 'tw': 89, 'াব': 90, 'টে': 91, 'rg': 92, 'ية': 93, ' প': 94, 'نه': 95, 'اف': 96, ' ش': 97, 'ء ': 98, 'ت ': 99, 'e?

In [None]:
vectorizer = CountVectorizer(analyzer='char',
                             ngram_range=(2, 2),
                            vocabulary=vocab)

# Creating the train feature matrix 
data_x = train['text']   
X = vectorizer.fit_transform(data_x)
feature_names = vectorizer.get_feature_names()

train_features = pd.DataFrame(data=X.toarray(),columns=feature_names)

In [None]:
print(train_features)
'''
The columns in train_feat tells you about list of bigrams and the rows represent the sentences 
The zeros and ones indicate the presence or absence of the particular bigram in the sentence
'''

       রব  y.  w   i,  c   إل  ed  ট   েষ  ...  ek  zi   ص  হব  িত   è  ê   mm  ে।
0       0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0
1       0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0
2       0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0
3       0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   1   0
4       0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0
...    ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ..  ..  ..  ..  ..  ..  ..  ..
51935   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0
51936   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0
51937   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0
51938   0   0   0   0   0   0   1   0   0  ...   0   0   0   0   0   0   0   0   0
51939   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0

[51

'\nThe columns in train_feat tells you about list of bigrams and the rows represent the sentences \nThe zeros and ones indicate the presence or absence of the particular bigram in the sentence\n'

In [None]:
def normalized(train_features, feat):
  tr_min = train_features.min()
  tr_max = train_features.max()
  feat = (feat - tr_min)/(tr_max-tr_min)
  return feat

train_features_norm = normalized(train_features,train_features)

#Add target variable 
train_features_norm['lang_target'] = list(train['lang'])

# Data Processing for the test and validation set

valid_text = valid['text']   
X = vectorizer.fit_transform(valid_text)

valid_features = pd.DataFrame(data=X.toarray(),columns=feature_names)
valid_features = normalized(train_features,valid_features)
valid_features['lang_target'] = list(valid['lang'])


test_text = test['text']   
X = vectorizer.fit_transform(test_text)

test_features = pd.DataFrame(data=X.toarray(),columns=feature_names)
test_features = normalized(train_features, test_features)
test_features['lang_target'] = list(test['lang'])

In [None]:
print(test_features.shape)
print(valid_features.shape)

(1948, 975)
(11038, 975)


In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

lang_filter = ['deu', 'eng', 'fra', 'ita', 'por', 'spa', 'ara', 'ben']

encoder = LabelEncoder()
encoder.fit(lang_filter)

X_train = train_features_norm.drop('lang_target',axis = 1)
Y_train = np_utils.to_categorical(encoder.transform(train_features_norm['lang_target']))

X_val = valid_features.drop('lang_target',axis = 1)
Y_val = np_utils.to_categorical(encoder.transform(valid_features['lang_target']))

X_test = test_features.drop('lang_target',axis = 1)
Y_test = test_features['lang_target']
Y_test = np_utils.to_categorical(encoder.transform(test_features['lang_target']))

print(X_train.shape,Y_train.shape)
print(X_val.shape,Y_val.shape)
print(X_test.shape,Y_test.shape)

(51940, 974) (51940, 8)
(11038, 974) (11038, 8)
(1948, 974) (1948, 8)


In [None]:
import numpy as np
X_train = np.expand_dims(X_train, axis=1)
X_val = np.expand_dims(X_val,axis=1)
X_test = np.expand_dims(X_test, axis=1)

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(51940, 1, 974)
(11038, 1, 974)
(1948, 1, 974)


In [56]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
import keras

def evaluate_model(trainX, trainy, testX, testy):
  verbose, epochs, batch_size = 1, 10, 32
  n_timesteps, n_features, n_outputs = trainX.shape[1], trainX.shape[2], trainy.shape[1]
  model = Sequential()
  model.add(Conv1D(filters=64, kernel_size=1, activation='relu', input_shape=(n_timesteps,n_features)))
  model.add(Dropout(0.1))
  model.add(Conv1D(filters=64, kernel_size=1, activation='relu'))
  model.add(Flatten())
  model.add(Dense(100, activation='relu'))
  model.add(Dense(n_outputs, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.fit(trainX, trainy, epochs=epochs, batch_size=batch_size, validation_data=(X_val, Y_val), verbose=verbose)
  _, accuracy = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=0)
  return accuracy

def summarize_results(scores):
	print(scores)
 

def run_experiment(repeats,trainX,trainy,testX,testy):
	scores = list()
	for r in range(repeats):
		score = evaluate_model(trainX, trainy, testX, testy)
		score = score * 100.0
		print('>#%d: %.3f' % (r+1, score))
		scores.append(score)
	summarize_results(scores)

In [57]:
run_experiment(10,X_train,Y_train,X_test,Y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
>#1: 99.333
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
>#2: 99.487
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
>#3: 99.435
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
>#4: 99.538
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
>#5: 99.333
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
>#6: 99.384
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
>#7: 99.384
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
>#8: 99.589
Epoch 1/10
Epoch

Accuracy of CNN :

* mean: 99.4353187084198
* std: 0.10767958798933588