**Import modules**

In [1]:
import pandas as pd
import tarfile
import os
import time
from sklearn.utils import shuffle
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from itertools import permutations
import collections
import string

import seaborn as sns
sns.set(style="ticks")

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import pickle

from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import GradientBoostingClassifier

**Extract dataset archive**

In [25]:
languages = tarfile.open('./europarl.tgz')

In [26]:
languages.extractall()

In [27]:
labels = {
    'German':'de',
    'English': 'en',
    'Spanish':'es',
    'French':'fr',
    'Hungarian':'hu',
    'Romanian':'ro'
}

**Convert txt to csv**

In [29]:
path = 'txt'
directories = os.listdir(path)

In [58]:
start_time = time.time()
print('Converting txt to csv begun')
for lang in directories:
    print('Converting '+lang+' to csv begun')
    paths = path + '/' + lang
    files = os.listdir(paths)
    lang_type = [lang]
    lang_type = open(lang+'.csv','w')
    for file in files:
        try:
            with open(paths+'/'+file, 'r', encoding = 'utf-8') as lines:
                for line in lines:
                    lang_type.write(line)
        except UnicodeDecodeError:
            with open(paths+'/'+file, 'r', encoding = 'unicode_escape') as lines:
                for line in lines:
                    lang_type.write(line)
        except UnicodeEncodeError:
            pass
    print("Conversion done \n")
                    
print("--- %s seconds ---" % (time.time() - start_time))
print("Converting from txt to csv done")

Converting txt to csv begun
Converting de to csv begun
Conversion done 

Converting en to csv begun
Conversion done 

Converting es to csv begun
Conversion done 

Converting fr to csv begun
Conversion done 

Converting hu to csv begun
Conversion done 

Converting ro to csv begun
Conversion done 

--- 292.29198837280273 seconds ---
Converting from txt to csv done


In [68]:
#creating empty dataframe
df = pd.DataFrame(columns = ['Sentences', 'Language_Type'])

**Concatenating all of the csv files + a bit of text formatting**

In [71]:
for label in labels.keys():
    print(label, labels[label],'\n')
    try:
        start_time = time.time()
        
        #loading data for individual languages
        data = pd.read_csv(labels[label] + '.csv', sep = 'delimiter', header = None,
                            engine = 'python', index_col = False, encoding = 'latin-1')
        data.columns = ['Sentences']
        data = data[~data.Sentences.str.contains("<")]
        characters = ['1','2','3','4','5','6','7','8','9','0', '@',"'",'#','$','%',
                    '&','/','(',')','"','.',',','?','/','!','=',':',';']
        
        print(label, 'Data loaded, Pre-processing begun')
        for character in characters:
            data.loc[:, 'Sentences'] = data.loc[:, 'Sentences'].str.replace(r''+character, 
                                                                    "")
        data['Language_Type'] = label
        df = pd.concat([df, data], axis = 0, ignore_index = True)
        print("--- %s seconds ---" % (time.time() - start_time))
        print('Pre-processing of ',label,' finished \n')
        
    except:
        start_time = time.time()
        data = pd.read_csv(labels[label]+'.csv', sep='delimiter', 
                                           header=None, error_bad_lines=False, 
                                           index_col = False)
        data.columns = ['Sentences']
        data = data[~data.Sentences.str.contains("<")]
        characters = ['1','2','3','4','5','6','7','8','9','0', '@',"'",'#','$','%',
                    '&','/','(',')','"','.',',','?','/','!','=',':',';']
        print(label, 'Data loaded, Pre-processing begun (encoding error)')
        
        for character in characters:
            data.loc[:, 'Sentences'] = data.loc[:, 'Sentences'].str.replace(r''+character, 
                                                                    "")
        
        data['Language_Type'] = label
        df = pd.concat([df, data], axis = 0, ignore_index = True)
        
        print("--- %s seconds ---" % (time.time() - start_time))
        print('Pre-processing of ',label,' finished \n')
 

German de 

German Data loaded, Pre-processing begun


  data.loc[:, 'Sentences'] = data.loc[:, 'Sentences'].str.replace(r''+character,


--- 35.7468466758728 seconds ---
Pre-processing of  German  finished 

English en 

English Data loaded, Pre-processing begun
--- 36.2200562953949 seconds ---
Pre-processing of  English  finished 

Spanish es 

Spanish Data loaded, Pre-processing begun
--- 40.12695789337158 seconds ---
Pre-processing of  Spanish  finished 

French fr 

French Data loaded, Pre-processing begun
--- 42.54697823524475 seconds ---
Pre-processing of  French  finished 

Hungarian hu 

Hungarian Data loaded, Pre-processing begun
--- 6.016124725341797 seconds ---
Pre-processing of  Hungarian  finished 

Romanian ro 

Romanian Data loaded, Pre-processing begun
--- 5.480081558227539 seconds ---
Pre-processing of  Romanian  finished 



In [72]:
df

Unnamed: 0,Sentences,Language,Language_Type
0,Genehmigung des Protokolls der vorangegangenen...,,German
1,Das Protokoll der letzten Sitzung wurde verteilt,,German
2,Gibt es Einwände,,German
3,Herr Präsident ich entspreche hiermit einer vo...,,German
4,Erstens glaube ich daß der vom Vorsitzenden de...,,German
...,...,...,...
1985471,Reluarea sesiunii,,Romanian
1985472,Transferuri de credite a se vedea procesul-verbal,,Romanian
1985473,Depunerea documentelor a se vedea procesul-verbal,,Romanian
1985474,Depunerea documentelor a se vedea procesul-verbal,,Romanian


In [74]:
df['Language_Type'].unique()

array(['German', 'English', 'Spanish', 'French', 'Hungarian', 'Romanian'],
      dtype=object)

In [75]:
df = df.drop(['Language'], axis = 1)

**The dataset, which is going to be used in the upcoming steps**

In [76]:
df

Unnamed: 0,Sentences,Language_Type
0,Genehmigung des Protokolls der vorangegangenen...,German
1,Das Protokoll der letzten Sitzung wurde verteilt,German
2,Gibt es Einwände,German
3,Herr Präsident ich entspreche hiermit einer vo...,German
4,Erstens glaube ich daß der vom Vorsitzenden de...,German
...,...,...
1985471,Reluarea sesiunii,Romanian
1985472,Transferuri de credite a se vedea procesul-verbal,Romanian
1985473,Depunerea documentelor a se vedea procesul-verbal,Romanian
1985474,Depunerea documentelor a se vedea procesul-verbal,Romanian


In [77]:
df.to_csv('eu_lang_data.csv')

In [79]:
df['Language_Type'].value_counts()

French       501840
Spanish      495746
German       492307
English      489049
Hungarian      5120
Romanian       1414
Name: Language_Type, dtype: int64

In [82]:
df2 = df.drop_duplicates(subset=['Sentences'])

In [83]:
df2

Unnamed: 0,Sentences,Language_Type
0,Genehmigung des Protokolls der vorangegangenen...,German
1,Das Protokoll der letzten Sitzung wurde verteilt,German
2,Gibt es Einwände,German
3,Herr Präsident ich entspreche hiermit einer vo...,German
4,Erstens glaube ich daß der vom Vorsitzenden de...,German
...,...,...
1985462,Diverse,Romanian
1985463,Summitul UE-Rusia,Romanian
1985464,- În timpul votului,Romanian
1985466,Madagascar dezbatere,Romanian


**Shuffle the dataset**

In [85]:
df2 = shuffle(df2)

In [86]:
df2

Unnamed: 0,Sentences,Language_Type
1345508,El informe del Parlamento Europeo que ataca el...,Spanish
1051256,En segundo lugar desde la Unión Europea tambié...,Spanish
407002,Abfalldeponien,German
921683,Mr President Mr Böschs report on the independe...,English
35310,Wir sollten nicht so oberflächlich und willfäh...,German
...,...,...
479356,Was Herrn Langen und die vorgeschlagenen Änder...,German
515964,In particular additional measures are required...,English
1538828,Enfin Mesdames et Messieurs les Députés je sui...,French
1068786,Resumiendo sólo puedo reiterar una vez más que...,Spanish


**Lower sentences**

In [87]:
df2['Sentences'] = [s.lower() for s in df2['Sentences']]

In [95]:
df2

Unnamed: 0,Sentences,Language_Type
0,el informe del parlamento europeo que ataca el...,Spanish
1,en segundo lugar desde la unión europea tambié...,Spanish
2,abfalldeponien,German
3,mr president mr böschs report on the independe...,English
4,wir sollten nicht so oberflächlich und willfäh...,German
...,...,...
1858646,was herrn langen und die vorgeschlagenen änder...,German
1858647,in particular additional measures are required...,English
1858648,enfin mesdames et messieurs les députés je sui...,French
1858649,resumiendo sólo puedo reiterar una vez más que...,Spanish


In [92]:
df2 = df2.reset_index()

In [94]:
df2 = df2.drop(['index'], axis = 1)

**Getting rid of stopwords**

In [100]:
stop_words_en = set(stopwords.words('english')) 
stop_words_de = set(stopwords.words('german')) 
stop_words_es = set(stopwords.words('spanish')) 
stop_words_fr = set(stopwords.words('french')) 
stop_words_hu = set(stopwords.words('hungarian')) 
stop_words_ro = set(stopwords.words('romanian')) 

In [103]:
def remove_stopwords(sentence, stop_words):
    tokens_no_stopwords = [w for w in word_tokenize(sentence) if not w in stop_words]
    filtered_sentence = (" ").join(tokens_no_stopwords)
    return filtered_sentence

In [110]:
df_german = df2[df2['Language_Type'] == 'German']

In [112]:
df_german['Sentences'] = [remove_stopwords(i, stop_words_de) for i in df_german['Sentences']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_german['Sentences'] = [remove_stopwords(i, stop_words_de) for i in df_german['Sentences']]


In [115]:
df_hungarian = df2[df2['Language_Type'] == 'Hungarian']

In [117]:
df_hungarian['Sentences'] = [remove_stopwords(i, stop_words_hu) for i in df_hungarian['Sentences']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hungarian['Sentences'] = [remove_stopwords(i, stop_words_hu) for i in df_hungarian['Sentences']]


In [118]:
df_romanian = df2[df2['Language_Type'] == 'Romanian']

In [119]:
df_romanian['Sentences'] = [remove_stopwords(i, stop_words_ro) for i in df_romanian['Sentences']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_romanian['Sentences'] = [remove_stopwords(i, stop_words_ro) for i in df_romanian['Sentences']]


In [120]:
df_english = df2[df2['Language_Type'] == 'English']

In [122]:
df_english['Sentences'] = [remove_stopwords(i, stop_words_en) for i in df_english['Sentences']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_english['Sentences'] = [remove_stopwords(i, stop_words_en) for i in df_english['Sentences']]


In [123]:
df_spanish = df2[df2['Language_Type'] == 'Spanish']

In [124]:
df_spanish['Sentences'] = [remove_stopwords(i, stop_words_es) for i in df_spanish['Sentences']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_spanish['Sentences'] = [remove_stopwords(i, stop_words_es) for i in df_spanish['Sentences']]


In [125]:
df_french = df2[df2['Language_Type'] == 'French']

In [126]:
df_french['Sentences'] = [remove_stopwords(i, stop_words_fr) for i in df_french['Sentences']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_french['Sentences'] = [remove_stopwords(i, stop_words_fr) for i in df_french['Sentences']]


**Another concatenation of the clean dataframes**

In [127]:
df_3 = pd.concat([df_english, df_german], axis = 0, ignore_index = True)

In [128]:
df_3

Unnamed: 0,Sentences,Language_Type
0,mr president mr böschs report independence ucl...,English
1,mr president going back colleagues mr pomés ru...,English
2,course want europe varying speeds pioneering g...,English
3,situation face standstill contract negotiation...,English
4,time seems certain information society develop...,English
...,...,...
919334,wissenschaft forschung - wer arbeiten wer mögl...,German
919335,diesbezüglich geht weißbuch europäische verkeh...,German
919336,zweites möchte nukleare sicherheit erwähnen ku...,German
919337,schwedischen vorsitzes rat vernünftigerweise e...,German


In [129]:
df_3 = pd.concat([df_3, df_french], axis = 0, ignore_index = True)

In [131]:
df_3 = pd.concat([df_3, df_spanish], axis = 0, ignore_index = True)

In [133]:
df_3 = pd.concat([df_3, df_hungarian], axis = 0, ignore_index = True)

In [135]:
df_3 = pd.concat([df_3, df_romanian], axis = 0, ignore_index = True)

In [140]:
df_3 = df_3.reset_index()

In [142]:
df_3 = df_3.drop(['index'], axis = 1)

In [144]:
df_3 = shuffle(df_3)

In [146]:
df_3 = df_3.reset_index()
df_3 = df_3.drop(['index'], axis = 1)

**New, tidy dataset**

In [150]:
df_3

Unnamed: 0,Sentences,Language_Type
0,last but not least möchte hochachtung kommissi...,German
1,mr rehder first place tell referring resolutio...,English
2,frau kommissarin herr präsident werte kollegin...,German
3,sk quiero darle gracias proporcionarnos oportu...,Spanish
4,pouvons effet sous-estimer fait bassin méditer...,French
...,...,...
1858646,primero modo comisión trata asesoramiento cien...,Spanish
1858647,sozialdemokraten wünschen hilfe qualitätsverbe...,German
1858648,three weeks ago given honour chairing committe...,English
1858649,asimismo decir programa presentado corresponde...,Spanish


**Saving the dataframe**

In [151]:
df_3.to_csv('important_dataset_lang.csv')

**Extracting features**

In [156]:
df_3['word_count'] = df['Sentences'].apply(lambda x : len(x.split()))

In [157]:
df_3

Unnamed: 0,Sentences,Language_Type,word_count
0,last but not least möchte hochachtung kommissi...,German,6
1,mr rehder first place tell referring resolutio...,English,7
2,frau kommissarin herr präsident werte kollegin...,German,3
3,sk quiero darle gracias proporcionarnos oportu...,Spanish,34
4,pouvons effet sous-estimer fait bassin méditer...,French,76
...,...,...,...
1858646,primero modo comisión trata asesoramiento cien...,Spanish,155
1858647,sozialdemokraten wünschen hilfe qualitätsverbe...,German,108
1858648,three weeks ago given honour chairing committe...,English,64
1858649,asimismo decir programa presentado corresponde...,Spanish,75


In [162]:
df_3['character_count'] = df['Sentences'].apply(lambda x : len(x.replace(" ","")))

In [163]:
df_3['word_density'] = df_3['word_count'] / (df_3['character_count'] + 1)

In [196]:
df_3

Unnamed: 0,Sentences,Language_Type,word_count,character_count,word_density,num_double_consec_vowels,num_consec_vowels,num_vowels,num_special_vowels,vowel_density,num_unique_words,num_repeated_words,words_vs_unique,num_any_special_character,num_double_consec_consonants,num_consonants,consonant_density
0,last but not least möchte hochachtung kommissi...,German,6,49,0.120000,1,6,39,1,6.500000,39,0,6.500000,39,7,39,6.500000
1,mr rehder first place tell referring resolutio...,English,7,42,0.162791,4,11,52,0,7.428571,51,1,7.285714,53,10,53,7.571429
2,frau kommissarin herr präsident werte kollegin...,German,3,14,0.200000,0,9,28,3,9.333333,26,1,8.666667,28,7,28,9.333333
3,sk quiero darle gracias proporcionarnos oportu...,Spanish,34,212,0.159624,2,51,164,27,4.823529,144,18,4.235294,165,6,164,4.823529
4,pouvons effet sous-estimer fait bassin méditer...,French,76,468,0.162047,0,16,32,8,0.421053,30,2,0.394737,32,8,32,0.421053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1858646,primero modo comisión trata asesoramiento cien...,Spanish,155,831,0.186298,0,21,90,13,0.580645,81,9,0.522581,91,2,91,0.587097
1858647,sozialdemokraten wünschen hilfe qualitätsverbe...,German,108,582,0.185249,0,20,52,5,0.481481,57,1,0.527778,58,7,56,0.518519
1858648,three weeks ago given honour chairing committe...,English,64,310,0.205788,3,22,74,0,1.156250,70,4,1.093750,74,12,74,1.156250
1858649,asimismo decir programa presentado corresponde...,Spanish,75,393,0.190355,0,8,43,5,0.573333,43,3,0.573333,46,2,43,0.573333


In [168]:
vowels = ['a','e','i','o','u']
special_vowels = ['á','é','í','ó','ú','ü','ö']
same_consecutive_vowels = ['aa','ee', 'ii', 'oo', 'uu'] 
consecutive_vowels = [''.join(p) for p in permutations(vowels,2)]

In [169]:
df_3['num_double_consec_vowels'] = df_3['Sentences'].apply(lambda x : sum([any(c_v in a for c_v in same_consecutive_vowels) for a in x.split()]))

In [332]:
#df_3['num_consec_vowels'] = df_3['Sentences'].apply(lambda x : sum([any(c_v in a for c_v in consecutive_vowels) for a in x.split()]))
#df_3['num_vowels'] = df_3['Sentences'].apply(lambda x : sum([any(v in a for v in vowels) for a in x.split()]))

In [172]:
df_3['num_special_vowels'] = df_3['Sentences'].apply(lambda x : sum([any(v in a for v in special_vowels) for a in x.split()]))
df_3['vowel_density'] = df_3['num_vowels'] / df_3['word_count']

In [173]:
df_3['num_unique_words'] = df_3['Sentences'].apply(lambda x: len(set(w for w in x.split())))

In [177]:
df_3['num_repeated_words'] = df_3['Sentences'].apply(lambda x: len([w for w in collections.Counter(x.split()).values() if w > 1]))

In [179]:
df_3['words_vs_unique'] = df_3['num_unique_words'] / df_3['word_count']

In [183]:
alphabet_string = string.ascii_lowercase
alphabet_list = list(alphabet_string)

In [185]:
df_3['num_any_special_character'] = df_3['Sentences'].apply(lambda x : sum([any(not spc in sp for spc in alphabet_list) for sp in x.split()]))

In [186]:
consonants = [item for item in alphabet_list if item not in vowels]

In [188]:
consecutive_consonants = [''.join(p) for p in permutations(consonants,2)]

In [190]:
#df_3['num_consec_consonants'] = df_3['Sentences'].apply(lambda x : sum([any(c_v in a for c_v in consecutive_consonants) for a in x.split()]))

In [191]:
same_consecutive_consonants = [str(l) + str(l) for l in consonants]

In [193]:
df_3['num_double_consec_consonants'] = df_3['Sentences'].apply(lambda x : sum([any(c_c in b for c_c in same_consecutive_consonants) for b in x.split()]))

In [194]:
df_3['num_consonants'] = df_3['Sentences'].apply(lambda x : sum([any(c in aa for c in consonants) for aa in x.split()]))

In [195]:
df_3['consonant_density'] = df_3['num_consonants'] / df_3['word_count']

In [199]:
len(df_3)

1858651

In [200]:
df_4 = df_3.dropna()

In [203]:
len(df_3.dropna())

1858651

In [204]:
df_3 = df_3.drop(['vowel_density', 'words_vs_unique', 'consonant_density'], axis = 1)

In [205]:
used_df = df_3.groupby('Language_Type').mean().T

In [206]:
used_df

Language_Type,English,French,German,Hungarian,Romanian,Spanish
word_count,71.726823,71.603246,71.586164,71.63875,69.635849,71.626631
character_count,380.832341,380.209385,380.112526,379.587903,371.709434,380.433687
word_density,0.1879,0.187849,0.187893,0.188608,0.187805,0.187832
num_double_consec_vowels,1.310052,0.164084,0.428013,0.037222,0.198113,0.170674
num_consec_vowels,11.727756,19.645086,12.238664,0.755068,1.801887,10.599407
num_vowels,37.256517,44.500277,35.284596,5.715852,4.560377,39.206592
num_special_vowels,0.038221,8.781594,3.026262,4.301429,0.013208,6.975626
num_unique_words,33.773925,40.757981,33.15903,6.100698,4.707547,35.777955
num_repeated_words,3.346745,3.65372,2.218763,0.121967,0.022642,3.043442
num_any_special_character,38.046421,45.33739,35.846855,6.238285,4.732075,39.614444


In [207]:
df_4 = df_3.corr(method ='pearson')

In [208]:
df_4

Unnamed: 0,word_count,character_count,word_density,num_double_consec_vowels,num_consec_vowels,num_vowels,num_special_vowels,num_unique_words,num_repeated_words,num_any_special_character,num_double_consec_consonants,num_consonants
word_count,1.0,0.980592,0.131036,0.000559,-0.001054,-0.000681,-0.000978,-0.000591,-0.00069,-0.000623,-0.001018,-0.000665
character_count,0.980592,1.0,-0.002371,0.000489,-0.000996,-0.000602,-0.0009,-0.000514,-0.000638,-0.000548,-0.000955,-0.000592
word_density,0.131036,-0.002371,1.0,4.6e-05,-0.000604,-0.000485,-0.000592,-0.000527,0.000273,-0.000408,-0.000686,-0.000435
num_double_consec_vowels,0.000559,0.000489,4.6e-05,1.0,0.164866,0.25193,-0.185279,0.239247,0.257866,0.254152,0.18731,0.25546
num_consec_vowels,-0.001054,-0.000996,-0.000604,0.164866,1.0,0.888362,0.620745,0.882603,0.738713,0.888474,0.718354,0.886251
num_vowels,-0.000681,-0.000602,-0.000485,0.25193,0.888362,1.0,0.621474,0.989857,0.838428,0.998893,0.651577,0.999273
num_special_vowels,-0.000978,-0.0009,-0.000592,-0.185279,0.620745,0.621474,1.0,0.621682,0.490395,0.619104,0.354267,0.616442
num_unique_words,-0.000591,-0.000514,-0.000527,0.239247,0.882603,0.989857,0.621682,1.0,0.767794,0.98954,0.655117,0.989642
num_repeated_words,-0.00069,-0.000638,0.000273,0.257866,0.738713,0.838428,0.490395,0.767794,1.0,0.842471,0.529926,0.839235
num_any_special_character,-0.000623,-0.000548,-0.000408,0.254152,0.888474,0.998893,0.619104,0.98954,0.842471,1.0,0.654699,0.999165


**Train and test set**

In [213]:
#split dataset into features and target variable
feature_cols = list(df_3.columns)[2:]
X = df_3[feature_cols] # Features
y = df_3[['Language_Type']] # Target variable
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # 80% train and 20% test

**Reduce correlation among features**

In [216]:
# Standardize the data
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(X_train)
# Transform both the training set and the test set.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Make an instance of the model to retain 95% of the variance within the old features.
pca = PCA(.95)
pca.fit(X_train)
w
print('Number of Principal Components = '+str(pca.n_components_))
# Number of Principal Components = 13

X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

Number of Principal Components = 6


**Model 1: Decision Tree**

In [218]:
dt_clf = DecisionTreeClassifier() # Create Decision Tree classifer object
dt_clf = dt_clf.fit(X_train,y_train) # Fit/Train Decision Tree Classifer on training set

# Save model to file in the current working directory so that it can be imported and used.
# I use the pickle library to save the parameters of the trained model
pkl_file = "decision_tree_model.pkl"
with open(pkl_file, 'wb') as file:
    pickle.dump(dt_clf, file)

# Load previously trained model from pickle file
with open(pkl_file, 'rb') as file:
    dt_clf = pickle.load(file)

dt_clf # parameters of the Decision Tree model are shown below and can be further optimized to improve model performance

y_pred = dt_clf.predict(X_test) #Predict the response for test dataset

In [241]:
y_test

Unnamed: 0,Language_Type
1495689,French
199606,Spanish
1482952,German
1444374,German
593883,German
...,...
1534297,English
664544,German
1247132,French
1006105,French


In [222]:
accuracy_score_dt = accuracy_score(y_test, y_pred)

**Decision Tree accuracy score**

In [223]:
accuracy_score_dt

0.7733872074161154

**Model 2: Random Forest**

In [235]:
rf_clf = RandomForestClassifier(n_estimators=100) # Create Random Forest classifer object
rf_clf = rf_clf.fit(X_train,y_train) # Fit/Train Random Forest Classifer on training set

# Save model to file in the current working directory so that it can be imported and used.
pkl_file = "random_forest_model.pkl"
with open(pkl_file, 'wb') as file:
    pickle.dump(rf_clf, file)

# Load previously trained model from pickle file
with open(pkl_file, 'rb') as file:
    rf_clf = pickle.load(file)
rf_clf

y_pred_random_forest = rf_clf.predict(X_test) #Predict the response for test dataset

  rf_clf = rf_clf.fit(X_train,y_train) # Fit/Train Random Forest Classifer on training set


**Random Forest Accuracy Score**

In [237]:
accuracy_score_rf = accuracy_score(y_test, y_pred_random_forest)

In [238]:
accuracy_score_rf

0.8275688602779967

**Model 3: Gradient Boost**

In [231]:
gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=1)
gb_clf.fit(X_train, y_train)

# Save model to file in the current working directory so that it can be imported and used.
# I use the pickle library to save the parameters of the trained model
pkl_file = "gradient_boost_model.pkl"
with open(pkl_file, 'wb') as file:
    pickle.dump(gb_clf, file)

# Load previously trained model from pickle file
with open(pkl_file, 'rb') as file:
    gb_clf = pickle.load(file)

gb_clf # parameters of the Gradient Boost model are shown below

y_pred_gb = gb_clf.predict(X_test)

  return f(*args, **kwargs)


**Gradient Boost Accuracy**

In [232]:
accuracy_score_gb = accuracy_score(y_test, y_pred_gb)

In [233]:
accuracy_score_gb

0.6211857499105536

**Example no. 1**

In [283]:
text = ["come on man lets get outta here, we need to go"]

In [284]:
text_dataframe = pd.DataFrame(text, columns = ['Sentences'])

In [285]:
text_dataframe

Unnamed: 0,Sentences
0,"come on man lets get outta here, we need to go"


**Function for feature extraction**

In [287]:
def features(dataframe):
    vowels = ['a','e','i','o','u']
    special_vowels = ['á','é','í','ó','ú','ü','ö']
    same_consecutive_vowels = ['aa','ee', 'ii', 'oo', 'uu'] 
    consecutive_vowels = [''.join(p) for p in permutations(vowels,2)]
    
    dataframe['word_count'] = dataframe['Sentences'].apply(lambda x : len(x.split()))
    dataframe['character_count'] = dataframe['Sentences'].apply(lambda x : len(x.replace(" ","")))
    dataframe['word_density'] = dataframe['word_count'] / (dataframe['character_count'] + 1)
    
    dataframe['num_double_consec_vowels'] = dataframe['Sentences'].apply(lambda x : sum([any(c_v in a for c_v in same_consecutive_vowels) for a in x.split()]))
    dataframe['num_consec_vowels'] = dataframe['Sentences'].apply(lambda x : sum([any(c_v in a for c_v in consecutive_vowels) for a in x.split()]))
    dataframe['num_vowels'] = dataframe['Sentences'].apply(lambda x : sum([any(v in a for v in vowels) for a in x.split()]))
    dataframe['num_special_vowels'] = dataframe['Sentences'].apply(lambda x : sum([any(v in a for v in special_vowels) for a in x.split()]))

    dataframe['num_unique_words'] = dataframe['Sentences'].apply(lambda x: len(set(w for w in x.split())))
    dataframe['num_repeated_words'] = dataframe['Sentences'].apply(lambda x: len([w for w in collections.Counter(x.split()).values() if w > 1]))
    
    alphabet_string = string.ascii_lowercase
    alphabet_list = list(alphabet_string)
    
    dataframe['num_any_special_character'] = dataframe['Sentences'].apply(lambda x : sum([any(not spc in sp for spc in alphabet_list) for sp in x.split()]))
    
    consonants = [item for item in alphabet_list if item not in vowels]
    same_consecutive_consonants = [str(l) + str(l) for l in consonants]
    
    dataframe['num_double_consec_consonants'] = dataframe['Sentences'].apply(lambda x : sum([any(c_c in b for c_c in same_consecutive_consonants) for b in x.split()]))
    dataframe['num_consonants'] = dataframe['Sentences'].apply(lambda x : sum([any(c in aa for c in consonants) for aa in x.split()]))
    
    return dataframe

In [288]:
new_text_dataframe = features(text_dataframe)

In [300]:
new_text_dataframe

Unnamed: 0,Sentences,language,word_count,character_count,word_density,num_double_consec_vowels,num_consec_vowels,num_vowels,num_special_vowels,num_unique_words,num_repeated_words,num_any_special_character,num_double_consec_consonants,num_consonants
0,"come on man lets get outta here, we need to go",english,11,36,0.297297,1,1,11,0,11,0,11,1,11


In [298]:
new_text_dataframe = new_text_dataframe.drop(['language'], axis = 1)

In [299]:
new_text_dataframe.insert(1, 'language', 'english')

In [301]:
feature_cols_ex = list(new_text_dataframe.columns)[2:]

In [302]:
X_ex = new_text_dataframe[feature_cols_ex]

In [303]:
X_ex

Unnamed: 0,word_count,character_count,word_density,num_double_consec_vowels,num_consec_vowels,num_vowels,num_special_vowels,num_unique_words,num_repeated_words,num_any_special_character,num_double_consec_consonants,num_consonants
0,11,36,0.297297,1,1,11,0,11,0,11,1,11


In [304]:
X_ex = scaler.transform(X_ex)

In [305]:
X_ex = pca.transform(X_ex)

In [306]:
y_pred_ex = dt_clf.predict(X_ex)

**Prediction for example no. 1**

In [307]:
y_pred_ex

array(['English'], dtype=object)

**Example no. 2**

In [308]:
text_spanish = ["ola dios mio que estas haciendo"]

In [309]:
text_dataframe_es = pd.DataFrame(text_spanish, columns = ['Sentences'])

In [310]:
new_text_dataframe_es = features(text_dataframe_es)

In [311]:
new_text_dataframe_es.insert(1, 'language', 'english')

In [312]:
new_text_dataframe_es

Unnamed: 0,Sentences,language,word_count,character_count,word_density,num_double_consec_vowels,num_consec_vowels,num_vowels,num_special_vowels,num_unique_words,num_repeated_words,num_any_special_character,num_double_consec_consonants,num_consonants
0,ola dios mio que estas haciendo,english,6,26,0.222222,0,4,6,0,6,0,6,0,6


In [313]:
feature_cols_ex_es = list(new_text_dataframe_es.columns)[2:]

In [314]:
X_ex_es = new_text_dataframe_es[feature_cols_ex_es]

In [315]:
X_ex_es = scaler.transform(X_ex_es)

In [316]:
X_ex_es = pca.transform(X_ex_es)

In [319]:
y_pred_ex_es = dt_clf.predict(X_ex_es)

**Prediction for example no. 2 with Decision Tree**

In [320]:
y_pred_ex_es

array(['English'], dtype=object)

In [326]:
y_pred_random_forest_es = rf_clf.predict(X_ex_es)

**Prediction for example no. 2 with Random Forest**

In [327]:
y_pred_random_forest_es

array(['English'], dtype=object)

In [323]:
new_text_dataframe_es = new_text_dataframe_es.drop(['language'], axis = 1)

In [325]:
new_text_dataframe_es.insert(1, 'language', 'spanish')

In [328]:
y_pred_gb_es = gb_clf.predict(X_ex_es)

**Prediction for example no. 2 with Gradient Boost**

In [329]:
y_pred_gb_es

array(['Romanian'], dtype=object)