In [1]:
import string
import re
import codecs
import numpy as np
import pandas as ps
import seaborn as sns #for visualisation
import matplotlib.pyplot as plt #for visualisation
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
eng_df=ps.read_csv("English.txt","utf-8",header=None,names=['English'],engine='python')
eng_df.head()


Unnamed: 0,English
0,THERE IS AN ILLUSTRATED EDITION OF THIS TITLE ...
1,[# 42671 ]
2,cover
3,Pride and Prejudice
4,By Jane Austen


In [3]:
french_df=ps.read_csv("French.txt","utf-8",header=None,names=['French'],engine='python')
french_df.head()

Unnamed: 0,French
0,"Descartes, RenÃ©"
1,"_Oeuvres de Descartes, prÃ©cÃ©dÃ©es de l'Ã©log..."
2,Thomas_
3,OEUVRES DE DESCARTES.
4,TOME PREMIER


In [4]:
ita_df=ps.read_csv("Italian.txt","utf-8",header=None,names=['Italian'],engine='python')
ita_df.head()

Unnamed: 0,Italian
0,LA DIVINA COMMEDIA
1,di Dante Alighieri
2,INFERNO
3,Inferno â€¢ Canto I
4,Nel mezzo del cammin di nostra vita


In [5]:
spanish_df=ps.read_csv("spanish.txt","utf-8",header=None,names=['Spanish'],engine='python')
spanish_df.head()


Unnamed: 0,Spanish
0,BIBLIOTECA de LA NACIÃ“N
1,EDMUNDO ABOUT
2,GERMANA
3,TRADUCCIÃ“N DE
4,T. ORTS-RAMOS


In [6]:
german_df=ps.read_csv("German.txt","utf-8",header=None,names=['German'],engine='python')
german_df.head()


Unnamed: 0,German
0,Die
1,Falkner vom Falkenhof
2,Roman von
3,Euf. v. Adlersfeld-Ballestrem
4,FÃ¼nfundzwanzigste Auflage


In [7]:
data_eng=[]
lang_eng=[]
for i,line in eng_df.iterrows():
    line=line['English']
    if len(line)!=0:
        line=line.lower()
        line=re.sub(r"\d+","",line)
        line=line.translate(dict((ord(char),None) for char in string.punctuation))
        data_eng.append(line)
        lang_eng.append("English")       

In [8]:
data_french=[]
lang_french=[]
for i,line in french_df.iterrows():
    line=line['French']
    if len(line)!=0:
        line=line.lower()
        line=re.sub(r"\d+","",line)
        line=line.translate(dict((ord(char),None) for char in string.punctuation))
        data_french.append(line)
        lang_french.append("French") 

In [9]:
data_ita=[]
lang_ita=[]
for i,line in ita_df.iterrows():
    line=line['Italian']
    if len(line)!=0:
        line=line.lower()
        line=re.sub(r"\d+","",line)
        line=line.translate(dict((ord(char),None) for char in string.punctuation))
        data_ita.append(line)
        lang_ita.append("Italian") 

In [10]:
data_spa=[]
lang_spa=[]
for i,line in spanish_df.iterrows():
    line=line['Spanish']
    if len(line)!=0:
        line=line.lower()
        line=re.sub(r"\d+","",line)
        line=line.translate(dict((ord(char),None) for char in string.punctuation))
        data_spa.append(line)
        lang_spa.append("Spanish")
       

In [11]:
data_ger=[]
lang_ger=[]
for i,line in german_df.iterrows():
    line=line['German']
    if len(line)!=0:
        line=line.lower()
        line=re.sub(r"\d+","",line)
        line=line.translate(dict((ord(char),None) for char in string.punctuation))
        data_ger.append(line)
        lang_ger.append("German")
       

In [12]:
df=ps.DataFrame({"Text":data_eng+data_ger+data_spa+data_ita+data_french,"language":lang_eng+lang_ger+lang_spa+lang_ita+lang_french})

df.shape
df

Unnamed: 0,Text,language
0,there is an illustrated edition of this title ...,English
1,,English
2,cover,English
3,pride and prejudice,English
4,by jane austen,English
...,...,...
47566,this web site includes information about proje...,French
47567,including how to make donations to the project...,French
47568,archive foundation how to help produce our new...,French
47569,subscribe to our email newsletter to hear abou...,French


In [13]:
# splitting data in train and test sets
x,y=df.iloc[:,0],df.iloc[:,1]
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(38056,)
(9515,)
(38056,)
(9515,)


In [14]:
# vectorizer using tfidf
vectorizer=feature_extraction.text.TfidfVectorizer(ngram_range=(1,3),analyzer='char')

pipe_lr_r13=pipeline.Pipeline([('vectorizer',vectorizer),('clf',linear_model.LogisticRegression(max_iter=10000000))])

#model fitting

pipe_lr_r13.fit(x_train,y_train)


Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(analyzer='char', ngram_range=(1, 3))),
                ('clf', LogisticRegression(max_iter=10000000))])

In [15]:
#model prediction
y_predicted=pipe_lr_r13.predict(x_test)
y_predicted

array(['Italian', 'French', 'English', ..., 'English', 'Italian',
       'Spanish'], dtype=object)

In [16]:
# model Evaluation
acc=(metrics.accuracy_score(y_test,y_predicted))*100
acc

97.5827640567525

In [17]:
matrix=metrics.confusion_matrix(y_test,y_predicted)
matrix

array([[2399,    8,    1,   12,   16],
       [  37, 2012,    0,   12,   11],
       [   3,    0,  800,    2,    1],
       [  48,   10,    1, 2866,   10],
       [  27,   13,    2,   16, 1208]], dtype=int64)

In [18]:
pipe_lr_r13.predict(["thusssy wiants jhatu"])

array(['English'], dtype=object)

In [23]:
import pickle
lrFile=open('LIModel.pkl','wb')
pickle.dump(pipe_lr_r13,lrFile)
lrFile.close()

In [32]:
import pandas as pd
def lang_detect(text):
    import pickle
    import string 
    import re
    import numpy as np
    translate_table=dict((ord(char),None) for char in string.punctuation)
    
    global langDetectModel
    l_file=open("LIModel.pkl","rb")
    langDetectModel=pickle.load(l_file)
    l_file.close()
    
    text=" ".join(text.split())
    text =text.lower()
    text=re.sub(r"\d+","",text)
    text=text.translate(translate_table)
    pred=langDetectModel.predict([text])
    prob=langDetectModel.predict_proba([text])
    return pred[0]

In [34]:
lang_detect("fahr zur Hölle")

'German'