In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split


In [2]:

data = pd.read_csv('../input/language-detection/Language Detection.csv')
data.describe()

Unnamed: 0,Text,Language
count,10337,10337
unique,10267,17
top,Jag är ledsen.,English
freq,3,1385


In [3]:
x_train, x_test, y_train, y_test = train_test_split(data['Text'], data['Language'], test_size=0.5)

In [4]:
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,3), use_idf=False)

In [5]:
clf = Pipeline([
        ('vec', vectorizer),
        ('clf', Perceptron())
               ])

In [6]:
clf.fit(x_train, y_train)

Pipeline(steps=[('vec',
                 TfidfVectorizer(analyzer='char', ngram_range=(1, 3),
                                 use_idf=False)),
                ('clf', Perceptron())])

In [7]:
predictions = clf.predict(x_test)

In [8]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       265
      Danish       0.97      0.92      0.94       215
       Dutch       0.96      0.97      0.96       259
     English       0.99      0.98      0.99       678
      French       0.99      0.97      0.98       529
      German       0.96      0.96      0.96       246
       Greek       0.96      1.00      0.98       186
       Hindi       0.91      1.00      0.96        32
     Italian       0.95      0.97      0.96       341
     Kannada       1.00      1.00      1.00       175
   Malayalam       0.96      1.00      0.98       299
  Portugeese       0.98      0.91      0.94       384
     Russian       1.00      1.00      1.00       350
     Spanish       0.92      0.97      0.95       404
    Sweedish       0.96      0.98      0.97       348
       Tamil       1.00      1.00      1.00       230
     Turkish       1.00      0.97      0.98       228

    accuracy              

In [9]:
sentences = [
    'यह भाषा को पहचानने की परीक्षा है।',
    'Dies ist eine Demo'
]

predict = clf.predict(sentences)

result = pd.DataFrame({'Sentence' : sentences, 'Language' : predict})
result
    

Unnamed: 0,Sentence,Language
0,यह भाषा को पहचानने की परीक्षा है।,Hindi
1,Dies ist eine Demo,German
