In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import warnings
warnings.simplefilter("ignore")

In [3]:
# Read the labelled corpus from disk and report its (rows, columns) shape.
data = pd.read_csv(filepath_or_buffer='dataset.csv')
print(data.shape)

(22000, 2)


In [4]:
# Preview the loaded frame; the display elides the middle rows of a long frame.
data

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch
...,...,...
21995,hors du terrain les années et sont des année...,French
21996,ใน พศ หลักจากที่เสด็จประพาสแหลมมลายู ชวา อินเ...,Thai
21997,con motivo de la celebración del septuagésimoq...,Spanish
21998,年月，當時還只有歲的她在美國出道，以mai-k名義推出首張英文《baby i like》，由...,Chinese


In [5]:
# Drop exact duplicate rows, if any.
# DataFrame.drop_duplicates() returns a new frame and leaves the caller
# untouched, so the result must be bound back to `data`; otherwise the
# "after" shape can never differ from the "before" shape.
print('Before dropping duplicates: ', data.shape)
# reset_index keeps the labels contiguous so label-based lookups such as
# data['Text'][0] keep working after rows are removed.
data = data.drop_duplicates().reset_index(drop=True)
print('After dropping duplicates: ', data.shape)

Before dropping duplicates:  (22000, 2)
After dropping duplicates:  (22000, 2)


The shape is the same before and after (22000 rows), i.e. no rows were removed.

In [6]:
# Count the missing values in each column.
data.isna().sum()

Text        0
language    0
dtype: int64

No null records.

In [7]:
# Inspect the full text of the row labelled 0.
data.loc[0, 'Text']

'klement gottwaldi surnukeha palsameeriti ning paigutati mausoleumi surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundemärke  aastal viidi ta surnukeha mausoleumist ära ja kremeeriti zlíni linn kandis aastatel – nime gottwaldov ukrainas harkivi oblastis kandis zmiivi linn aastatel – nime gotvald'

In [8]:
# Class balance: number of rows per language label.
data['language'].value_counts()

Urdu          1000
Tamil         1000
Pushto        1000
Hindi         1000
English       1000
Russian       1000
French        1000
Indonesian    1000
Latin         1000
Portugese     1000
Korean        1000
Chinese       1000
Dutch         1000
Arabic        1000
Spanish       1000
Estonian      1000
Turkish       1000
Thai          1000
Persian       1000
Romanian      1000
Swedish       1000
Japanese      1000
Name: language, dtype: int64

All languages have an equal number of records (1000 each).

In [9]:
# Number of distinct language labels.
len(data['language'].unique())

22

There are 22 unique languages in the dataset.

'language' is the dependent variable (the target) and 'Text' is the independent variable (the feature) in this dataset.

In [10]:
# Feature column (raw text) and target column (language name).
X, y = data['Text'], data['language']

'language' is a categorical variable, but ML algorithms work with numerical data, so we first convert the language labels into integer codes.

In [11]:
# Learn the label -> integer mapping, then apply it to the same labels.
le = LabelEncoder().fit(y)
encoded_lang = le.transform(y)

In [12]:
# Attach the integer codes to the frame as a new column (mutates `data` in place).
data['encoded_lang'] = encoded_lang

In [13]:
# Show the first rows, including the encoded_lang column. A bare trailing
# expression uses the notebook's rich table display instead of plain text.
data.head()

                                                Text  language  encoded_lang
0  klement gottwaldi surnukeha palsameeriti ning ...  Estonian             4
1  sebes joseph pereira thomas  på eng the jesuit...   Swedish            17
2  ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...      Thai            19
3  விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...     Tamil            18
4  de spons behoort tot het geslacht haliclona en...     Dutch             2


In [14]:
# Lower-cased copy of every document, in the original order.
# NOTE(review): CountVectorizer lower-cases its input by default, so this step
# may be redundant with the vectorizer used afterwards.
new_text = [text.lower() for text in X]


In [15]:
# First five lower-cased documents.
new_text[:5]

['klement gottwaldi surnukeha palsameeriti ning paigutati mausoleumi surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundemärke  aastal viidi ta surnukeha mausoleumist ära ja kremeeriti zlíni linn kandis aastatel – nime gottwaldov ukrainas harkivi oblastis kandis zmiivi linn aastatel – nime gotvald',
 'sebes joseph pereira thomas  på eng the jesuits and the sino-russian treaty of nerchinsk  the diary of thomas pereira bibliotheca instituti historici s i --   rome libris ',
 'ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เริ่มตั้งแต่ถนนสนามไชยถึงแม่น้ำเจ้าพระยาที่ถนนตก กรุงเทพมหานคร เป็นถนนรุ่นแรกที่ใช้เทคนิคการสร้างแบบตะวันตก ปัจจุบันผ่านพื้นที่เขตพระนคร เขตป้อมปราบศัตรูพ่าย เขตสัมพันธวงศ์ เขตบางรัก เขตสาทร และเขตบางคอแหลม',
 'விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திரிகை-விசாகப்பட்டின ஆசிரியர் சம்பத்துடன் இணைந்து விரிவுபடுத்தினார்  ஆண்டுகள் தொடர்ந்து செயலராக இருந்து தமிழ்மன்றத்தை நடத்திச் சென்றார்  கோவை செம்மொழி மாநாட்டில் "தமிழ்ச்சங்கங்களு

Getting features out of the text.

https://www.geeksforgeeks.org/using-countvectorizer-to-extracting-features-from-text/

In [16]:
# Bag-of-words features, with the vocabulary capped at 30,000 terms.
cv = CountVectorizer(max_features = 30000)
# fit_transform returns a SciPy sparse matrix, and it is kept sparse on purpose:
# a dense 22000 x 30000 int64 array needs roughly 5 GB of RAM, while
# train_test_split, both naive Bayes models and LogisticRegression all accept
# sparse input directly.
# NOTE(review): the vocabulary is learned from every row, including rows that
# later land in the test split; fit on the training rows only for a leak-free
# evaluation.
X = cv.fit_transform(new_text)

In [17]:
# Peek at the learned vocabulary instead of dumping all of it.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
_get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
list(_get_names()[:25])

['aabdel',
 'aafrika',
 'aan',
 'aandacht',
 'aangebracht',
 'aansluit',
 'aansluiting',
 'aantal',
 'aanvallen',
 'aanvankelijk',
 'aanwezig',
 'aasta',
 'aastad',
 'aastail',
 'aastaks',
 'aastal',
 'aastane',
 'aastani',
 'aastas',
 'aastast',
 'aastat',
 'aastate',
 'aastatel',
 'ab',
 'abad',
 'abaixo',
 'abandonar',
 'abbey',
 'abc',
 'abd',
 'abdde',
 'abduction',
 'abdul',
 'abdullah',
 'abdurrahman',
 'abelisaurus',
 'aberllefenni',
 'aberta',
 'abertura',
 'abhinc',
 'abia',
 'abielu',
 'abiga',
 'abikaasa',
 'abil',
 'able',
 'about',
 'above',
 'abraham',
 'abre',
 'abrigo',
 'abril',
 'abs',
 'absoluta',
 'absolutam',
 'absolvit',
 'abstract',
 'abu',
 'ac',
 'acabaram',
 'acabou',
 'acabó',
 'academia',
 'academiae',
 'academic',
 'academie',
 'academiei',
 'academy',
 'acapella',
 'acara',
 'acasă',
 'accepit',
 'acceptată',
 'accepte',
 'access',
 'accessible',
 'accident',
 'accidentare',
 'acciones',
 'acción',
 'accommodate',
 'accord',
 'according',
 'account',
 'ac

In [18]:
# (n_documents, n_features) of the vectorized corpus.
X.shape

(22000, 30000)

22000 rows and 30000 columns/features.

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Multinomial NB

In [20]:
# Multinomial naive Bayes classifier, constructed with no arguments (library defaults).
model = MultinomialNB()

In [21]:
# Train on the training split.
model.fit(X=X_train, y=y_train)

MultinomialNB()

In [22]:
# Predict a language for every held-out row.
y_pred = model.predict(X=X_test)

In [23]:
# Display the predicted labels for the held-out rows.
y_pred

array(['Latin', 'Romanian', 'Turkish', ..., 'Thai', 'Urdu', 'Swedish'],
      dtype='<U10')

Accuracy:

In [24]:
# Fraction of held-out rows whose predicted language matches the true label.
ac = accuracy_score(y_true=y_test, y_pred=y_pred)
print(ac)

0.9340909090909091


### Bernoulli NB 

In [25]:
# Bernoulli naive Bayes classifier, constructed with no arguments (library defaults).
model2 = BernoulliNB()

In [26]:
# Train on the same training split as the first model.
model2.fit(X=X_train, y=y_train)

BernoulliNB()

In [27]:
# Predict a language for every held-out row.
y_pred2 = model2.predict(X=X_test)

In [28]:
# Display the Bernoulli NB predictions for the held-out rows.
y_pred2

array(['Latin', 'Romanian', 'Turkish', ..., 'Thai', 'Urdu', 'Swedish'],
      dtype='<U10')

In [29]:
# Held-out accuracy of the Bernoulli NB model.
ac2 = accuracy_score(y_true=y_test, y_pred=y_pred2)
print(ac2)

0.9279545454545455


### Logistic Regression

In [30]:
# Logistic regression classifier, constructed with no arguments (library defaults).
# NOTE(review): verify that fitting converges with the default iteration limit;
# a ConvergenceWarning at fit time would mean max_iter needs raising.
model3 = LogisticRegression()

In [31]:
# Train on the same training split as the other models.
model3.fit(X=X_train, y=y_train)

LogisticRegression()

In [32]:
# Predict a language for every held-out row.
y_pred3 = model3.predict(X=X_test)

In [33]:
# Display the logistic regression predictions for the held-out rows.
y_pred3

array(['Latin', 'Romanian', 'Turkish', ..., 'Thai', 'Urdu', 'Swedish'],
      dtype=object)

In [34]:
# Held-out accuracy of the logistic regression model.
ac3 = accuracy_score(y_true=y_test, y_pred=y_pred3)
print(ac3)

0.9497727272727273
