## Importing libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np
import seaborn as sns
import warnings
warnings.simplefilter("ignore")

## Import the dataset

In [None]:
# NOTE(review): absolute path; looks specific to a hosted-notebook environment — confirm
# and consider making it configurable.
DATASET_PATH = "/content/Language Detection.csv"

# Load the labelled corpus and preview its first rows.
df = pd.read_csv(DATASET_PATH)
df.head(20)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
5,"[2] In ancient philosophy, natura is mostly us...",English
6,"[3][4] \nThe concept of nature as a whole, the...",English
7,During the advent of modern scientific method ...,English
8,"[5][6] With the Industrial revolution, nature ...",English
9,"However, a vitalist vision of nature, closer t...",English


## Display the number of samples per language

In [None]:
# Count how many rows carry each language label (most frequent first).
language_column = df["Language"]
language_column.value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

## Labeling

In [None]:
# Split the frame into the raw sentences (features) and their language names (targets).
X, y = df["Text"], df["Language"]

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Encode straight from the source column rather than from `y` itself: with
# `y = le.fit_transform(y)` a second run of this cell would refit the encoder on
# the already-encoded integers, and `le.inverse_transform` would then stop
# returning language names.
y = le.fit_transform(df["Language"])

## Remove unwanted characters

In [None]:
def _clean_text(text):
    """Return `text` lower-cased with symbols, digits, newlines and brackets blanked out.

    Each unwanted character is replaced by a single space so that the words on
    either side of it stay separated.
    """
    # `\n` (newline) is what must be removed here; a bare `n` inside the character
    # class would strip every letter "n" from the corpus.
    text = re.sub(r'[!@#$(),\n"%^*?:;~`0-9]', ' ', text)
    # Escaped brackets match "[" and "]" individually; the unescaped form `[[]]`
    # only matches the literal two-character sequence "[]".
    text = re.sub(r'[\[\]]', ' ', text)
    return text.lower()


# Cleaned sentences, in the same order as X.
data = [_clean_text(text) for text in X]

## Feature extraction 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# Keep the bag-of-words counts as the sparse matrix returned by fit_transform.
# Densifying a matrix of roughly 10k rows x 35k columns needs gigabytes of RAM,
# while train_test_split and MultinomialNB both accept sparse input directly.
X = cv.fit_transform(data)
X.shape  # (n_samples, vocabulary_size)

(10337, 34937)

## Model training

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation. The seed is fixed so the split,
# and therefore the reported accuracy, is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(x_train, y_train)
# Echo the fitted estimator as the cell output.
model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Predict encoded language labels for the held-out test samples.
y_pred = model.predict(x_test)

## Model evaluation

In [None]:
from sklearn.metrics import accuracy_score
# Compare the predictions against the true labels of the held-out samples.
ac = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy is :", ac)

Accuracy is : 0.9729206963249516


## Language prediction method

In [None]:
def predictLanguage(text):
    """Print the language predicted for a single piece of text.

    The text is vectorized with the fitted ``cv``, classified by ``model``, and
    the predicted label is decoded back through ``le``. The decoded label is
    printed; nothing is returned.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = cv.transform([text]).toarray()
    encoded_label = model.predict(features)
    decoded_label = le.inverse_transform(encoded_label)
    print(decoded_label[0])

In [None]:
# Smoke test with an English sentence.
# NOTE(review): predictLanguage prints only the decoded label, so the recorded
# output below this cell appears to come from an earlier version — re-run to refresh.
predictLanguage("I'm looking for a train from Paris to Marseille")

The langauge is in English


In [None]:
# Smoke test with the French translation of the same sentence.
predictLanguage("Je cherche un train depuis Paris jusqu'à Marseille")

The langauge is in French
