# Build Language Detection Model

## Importing Libraries

In [1]:
# Importing libraries

import string
import re
import codecs
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import feature_extraction, linear_model, pipeline, metrics
from sklearn.model_selection import train_test_split

## Loading dataset

In [2]:
# Download the Language Detection dataset through kagglehub.
# `path` holds whatever dataset_download returns; later cells use it as the
# directory prefix of the CSV file.
import kagglehub

dataset_handle = "basilb2s/language-detection"
path = kagglehub.dataset_download(dataset_handle)

In [3]:
# import os
# os.listdir(f"{path}/")

In [4]:
# Read the downloaded CSV into a DataFrame and preview its first two rows.
csv_file = f"{path}/Language Detection.csv"
df = pd.read_csv(csv_file)
df.head(n=2)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English


In [5]:
# List the distinct language labels present in the dataset.
df.loc[:, "Language"].unique()

array(['English', 'Malayalam', 'Hindi', 'Tamil', 'Portugeese', 'French',
       'Dutch', 'Spanish', 'Greek', 'Russian', 'Danish', 'Italian',
       'Turkish', 'Sweedish', 'Arabic', 'German', 'Kannada'], dtype=object)

In [6]:
# Single-language subsets, selected with a boolean mask on the label column.
language_col = df["Language"]

eng_df = df.loc[language_col == "English"]    # English rows only
hindi_df = df.loc[language_col == "Hindi"]    # Hindi rows only

In [7]:
# Preview the Hindi-only subset.
hindi_df

Unnamed: 0,Text,Language
1979,विकि-शब्दकोष (एक मुक्त शब्दकोष एवं समानांतर को...,Hindi
1980,"[42] अंत में, विकिपीडिया एक पक्ष नहीं लेता है।...",Hindi
1981,बोट्स नामक कंप्यूटर प्रोग्राम के निर्माण के बा...,Hindi
1982,"""""नहीं, हम नहीं जानते"", जिमी ने कहा.",Hindi
1983,[60] कुछ आलोचकों का दावा है कि विकिपीडिया की ख...,Hindi
...,...,...
2037,मैं सोच रहा था कि क्या यो मेरी मदद कर सकता है।...,Hindi
2038,कैसे कुछ आइसक्रीम के बारे में?,Hindi
2039,क्या आप मुझे एक सवारी घर देना चाहेंगे?,Hindi
2040,"जी बोलिये। यह बहुत अच्छा होगा, धन्यवाद। मैं ठी...",Hindi


## Data Pre-Processing

In [8]:
# Show every ASCII punctuation character, each followed by a single space.
print(*string.punctuation, end=" ")

# Translation table for str.translate: every punctuation code point maps to
# None, so translating a string with this table deletes those characters.
#
# Equivalent spelling: str.maketrans("", "", string.punctuation), where the
# first two arguments describe character replacements (none here) and the
# third lists the characters to delete.
trans_table = dict.fromkeys(map(ord, string.punctuation))


! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ 

In [25]:
# Display the translation table: each punctuation code point maps to None (i.e. is deleted).
trans_table

{33: None,
 34: None,
 35: None,
 36: None,
 37: None,
 38: None,
 39: None,
 40: None,
 41: None,
 42: None,
 43: None,
 44: None,
 45: None,
 46: None,
 47: None,
 58: None,
 59: None,
 60: None,
 61: None,
 62: None,
 63: None,
 64: None,
 91: None,
 92: None,
 93: None,
 94: None,
 95: None,
 96: None,
 123: None,
 124: None,
 125: None,
 126: None}

In [9]:
# Normalise every text in one vectorised pass, in this order:
#   1. lowercase,
#   2. delete ASCII punctuation via `trans_table`,
#   3. delete runs of digits.
# Column-wise .str methods are used instead of a Python loop that writes
# df.loc[i, "Text"] row by row: that pattern treats enumerate()'s positional
# counter as an index *label* (only correct for a default 0..n-1 index) and is
# slow because every assignment is a separate scalar write.
df["Text"] = (
    df["Text"]
    .str.lower()
    .str.translate(trans_table)
    .str.replace(r"\d+", "", regex=True)
)

In [10]:
# Sanity-check the cleaned frame: print its shape, then preview two rows.
# The preview has to be the last expression of the cell; a bare expression
# anywhere else is evaluated and silently discarded by the notebook.
print(df.shape)
df.head(2)

(10337, 2)


In [11]:
# Features are the cleaned texts, targets are the language labels.
X = df["Text"]
y = df["Language"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Model Building

In [12]:
# Two-step pipeline: character-level TF-IDF over 1- to 3-grams, followed by a
# logistic-regression classifier with default settings.
vectorizer = feature_extraction.text.TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 3),
)
classifier = linear_model.LogisticRegression()

pipe_lr_r13 = pipeline.Pipeline(
    steps=[
        ("vectorizer", vectorizer),
        ("clf", classifier),
    ]
)

In [13]:
# Model fitting: train the vectorizer + classifier pipeline on the training split.
pipe_lr_r13.fit(X_train, y_train)

In [14]:
# Model prediction: predict a language label for every text in the test split.
y_pred = pipe_lr_r13.predict(X_test)

In [15]:
# Score the test-split predictions: overall accuracy and the confusion matrix.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix: \n{conf_matrix}")

Accuracy: 0.9835589941972921
Confusion Matrix: 
[[105   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1]
 [  0  71   0   0   0   0   0   0   0   0   0   0   0   0   2   0   0]
 [  0   0 108   3   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   1 290   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   2 214   0   0   0   1   0   0   0   0   0   0   0   2]
 [  0   0   1   1   0  89   0   0   0   0   0   0   0   0   1   0   1]
 [  0   0   0   0   0   0  68   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  10   0   0   0   0   0   0   0   0   0]
 [  0   1   0   1   0   0   0   0 141   0   0   1   0   0   0   0   1]
 [  0   0   0   0   0   0   0   0   0  66   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0 120   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   1   0   0 141   0   1   0   0   1]
 [  0   0   0   0   0   0   0   0   0   0   0   0 136   0   0   0   0]
 [  0   0   0   0   0   0   0

## Saving the model

In [17]:
# Persist the fitted pipeline so that it can be used by different consumers.
import pickle

# The `with` block closes the file when it exits, so no explicit close() call
# is needed afterwards.
with open("lrModel.pkl", "wb") as lrFile:
    pickle.dump(pipe_lr_r13, lrFile)


## Loading the Model

In [18]:
# Reload the pickled pipeline from disk into `lrModel`.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# files that you created yourself or otherwise trust.
# The `with` block closes the file on exit, and a `global` declaration is not
# needed for an assignment made at the top level of a cell.
with open("lrModel.pkl", "rb") as lrFile:
    lrModel = pickle.load(lrFile)

In [24]:
# The model was trained on cleaned text, so inference input has to go through
# the same steps, in the same order, before it reaches the pipeline:
# lowercase -> delete ASCII punctuation (trans_table) -> delete digit runs.
# Vectorisation is not repeated here because it is a step of the pipeline.
raw_text = "नहीं, हम नहीं जानते, जिमी ने कहा.	"
clean_text = re.sub(r"\d+", "", raw_text.lower().translate(trans_table))

pred = lrModel.predict([clean_text])
print(pred)

['Hindi']


In [20]:
# Inspect the steps of the loaded pipeline.
# NOTE: this only lists the configured steps; it does not by itself verify that they are fitted.
print("Pipeline Steps:", lrModel.steps)

Pipeline Steps: [('vectorizer', TfidfVectorizer(analyzer='char', ngram_range=(1, 3))), ('clf', LogisticRegression())]
