## Text Classification Model (adapted from a language-detection tutorial)

Reference tutorial: [How to deploy ML models on AWS ECS using Docker and FastAPI](https://medium.com/@naman884/how-to-deploy-ml-models-on-aws-ecs-using-docker-and-fastapi-9acdd3619348)

### Importing Packages

In [2]:
import re
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
# Stop-word removal: fetch the NLTK stop-word corpus and keep the English list as a set
from nltk.corpus import stopwords  # corpus reader exposing the per-language stop-word lists
nltk.download('stopwords')  # fetches the corpus into the local nltk_data directory (see the log output below)
all_stopwords = set(stopwords.words('english'))  # English stop words; a set so membership checks in preprocess_text are fast
from typing import List

import warnings
warnings.simplefilter("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rawal.an/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Model Building

We build the model in a simple, quick way because the goal of this notebook is to understand how to deploy a model on AWS ECS, not to maximize model quality.

In [7]:
# Read the abstracts CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer="abstracts_paraphrased_60.csv")

In [9]:
# Expose the abstract body under the generic column name `text` used by the rest of the notebook
data = data.rename({'abstract': 'text'}, axis='columns')

In [10]:
# Preview the first rows to check the columns (title, text, label) after the rename
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,Genomic stress and impaired DNA repair in Alzh...,Alzheimer disease (AD) is the most prominent f...,0
1,1,Exploring Psychosis in Neurodegenerative Demen...,The following commentary discusses a review by...,0
2,2,Examining a Preclinical Alzheimer's Cognitive ...,The preclinical Alzheimer's cognitive composit...,0
3,3,Gene Association Study of the Urokinase Plasmi...,The role of the innate immune system has long ...,0
4,4,Psychosis in Neurodegenerative Dementias: A Sy...,"Psychosis, characterized by delusions and/or h...",0


In [11]:
def preprocess_text(text: str, stop_words=None) -> str:
    """Normalize a raw document into a space-separated string of tokens.

    Processing steps, in order:
      1. "<br /><br />" markers, periods and newlines are replaced by spaces.
      2. The text is lowercased; any "<br /><br />" marker that only appears
         after lowercasing (e.g. "<BR /><BR />") is treated as a separator.
      3. The text is split into tokens on any run of whitespace.
      4. Non-word characters are stripped from inside each token.
      5. Stop words and tokens shorter than two characters are dropped.

    Note that stop words are compared *after* punctuation stripping, so a
    token such as "don't" is looked up as "dont".

    Args:
        text: Raw document text.
        stop_words: Optional collection of lowercase tokens to discard.
            Defaults to the notebook-level ``all_stopwords`` set.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    if stop_words is None:
        stop_words = all_stopwords

    # Turn sentence/line separators into plain spaces, then lowercase once
    cleaned_text = re.sub(r"<br /><br />|\.", " ", text)
    cleaned_text = cleaned_text.replace("\n", " ").lower()

    # Lowercasing can expose upper-case break markers; split on those too
    segments = re.split(r"<br /><br />", cleaned_text)

    # split() with no argument breaks on ANY whitespace (tabs, "\r", repeated
    # spaces). Splitting on " " alone left e.g. "cat\tis" as one token, which
    # the \W+ stripping below then fused into "catis".
    tokens = [word for segment in segments for word in segment.split()]

    # Strip punctuation and other non-word characters from inside each token
    normalized_tokens = (re.sub(r"\W+", "", token) for token in tokens)

    return " ".join(
        token
        for token in normalized_tokens
        if len(token) > 1 and token not in stop_words
    )


In [12]:
# Clean every document; the `text` column is overwritten with its normalized form
data["text"] = data["text"].map(preprocess_text)

In [16]:
# Features are the cleaned documents, targets are the label column
X, y = data["text"], data["label"]

In [17]:
# Learn the label vocabulary, then map the labels onto integer class indices
le = LabelEncoder().fit(y)
y = le.transform(y)

In [18]:
# Inspect the distinct label values the encoder learned
le.classes_

array([0, 1])

In [19]:
# Turn the cleaned documents into a TF-IDF matrix; X is rebound from text to that matrix.
# NOTE(review): the vectorizer is fitted on every document, including those later
# held out in each cross-validation fold, so the CV scores below are slightly optimistic.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(raw_documents=X)

<!-- model evaluation -->

In [20]:
# Fix the seed: SGDClassifier shuffles the training data between epochs, so
# without random_state each fit (and therefore each reported score) can differ
# from run to run.
sgd_classifier = SGDClassifier(random_state=42)
sgd_classifier_fit = sgd_classifier.fit(X, y)

In [21]:
# Evaluation setup: number of cross-validation folds and the estimator to score
k_fold = 5
model = sgd_classifier

In [22]:
# Out-of-fold prediction for every document (each one is predicted by a model that did not train on it)
predictions = cross_val_predict(estimator=model, X=X, y=y, cv=k_fold)

In [23]:
# Confusion matrix and per-class report are built from the out-of-fold predictions.
# NOTE(review): cm and cr are computed here but not displayed in any cell shown in this notebook.
cm = confusion_matrix(y_true=y, y_pred=predictions)
cr = classification_report(y_true=y, y_pred=predictions)

# Mean accuracy over the folds, from a separate cross-validation run
ac = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=k_fold).mean()

In [24]:
# Scale to percent BEFORE rounding: multiplying an already-rounded float by 100
# can reintroduce binary floating-point noise in the printed value.
print("Accuracy =", round(ac * 100, 1), '%')

Accuracy = 95.0 %


<!-- save model -->

In [25]:
# Rebind X to the cleaned text column: X currently holds the TF-IDF matrix,
# while the pipeline built below contains its own vectorizer and expects text.
X = data["text"]

In [26]:
# Second, lighter cleaning pass: blank out digits, punctuation and symbol
# characters, then lowercase each document.
# NOTE(review): `data_list` is not read by any later cell visible in this
# notebook (the pipeline below is fitted on `X`) — confirm whether it is still needed.
data_list = []

for text in X:
    text = re.sub(r'[!@#$(),\n"%^*?\:;~`0-9]', ' ', text)
    # r'[[]]' is parsed as the character class "[" followed by a literal "]",
    # so it only matched the two-character sequence "[]". Escape both brackets
    # so that every "[" and "]" is replaced individually.
    text = re.sub(r'[\[\]]', ' ', text)
    text = text.lower()
    data_list.append(text)

In [27]:
# Bundle vectorizer and classifier so one object maps text straight to a
# prediction; fitting the pipeline refits both steps on the text in X.
pipe = Pipeline(
    steps=[
        ('vectorizer', tfidf_vectorizer),
        ('sgd', sgd_classifier),
    ]
)
pipe.fit(X, y)

Pipeline(steps=[('vectorizer', TfidfVectorizer()), ('sgd', SGDClassifier())])

In [28]:
# Predict on the training documents. The result gets its own name so the
# encoded ground-truth labels in `y` are not overwritten by model output.
train_predictions = pipe.predict(X)

In [30]:
# `ac` is the cross-validated accuracy computed earlier in the notebook; it is
# not recomputed from the fitted pipeline here.
# Scale to percent BEFORE rounding: multiplying an already-rounded float by 100
# can reintroduce binary floating-point noise in the printed value.
print("Accuracy =", round(ac * 100, 1), '%')

Accuracy = 95.0 %


### Saving the ML Model (Serialization) - Important

In [None]:
# Serialize the fitted pipeline (vectorizer + classifier) to disk
with open('svm_model.pickle', 'wb') as model_file:
    pickle.dump(pipe, model_file)

### Model Inference Using the Serialized File

In [143]:
# Load the pipeline from the file written by the serialization cell above
# ('svm_model.pickle'); the previous path pointed at a file this notebook never
# creates. The context manager closes the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code — only load files you created or trust.
with open('svm_model.pickle', 'rb') as model_file:
    model_pk = pickle.load(model_file)

In [144]:
text = "this is a message to test"

# The training text was cleaned with preprocess_text before the pipeline was
# fitted, so apply the same cleaning here to keep inference input consistent
# with what the model saw during training.
detect = model_pk.predict([preprocess_text(text)])

<!-- Please make sure the output is exactly the same format as array([0]) or array([1]) -->

In [145]:
# Display the raw prediction array returned by the loaded pipeline
detect

array([0])

In [146]:
# Map the predicted class index back to its original label via the encoder's
# classes, shown next to the raw prediction array
le.classes_[detect[0]], detect

(0, array([0]))