In [None]:
import os
from google.colab import drive

In [None]:
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
os.chdir("/content/drive/My Drive/Observation")

In [1]:
# import important modules
import numpy as np
import pandas as pd
# sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB # classifier 
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    plot_confusion_matrix,
)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# text preprocessing modules
from string import punctuation 
# text preprocessing modules
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re #regular expression
# Download dependency
for dependency in (
    "brown",
    "names",
    "punkt",
    "wordnet",
    "stopwords",
    "averaged_perceptron_tagger",
    "universal_tagset",
):
    nltk.download(dependency)
    
import warnings
warnings.filterwarnings("ignore")
# seeding
np.random.seed(123)


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\yveri\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\yveri\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yveri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yveri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yveri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\yveri\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Dow

In [4]:
cwd = os.getcwd() 
cwd + '\\dataset.csv'

'c:\\Users\\yveri\\Documents\\ML_FastApi_Docker\\models\\dataset.csv'

In [2]:
df = pd.read_csv("dataset.csv", sep='\t', header=0, index_col=None)
df.head()

Unnamed: 0,text,label
0,Salut !\n\nAlors effectivement c’est un pavé m...,ados
1,Bonjour\nJe ne suis pas maman solo mais qui su...,adult
2,"Ho mon dieu il met des "" ^^ "" D: Je te souhait...",ados
3,"Ouais, je comprends...",ados
4,Folle de toi...,ados


In [None]:
stop_words =  stopwords.words('french')
def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    # Clean the text, with the option to remove stop_words and to lemmatize word
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"\'s", " ", text)
    text =  re.sub(r'http\S+',' link ', text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) # remove numbers
        
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if lemmatize_words:
        text = text.split()
        lemmatizer = WordNetLemmatizer() 
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)
    
    # Return a list of words
    return(text)

In [None]:
df["text_clean"] = df["text"].apply(text_cleaning)
df.head()

Unnamed: 0,text,label,text_clean
0,Salut !\n\nAlors effectivement c’est un pavé m...,ados,Salut Alors effectivement pav pense besoin De ...
1,Bonjour\nJe ne suis pas maman solo mais qui su...,adult,Bonjour Je maman solo juger Tu inquiettes fill...
2,"Ho mon dieu il met des "" ^^ "" D: Je te souhait...",ados,Ho dieu met D Je souhaite br ler enfer H R TIQ...
3,"Ouais, je comprends...",ados,Ouais comprends
4,Folle de toi...,ados,Folle


In [None]:
#split features and target from  data 
X = df["text_clean"]
y = df['label'].apply(lambda x: "0" if x== "ados" else 1).values
y = np.array(y,dtype=int)

In [None]:
# split data into train and validate
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# Create a classifier in pipeline
age_classifier = Pipeline(steps=[
                               ('pre_processing',TfidfVectorizer(lowercase=False)),
                                 ('naive_bayes',MultinomialNB())
                                 ])

In [None]:
# train the sentiment classifier 
age_classifier.fit(X_train,y_train)

Pipeline(steps=[('pre_processing', TfidfVectorizer(lowercase=False)),
                ('naive_bayes', MultinomialNB())])

In [None]:
# test model performance on valid data 
y_preds = age_classifier.predict(X_valid)

In [None]:
accuracy_score(y_valid,y_preds)

0.856

In [None]:
#save model 
import joblib 
joblib.dump(age_classifier, 'age_classifier_model_pipeline.pkl')

['age_classifier_model_pipeline.pkl']