In [1]:
# Importation des librairies nécessaires
import pandas as pd
import numpy as np
import re
import pickle


# Importation de Scikit-learn pour les modèles et métriques
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Importation des modules de traitement de texte NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer


# Définition du chemin des données
path_data = '/Users/chretien/OpenClassroom/Openclassroom7/'


# 1. Import Data 

In [2]:
df = pd.read_csv("training.1600000.processed.noemoticon.csv", sep=',', encoding='ISO-8859-1', header=None,names=['target', 'id', 'date', 'flag', 'user', 'text'])
df.head(5)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


# 2 clean 

In [3]:
# Garder uniquement colonnes target et text 
df = df[['target', 'text']]

# Remplacer target 4 par 1
df["target"] = df["target"].replace(4, 1)

# Sample
df_sample = df.groupby('target', as_index=False).apply(lambda x : x.sample(frac=0.001))

In [4]:
# Tokenizer

def tokenizer_fct(sentence) :
    word_tokens = word_tokenize(sentence)
    # print(word_tokens)
    return word_tokens


# Tokenizer split

def tokenizer_split_fct(sentence) :
    word_tokens = sentence.split(' ')
    # print(word_tokens)
    return word_tokens

# Stop words
from nltk.corpus import stopwords
stop_w = list(set(stopwords.words('english')))

def stop_word_filter_fct(list_words) :
    filtered_w = [w for w in list_words if not w in stop_w]
    # print(filtered_w)    
    return filtered_w

# lower case et alpha (not "@")
def lower_alpha_fct(list_words) :
    fw = [w.lower() for w in list_words if w.isalpha()]
    # print(fw)
    return fw

# lower case et alpha (not "@")
def lower_not_user_fct(list_words) :
    fw = [w.lower() for w in list_words if not w.startswith("@")]
    # print(fw
    return fw




#------------------------------Lemmatizer-----------------------------------


def lemma_fct(list_words) :
    lemmatizer = WordNetLemmatizer()
    lem_w = [lemmatizer.lemmatize(w) for w in list_words]
    return lem_w
    
#------------------------------Stemming-----------------------------------


def stemma_fct(list_words) :
    stemming = PorterStemmer()
    stemma_w = [stemming.stem(w) for w in list_words]
    return stemma_w


#-------------------# Fonction de préparation des tweets----------------------------


# Fonction de préparation des questions
def transform_text(text) :
    word_tokens = tokenizer_split_fct(text)
    f_w = stop_word_filter_fct(word_tokens)
    lw = lower_not_user_fct(f_w)
    #lem_w = lemma_fct(lw)
    filtered_w = stop_word_filter_fct(lw)
    # print(filtered_w)
    trans_sentence = ' '.join(filtered_w)
    
    return trans_sentence


# Fonction de préparation des questions
def transform_text_lem(text) :
    word_tokens = tokenizer_split_fct(text)
    f_w = stop_word_filter_fct(word_tokens)
    lw = lower_not_user_fct(f_w)
    lem_w = lemma_fct(lw)
    filtered_w = stop_word_filter_fct(lem_w)
    # print(filtered_w)
    trans_sentence = ' '.join(filtered_w)
    
    return trans_sentence


# Fonction de préparation des questions
def transform_text_stemma(text) :
    word_tokens = tokenizer_split_fct(text)
    f_w = stop_word_filter_fct(word_tokens)
    lw = lower_not_user_fct(f_w)
    stemma_w = stemma_fct(lw)
    filtered_w = stop_word_filter_fct(stemma_w)
    # print(filtered_w)
    trans_sentence = ' '.join(filtered_w)
    
    return trans_sentence



In [5]:
# afficher DataFrame clean 

tweets = pd.DataFrame()
tweets['target'] = df_sample['target']
tweets['text'] = df_sample['text'].apply(lambda x : transform_text(x))
tweets['text_lemma'] = df_sample['text'].apply(lambda x : transform_text_lem(x))
tweets['text_stemm'] = df_sample['text'].apply(lambda x : transform_text_stemma(x))
tweets

Unnamed: 0,Unnamed: 1,target,text,text_lemma,text_stemm
0,359722,0,neo_kryptik get-together = ?... yes &quot;when...,neo_kryptik get-together = ?... yes &quot;when...,neo_kryptik get-togeth = ?... ye &quot;when n ...
0,230049,0,half? poo. tomorrow shit-hits-the-fan monday,half? poo. tomorrow shit-hits-the-fan monday,half? poo. tomorrow shit-hits-the-fan monday
0,580559,0,thats libra thing? lol thought violent. miss g...,thats libra thing? lol thought violent. miss g...,libra thing? lol thought violent. miss girl
0,766572,0,suck real world right now!,suck real world right now!,suck real world right now!
0,74127,0,wondering today!? shame weather,wondering today!? shame weather,wonder today!? shame weather
...,...,...,...,...,...
1,1073638,1,hey you! vote luu,hey you! vote luu,hey you! vote luu
1,1538753,1,yes are! (take live) tour seems fab well can'...,yes are! (take live) tour seems fab well can'...,ye are! (take live) tour seem fab well can't ...
1,1332997,1,count lucky! mine doesn't. (they added lat...,count lucky! mine doesn't. (they added lat...,count lucky! mine doesn't. (they ad later ...
1,1381741,1,"'n runnin, 2day's grannny's b-day! cool god ke...","'n runnin, 2day's grannny's b-day! cool god ke...","'n runnin, 2day' grannny' b-day! cool god kept..."


# 3. Split 

In [6]:
# Split Train test 
train0, test = train_test_split(tweets, test_size=0.2, random_state=42, shuffle=True)

# Split Train Val 
train, val = train_test_split(train0, test_size=0.25, random_state=42, shuffle=True)

# SPlit target 
y_train = train['target']
y_val = val['target']
y_test = test['target']

# 4. TF_IDF 

In [7]:
import mlflow
import mlflow.sklearn
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Configurez l'URI de suivi
mlflow.set_tracking_uri("http://localhost:5003")

# Créer ou définir une expérience
experiment_name = "Logistic_Regression_Experiment"
mlflow.set_experiment(experiment_name)

# Fonction pour entraîner et évaluer le modèle
def train_model_logistic_regression(train, val):
    # Vectorisation des tweets
    Vec = TfidfVectorizer()
    
    # Ajuster et transformer les données d'entraînement
    X_train_vec = Vec.fit_transform(train['text']) 
    X_val_vec = Vec.transform(val['text'])

    # Démarrer un run avec MLflow
    with mlflow.start_run(run_name="Logistic_Regression_TF_IDF"):
        
        # Modèle
        model = LogisticRegression()
        model.fit(X_train_vec, train['target'])
        
        # Prédiction et évaluation du modèle sur l'ensemble de validation
        y_val_pred = model.predict(X_val_vec)

        # Calcul des métriques
        accuracy_TF_IDF = accuracy_score(val['target'], y_val_pred)
        auc_score_TF_IDF = roc_auc_score(val['target'], y_val_pred)

        # Loguer les métriques dans MLflow
        mlflow.log_metric("Accuracy", accuracy_TF_IDF)
        mlflow.log_metric("AUC", auc_score_TF_IDF)

        # Sauvegarder le modèle et le vectoriseur
        model_path = 'logistic_regression_model.pkl'
        vectorizer_path = 'tfidf_vectorizer.pkl'

        with open(model_path, 'wb') as f_model:
            pickle.dump(model, f_model)
        
        with open(vectorizer_path, 'wb') as f_vectorizer:
            pickle.dump(Vec, f_vectorizer)

        # Enregistrer le modèle avec MLflow
        mlflow.log_artifact(model_path)
        mlflow.log_artifact(vectorizer_path)

# Exécuter la fonction pour entraîner le modèle
train_model_logistic_regression(train, val)
