# Notebook Approche Non Supervisée

In [1]:
import mlflow
import mlflow.sklearn
import pandas as pd
import spacy
from sklearn.metrics import accuracy_score

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_params):
    with mlflow.start_run():
        # Paramètres du modèle
        for key, value in model_params.items():
            mlflow.log_param(key, value)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(model, "model")


def train_and_evaluate(X, y):
    mlflow.start_run()
    param_max_depth = 10
    param_n_estimators = 100

    # Enregistrez les paramètres
    mlflow.log_param("max_depth", param_max_depth)
    mlflow.log_param("n_estimators", param_n_estimators)

    model = RandomForestClassifier(max_depth=param_max_depth, n_estimators=param_n_estimators)
    model.fit(X, y)
    y_pred = model.predict(X)

    accuracy = accuracy_score(y, y_pred)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.sklearn.log_model(model, "model")
    mlflow.end_run()


In [2]:
from sklearn.model_selection import train_test_split

data = pd.read_csv("dataset.csv")
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

X_train = train_data[['title', 'body']]
y_train = train_data['tags']

X_test = test_data[['title', 'body']]
y_test = test_data['tags']

## Lemmatisation Title et Body

In [3]:
nlp = spacy.load("en_core_web_sm")

X_train['title_lemmatized'] = X_train['title'].apply(lambda text: ' '.join([token.lemma_ for token in nlp(text)]))

X_train['body_lemmatized'] = X_train['body'].apply(lambda text: ' '.join([token.lemma_ for token in nlp(text)]))

X_test['title_lemmatized'] = X_test['title'].apply(lambda text: ' '.join([token.lemma_ for token in nlp(text)]))

X_test['body_lemmatized'] = X_test['body'].apply(lambda text: ' '.join([token.lemma_ for token in nlp(text)]))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['title_lemmatized'] = X_train['title'].apply(lambda text: ' '.join([token.lemma_ for token in nlp(text)]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['title_lemmatized'] = X_test['title'].apply(lambda text: ' '.join([token.lemma_ for token in nlp(text)]))


## CountVectorizer

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_title = CountVectorizer()
X_title_train = vectorizer_title.fit_transform(X_train['title_lemmatized'])
X_title_test = vectorizer_title.transform(X_test['title_lemmatized'])

vectorizer_body = CountVectorizer()
X_body_train = vectorizer_body.fit_transform(X_train['body_lemmatized'])
X_body_test = vectorizer_body.transform(X_test['body_lemmatized'])

feature_names_title = vectorizer_title.get_feature_names_out()
word_lists_title = [feature_names_title[idx].split() for idx in X_title_train.nonzero()[1]]

print(X_title_train)

  (0, 245)	2
  (0, 316)	1
  (0, 33)	1
  (0, 170)	1
  (0, 203)	1
  (0, 27)	1
  (0, 256)	1
  (0, 200)	1
  (0, 5)	2
  (0, 291)	1
  (0, 347)	1
  (0, 138)	1
  (0, 118)	1
  (0, 60)	1
  (0, 146)	1
  (0, 241)	1
  (1, 118)	1
  (1, 145)	1
  (1, 188)	1
  (1, 68)	1
  (1, 156)	2
  (1, 215)	1
  (1, 13)	1
  (1, 57)	1
  (1, 207)	1
  :	:
  (163, 219)	1
  (163, 155)	1
  (164, 146)	1
  (164, 12)	1
  (164, 142)	1
  (164, 89)	1
  (164, 132)	1
  (164, 234)	1
  (164, 20)	1
  (164, 301)	1
  (164, 111)	1
  (164, 246)	1
  (164, 240)	1
  (165, 208)	1
  (165, 302)	2
  (165, 40)	1
  (165, 206)	1
  (165, 24)	1
  (165, 305)	1
  (165, 128)	1
  (166, 309)	1
  (166, 142)	1
  (166, 88)	1
  (166, 338)	1
  (166, 43)	1


from scipy.sparse import vstack

# Concaténer title et body
X_combined_train = vstack([X_title_train, X_body_train])
X_combined_test = vstack([X_title_test, X_body_test])


In [5]:
print(X_title_train)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model_params = {'n_estimators': 100, 'max_depth': 10}
X_train, X_test, y_train, y_test = train_test_split()
train_and_evaluate_model(model, X_title_train, y_train, X_test, y_test, model_params)

#train_and_evaluate(X_title_train, y_train)


  (0, 245)	2
  (0, 316)	1
  (0, 33)	1
  (0, 170)	1
  (0, 203)	1
  (0, 27)	1
  (0, 256)	1
  (0, 200)	1
  (0, 5)	2
  (0, 291)	1
  (0, 347)	1
  (0, 138)	1
  (0, 118)	1
  (0, 60)	1
  (0, 146)	1
  (0, 241)	1
  (1, 118)	1
  (1, 145)	1
  (1, 188)	1
  (1, 68)	1
  (1, 156)	2
  (1, 215)	1
  (1, 13)	1
  (1, 57)	1
  (1, 207)	1
  :	:
  (163, 219)	1
  (163, 155)	1
  (164, 146)	1
  (164, 12)	1
  (164, 142)	1
  (164, 89)	1
  (164, 132)	1
  (164, 234)	1
  (164, 20)	1
  (164, 301)	1
  (164, 111)	1
  (164, 246)	1
  (164, 240)	1
  (165, 208)	1
  (165, 302)	2
  (165, 40)	1
  (165, 206)	1
  (165, 24)	1
  (165, 305)	1
  (165, 128)	1
  (166, 309)	1
  (166, 142)	1
  (166, 88)	1
  (166, 338)	1
  (166, 43)	1




ValueError: could not convert string to float: 'Removing duplicates in lists'