# Importation

In [None]:
import functools
import random
import string

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

# Séparation test et entraînement

In [None]:
# train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# train_df.to_csv('../data/train.csv', index=False)
# test_df.to_csv('../data/test.csv', index=False)


### Ouvrir les csv qui ont déjà été divisés

In [None]:
# Load the train/test splits that were previously written to disk.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

# Variables

In [None]:
# Training texts and labels, plus the reference labels of the test split.
x_train, y_train = train_df['recette'], train_df['type']
y_test = test_df['type']



# Baseline 

### Prédiction aléatoire

In [None]:
def random_prediction():
    """Return one of the three recipe types, drawn uniformly at random."""
    candidate_labels = ('Plat principal', 'Entrée', 'Dessert')
    return random.choice(candidate_labels)

# One independent random label per test row. A plain comprehension avoids
# building a row Series for every record only to ignore it.
test_df['random prediction'] = [random_prediction() for _ in range(len(test_df))]


### Prédiction classe majoritaire

In [None]:
def main_class_prediction(train_df):
    """Return the most frequent label of the ``type`` column as a single value.

    ``Series.mode()`` returns a Series (one entry per tied mode, in sorted
    order), so the first entry is extracted to hand back a scalar label that
    can be broadcast to every test row. When several labels are tied, the
    first one in sorted order is returned.

    Args:
        train_df: DataFrame with a ``type`` column holding the class labels.

    Returns:
        The majority class label.

    Raises:
        ValueError: if the ``type`` column contains no non-null value.
    """
    modes = train_df['type'].mode()
    if modes.empty:
        raise ValueError("Cannot determine the majority class: 'type' has no values")
    return modes.iloc[0]

# Majority-class baseline: compute the most frequent training label once,
# then give that same prediction to every row of the test set.
main_type = main_class_prediction(train_df)
test_df['main class prediction'] = test_df.apply(lambda x: main_type, axis=1)

# Normalisation

In [None]:
# Download the NLTK resources needed by the normalisation step (only has to
# succeed once per environment).
# - 'punkt' / 'punkt_tab': tokenizer models; recent NLTK releases load
#   'punkt_tab' while older ones load 'punkt', so both are requested.
# - 'wordnet' / 'omw-1.4': data used by WordNetLemmatizer ('omw-1.4' is
#   required by some NLTK releases when WordNet is first loaded).
# - 'stopwords': stop-word lists, including the French one.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(resource)

# Translation table mapping every punctuation mark to a space. Replacing
# (instead of deleting) keeps French elisions apart: "l'huile" becomes
# "l huile", so the stop word "l" can be filtered, rather than "lhuile".
# Common typographic marks missing from string.punctuation are included.
_PUNCTUATION_TO_SPACE = str.maketrans(
    {char: ' ' for char in string.punctuation + '’‘«»“”…'}
)


@functools.lru_cache(maxsize=1)
def _normalization_resources():
    """Build the French stop-word set and the lemmatizer once, then reuse them."""
    return frozenset(stopwords.words('french')), WordNetLemmatizer()


def normalize_text(text):
    """Normalize a recipe text into a space-separated string of tokens.

    Steps: lowercase, replace punctuation with spaces, tokenize with the
    French tokenizer models, drop French stop words, lemmatize each
    remaining token, and join the tokens with single spaces.

    Args:
        text: The raw text. ``None`` and NaN (missing CSV cells) are accepted;
            any other non-string value is converted with ``str``.

    Returns:
        The normalized text, or an empty string for a missing value.
    """
    # Missing cells read from a CSV arrive as float NaN (NaN != NaN).
    if text is None or (isinstance(text, float) and text != text):
        return ''

    stop_words, lemmatizer = _normalization_resources()

    cleaned = str(text).lower().translate(_PUNCTUATION_TO_SPACE)
    tokens = word_tokenize(cleaned, language='french')

    # NOTE(review): WordNetLemmatizer relies on the English WordNet, so it
    # presumably leaves most French words unchanged; a French lemmatizer
    # would be needed for real lemmatization.
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(kept)

# Store a normalized copy of every training recipe in a new column.
normalized_recipes = train_df['recette'].map(normalize_text)
train_df['recette_normalized'] = normalized_recipes

# Model

### scikit-learn + TF-IDF

In [None]:
def TfiDf_Sklearn(x_train, y_train, test_df):
    """Train a TF-IDF + multinomial naive Bayes pipeline and predict the test set.

    The pipeline is fitted on ``x_train`` / ``y_train`` and applied to the
    ``recette`` column of ``test_df``. The predictions are also written into
    ``test_df`` under the ``'Tfi-Df prediction'`` column.

    Args:
        x_train: Training texts.
        y_train: Training labels.
        test_df: DataFrame with a ``recette`` column; modified in place.

    Returns:
        The labels predicted for ``test_df['recette']``.
    """
    classifier = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(x_train, y_train)
    predictions = classifier.predict(test_df['recette'])
    test_df['Tfi-Df prediction'] = predictions
    return predictions

# Fit on the raw training recipes and keep the predictions for the test set.
y_pred_Sklearn = TfiDf_Sklearn(x_train, y_train, test_df)

### Avec normalisation

In [None]:
# The model is trained on normalized text, so the test recipes must go through
# the same normalization before prediction; otherwise the TF-IDF vocabulary
# learned at training time does not match the test tokens.
# Working on a copy also keeps the raw-text predictions already stored in
# test_df['Tfi-Df prediction'] from being overwritten.
test_df_normalized = test_df.assign(recette=test_df['recette'].apply(normalize_text))
y_pred_Sklearn_normalized = TfiDf_Sklearn(
    train_df['recette_normalized'], y_train, test_df_normalized
)
test_df['Tfi-Df normalized prediction'] = y_pred_Sklearn_normalized

### Spacy

In [None]:
def spacyclassifier(x_train, y_train, test_df):
    """Fit a pipeline on the training data and predict the test recipes.

    The predictions for ``test_df['recette']`` are stored in the
    ``'spacy prediction'`` column of ``test_df`` and returned.

    NOTE(review): this looks unfinished. ``make_pipeline()`` is called without
    any step, so no vectorizer or classifier is configured and the function is
    expected to fail when run. TODO: add the intended steps.
    """
    # NOTE(review): empty pipeline, no estimator has been chosen yet.
    model2 = make_pipeline()

    model2.fit(x_train, y_train)

    y_pred_spacy = model2.predict(test_df['recette'])

    # Keep the predictions next to the test rows (modifies test_df in place).
    test_df['spacy prediction'] = y_pred_spacy

    return y_pred_spacy

# Résultat

In [None]:
def results(y_model, y_test):
    """Display the confusion matrix and the classification report of a model.

    The confusion matrix is drawn as a heatmap whose rows (true values) and
    columns (predictions) are labelled with the class names, then the
    precision / recall / F1 report is printed.

    Args:
        y_model: Labels predicted by the model.
        y_test: Reference labels, in the same order as ``y_model``.
    """
    # Local import: scikit-learn is already a dependency of this file.
    from sklearn.utils.multiclass import unique_labels

    # Fix the label order explicitly so the matrix rows/columns can be named.
    labels = unique_labels(y_test, y_model)
    conf_matrix = confusion_matrix(y_test, y_model, labels=labels)
    conf_df = pd.DataFrame(conf_matrix, index=labels, columns=labels)

    print("Matrice de Confusion : \n")

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_df, annot=True, cmap="YlGnBu", fmt='g')
    plt.title('Matrice de Confusion')
    plt.xlabel('Prédictions')
    plt.ylabel('Valeurs Réelles')
    plt.show()

    # Recall - precision - F1-score.
    # zero_division=0 reports 0 for classes that are never predicted (e.g. the
    # majority-class baseline) instead of emitting a warning for each of them.
    report = classification_report(y_test, y_model, zero_division=0)
    print(report)

## Random

In [None]:
# Evaluate the random baseline against the reference labels.
y_rand = test_df['random prediction']
results(y_rand, y_test)

## Main

In [None]:
# Evaluate the majority-class baseline against the reference labels.
y_main = test_df['main class prediction']
results(y_main, y_test)

### scikit-learn + TF-IDF

In [None]:
# Evaluate the model trained on the raw recipe text.
results(y_pred_Sklearn, y_test)

### scikit-learn + TF-IDF avec texte normalisé

In [None]:
# Evaluate the model trained on the normalized recipe text.
results(y_pred_Sklearn_normalized, y_test)

In [None]:
from sklearn.metrics import classification_report

# Print both classification reports one after the other for comparison.
for header, predictions in (
    ("Results for y_pred_Sklearn:", y_pred_Sklearn),
    ("Results for y_pred_Sklearn_normalized:", y_pred_Sklearn_normalized),
):
    print(header)
    print(classification_report(y_test, predictions))
