# Importation

In [None]:
import pandas as pd
import random

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer

import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import string

### Séparation test et entraînement

In [None]:
# train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# train_df.to_csv('../data/train.csv', index=False)
# test_df.to_csv('../data/test.csv', index=False)


### Ouvrir les csv qui ont déjà été divisés

In [None]:
# Load the previously split training and test sets from disk.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('../data/train.csv', '../data/test.csv')
)

# Variables

In [None]:
# Training texts and labels, plus the ground-truth labels of the test set.
x_train, y_train = train_df['recette'], train_df['type']
y_test = test_df['type']



# Baseline 

### Prédiction aléatoire

In [None]:
def random_prediction():
    """Return one of the three recipe types, drawn uniformly at random."""
    recipe_types = ['Plat principal', 'Entrée', 'Dessert']
    return random.choice(recipe_types)

# Random baseline: draw one independent random label for every test row.
test_df['random prediction'] = [random_prediction() for _ in range(len(test_df))]


### Prédiction classe majoritaire

In [None]:
def main_class_prediction(train_df):
    """Return the most frequent label of the ``type`` column.

    ``Series.mode()`` returns a Series (one entry per tied value), not a
    single label. The first entry is returned so the caller always gets one
    scalar label, even when several classes are tied for first place; ties
    resolve to the first value in the sorted order ``mode()`` produces.

    Args:
        train_df: DataFrame with a ``type`` column holding the class labels.

    Returns:
        The majority label as a scalar.

    Raises:
        ValueError: if the ``type`` column has no non-null value.
    """
    modes = train_df['type'].mode()
    if modes.empty:
        raise ValueError("Cannot determine the majority class: 'type' column is empty")
    return modes.iloc[0]

# Majority-class baseline: compute the value returned by main_class_prediction
# once on the training set, then hand that same value back for every test row.
main_type = main_class_prediction(train_df=train_df)
test_df['main class prediction'] = test_df.apply(func=lambda _row: main_type, axis=1)

# Normalisation

In [None]:
# Download the NLTK resources used below (only needed the first time).
# NOTE(review): recent NLTK releases may also require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of once per normalized text.
_STOP_WORDS_CACHE = {}

# Each punctuation mark is replaced by a space rather than deleted: deleting
# the apostrophe would glue elided words together ("l'huile" -> "lhuile"),
# which hides both the stop word and the real token.
_PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# NOTE(review): WordNetLemmatizer relies on the English WordNet, so it
# presumably leaves most French tokens unchanged — confirm whether a French
# stemmer/lemmatizer is wanted here.
_LEMMATIZER = WordNetLemmatizer()


def _get_stop_words(language):
    """Return the cached NLTK stop-word set for *language*."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def normalize_text(text, language='french'):
    """Normalize a recipe text for bag-of-words models.

    Steps: lowercase, replace punctuation with spaces, tokenize, drop stop
    words, lemmatize, and join the remaining tokens with single spaces.

    Args:
        text: Raw text. Non-string values (e.g. a missing cell read as NaN)
            are treated as empty text.
        language: NLTK language name used for tokenization and stop words.

    Returns:
        The normalized text as a single space-separated string.
    """
    if not isinstance(text, str):
        return ''

    cleaned = text.lower().translate(_PUNCTUATION_TO_SPACE)
    tokens = word_tokenize(cleaned, language=language)

    stop_words = _get_stop_words(language)
    lemmas = [
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]
    return ' '.join(lemmas)

# Store a normalized version of every training recipe in a new column.
train_df['recette_normalized'] = train_df['recette'].map(normalize_text)

# Model

### MultinomialNB

In [None]:
def MultinomialNB_classifier(x_train, y_train, test_df):
    """Train a TF-IDF + multinomial naive Bayes pipeline and label the test recipes.

    Args:
        x_train: Training texts.
        y_train: Training labels, aligned with ``x_train``.
        test_df: DataFrame whose ``recette`` column holds the texts to classify.

    Returns:
        The labels predicted for ``test_df['recette']``.

    Side effects:
        Writes the predictions into ``test_df['Tfi-Df prediction']``.
    """
    pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())
    pipeline.fit(x_train, y_train)

    predictions = pipeline.predict(test_df['recette'])
    test_df['Tfi-Df prediction'] = predictions
    return predictions


y_pred_MultNB = MultinomialNB_classifier(x_train=x_train, y_train=y_train, test_df=test_df)

### Avec normalisation

In [None]:
# Normalize the test recipes with the same function as the training recipes:
# a model fitted on normalized text must also predict on normalized text.
# `assign` returns a new DataFrame, so the classifier reads the normalized
# 'recette' column from that copy and writes its prediction column there,
# leaving the first (non-normalized) run's column in test_df untouched.
test_df_normalized = test_df.assign(recette=test_df['recette'].apply(normalize_text))
y_pred_MultNB_normalized = MultinomialNB_classifier(
    train_df['recette_normalized'], y_train, test_df_normalized
)

### Random Forest

In [None]:
def RandomForest(x_train, y_train, test_df, random_state=None):
    """Train a TF-IDF + random forest pipeline and label the test recipes.

    Args:
        x_train: Training texts.
        y_train: Training labels, aligned with ``x_train``.
        test_df: DataFrame whose ``recette`` column holds the texts to classify.
        random_state: Seed forwarded to ``RandomForestClassifier``. ``None``
            (the default) keeps the previous unseeded behavior.

    Returns:
        The labels predicted for ``test_df['recette']``.

    Side effects:
        Writes the predictions into ``test_df['RandomForest Prediction']``.
    """
    model = make_pipeline(
        TfidfVectorizer(),
        RandomForestClassifier(random_state=random_state),
    )

    model.fit(x_train, y_train)

    y_pred = model.predict(test_df['recette'])

    test_df['RandomForest Prediction'] = y_pred

    return y_pred


# Fixed seed so the reported metrics can be reproduced from one run to the next.
y_pred_rf = RandomForest(x_train, y_train, test_df, random_state=42)


### Réseau de neurones

In [None]:
# Data preprocessing: bag-of-words counts, with the vocabulary fitted on the
# training texts and reused unchanged for the test texts.
vectorizer = CountVectorizer()
x_train_counts = vectorizer.fit_transform(x_train)
x_test_counts = vectorizer.transform(test_df['recette'])

# Encode the string labels as integer class indices.
label_to_idx = {
    label: idx
    for idx, label in enumerate(('Entrée', 'Plat principal', 'Dessert'))
}
y_train_idx = list(map(label_to_idx.__getitem__, y_train))

# Model definition
class RecipeClassifier(nn.Module):
    """Feed-forward classifier: linear layer, ReLU, then a linear output layer.

    Args:
        input_size: Number of input features.
        hidden_size: Number of units in the hidden layer.
        num_classes: Number of output scores (one per class).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the raw (unnormalized) class scores for ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Network dimensions: one input per column of the count matrix,
# 100 hidden units and 3 output classes.
input_size = x_train_counts.shape[1]
hidden_size, num_classes = 100, 3
model = RecipeClassifier(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Model training
num_epochs = 10

# The training data is identical for every epoch, so the sparse count matrix
# is densified and turned into tensors once, outside the loop.
inputs = torch.tensor(x_train_counts.toarray(), dtype=torch.float32)
labels = torch.tensor(y_train_idx, dtype=torch.long)

for epoch in range(num_epochs):
    # Forward pass on the whole training set (single full batch)
    outputs = model(inputs)
    loss = criterion(outputs, labels)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print("Epoch [", epoch+1, "/", num_epochs, "]")

# Model evaluation
# Invert the label mapping once so each prediction is a single dict lookup
# instead of rebuilding two lists and scanning them for every row.
idx_to_label = {idx: label for label, idx in label_to_idx.items()}

with torch.no_grad():
    inputs = torch.tensor(x_test_counts.toarray(), dtype=torch.float32)
    outputs = model(inputs)
    # Index of the highest score along dim 1 is the predicted class.
    _, predicted = torch.max(outputs, 1)
    # tolist() yields plain Python ints, usable as dictionary keys.
    predicted_labels = [idx_to_label[idx] for idx in predicted.tolist()]

test_df['Neural Network prediction'] = predicted_labels


### SVC

In [None]:
def SVC_classifier(x_train, y_train, test_df):
    """Train a TF-IDF + support vector classifier pipeline and label the test recipes.

    Args:
        x_train: Training texts.
        y_train: Training labels, aligned with ``x_train``.
        test_df: DataFrame whose ``recette`` column holds the texts to classify.

    Returns:
        The labels predicted for ``test_df['recette']``.

    Side effects:
        Writes the predictions into ``test_df['SVC prediction']``.
    """
    steps = (TfidfVectorizer(), SVC())
    classifier = make_pipeline(*steps)
    classifier.fit(x_train, y_train)

    test_texts = test_df['recette']
    predicted = classifier.predict(test_texts)
    test_df['SVC prediction'] = predicted
    return predicted


y_pred_SVC = SVC_classifier(x_train=x_train, y_train=y_train, test_df=test_df)


# Résultat

In [None]:
def results(y_model, y_test):
    """Display the confusion matrix and the classification report of a model.

    Args:
        y_model: Labels predicted by the model.
        y_test: Ground-truth labels, aligned with ``y_model``.

    Side effects:
        Shows a heatmap of the confusion matrix (rows are true labels,
        columns are predictions) and prints the precision / recall / F1
        report.
    """
    # Local import: only this function needs it.
    from sklearn.utils.multiclass import unique_labels

    # Same label set and ordering scikit-learn uses by default, made explicit
    # so the heatmap axes show class names instead of anonymous 0/1/2 indices.
    labels = unique_labels(y_test, y_model)

    conf_matrix = confusion_matrix(y_test, y_model, labels=labels)

    conf_df = pd.DataFrame(conf_matrix, index=labels, columns=labels)

    print("Matrice de Confusion : \n")

    plt.figure(figsize=(8,6))
    sns.heatmap(conf_df, annot=True, cmap="YlGnBu", fmt='g')
    plt.title('Matrice de Confusion')
    plt.xlabel('Prédictions')
    plt.ylabel('Valeurs Réelles')
    plt.show()

    # Recall - Precision - F1-score

    report = classification_report(y_test, y_model)
    print(report)

### Random

In [None]:
# Evaluate the random baseline against the true test labels.
y_rand = test_df['random prediction']
results(y_model=y_rand, y_test=y_test)

### Main

In [None]:
# Evaluate the majority-class baseline against the true test labels.
y_main = test_df['main class prediction']
results(y_model=y_main, y_test=y_test)

### MultinomialNB

In [None]:
# Evaluate the MultinomialNB predictions obtained from the raw text.
results(y_model=y_pred_MultNB, y_test=y_test)

### MultinomialNB avec texte normalisé

In [None]:
# Evaluate the MultinomialNB predictions obtained with normalized training text.
results(y_model=y_pred_MultNB_normalized, y_test=y_test)

### Comparaison entre normalisé et non normalisé pour MultinomialNB

In [None]:
# Print both classification reports one after the other for comparison.
for title, predictions in (
    ("Results for y_pred_MultNB:", y_pred_MultNB),
    ("Results for y_pred_MultNB_normalized:", y_pred_MultNB_normalized),
):
    print(title)
    print(classification_report(y_test, predictions))


### Random Forest

In [None]:
# Evaluate the random forest predictions.
results(y_model=y_pred_rf, y_test=y_test)

### SVC

In [None]:
# Evaluate the SVC predictions.
results(y_model=y_pred_SVC, y_test=y_test)

### Réseau de neurones

In [None]:
# Evaluate the neural network predictions.
results(y_model=predicted_labels, y_test=y_test)