# Import

In [133]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report

from nltk.tokenize import word_tokenize

from tqdm import tqdm
import nltk
import re
import string

from nltk.stem import WordNetLemmatizer

# Model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Vectorizer
from gensim.models import Word2Vec
from gensim.models import FastText
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Bidirectional, LSTM, Flatten
from tensorflow.keras.optimizers import Adam

In [134]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Dataframe

In [135]:
df = pd.read_csv("bbc-news-data.csv", sep='\t')

In [136]:
df['total_content']=df['title'] + ' ' + df['content']
df = df.drop(columns=['filename','title','content'])
df.head()

Unnamed: 0,category,total_content
0,business,Ad sales boost Time Warner profit Quarterly p...
1,business,Dollar gains on Greenspan speech The dollar h...
2,business,Yukos unit buyer faces loan claim The owners ...
3,business,High fuel prices hit BA's profits British Air...
4,business,Pernod takeover talk lifts Domecq Shares in U...


In [137]:
df = df.map(lambda line : line.lower())
df.head()

Unnamed: 0,category,total_content
0,business,ad sales boost time warner profit quarterly p...
1,business,dollar gains on greenspan speech the dollar h...
2,business,yukos unit buyer faces loan claim the owners ...
3,business,high fuel prices hit ba's profits british air...
4,business,pernod takeover talk lifts domecq shares in u...


# Tokenization

In [138]:
# Listes pour stocker les tokens et les variétés
content_token = []
category_token = []

# Parcours des descriptions
for i, text_description in enumerate(tqdm(df["total_content"], desc="Tokenizing")):
    words_token = word_tokenize(text_description)
    content_token.append(words_token)
    category_token.append(df.loc[i, "category"])  # récupère la variété correspondante

# Création du DataFrame final
df_tokenise = pd.DataFrame({
    "total_content": content_token,
    "category": category_token
})

Tokenizing: 100%|██████████| 2225/2225 [00:05<00:00, 386.13it/s]


In [139]:
def clean_tokens(tokens):
    cleaned = [re.sub(f"[{string.punctuation}0-9]", "", w) for w in tokens]
    # Supprimer les tokens vides après nettoyage
    return [w for w in cleaned if w != ""]

# Appliquer sur la colonne
df_tokenise['total_content_clean'] = df_tokenise['total_content'].apply(clean_tokens)

In [140]:
df_tokenise.head()

Unnamed: 0,total_content,category,total_content_clean
0,"[ad, sales, boost, time, warner, profit, quart...",business,"[ad, sales, boost, time, warner, profit, quart..."
1,"[dollar, gains, on, greenspan, speech, the, do...",business,"[dollar, gains, on, greenspan, speech, the, do..."
2,"[yukos, unit, buyer, faces, loan, claim, the, ...",business,"[yukos, unit, buyer, faces, loan, claim, the, ..."
3,"[high, fuel, prices, hit, ba, 's, profits, bri...",business,"[high, fuel, prices, hit, ba, s, profits, brit..."
4,"[pernod, takeover, talk, lifts, domecq, shares...",business,"[pernod, takeover, talk, lifts, domecq, shares..."


# Lemmatization

In [141]:
lemmatizer = WordNetLemmatizer()

varieties_lemmat = []
description_lemmat = []

for i, list_token in enumerate(tqdm(df_tokenise["total_content"], desc="Lemmatizating")):
    # loop for stemming each word
    description_lemmat.append([lemmatizer.lemmatize(word) for word in list_token])

    varieties_lemmat.append(df_tokenise.loc[i, "category"])  # variety

# Création du DataFrame final
df_lemmat = pd.DataFrame({
    "total_content": description_lemmat,
    "category": varieties_lemmat
})

Lemmatizating: 100%|██████████| 2225/2225 [00:03<00:00, 564.16it/s]


In [142]:
df_lemmat.head()

Unnamed: 0,total_content,category
0,"[ad, sale, boost, time, warner, profit, quarte...",business
1,"[dollar, gain, on, greenspan, speech, the, dol...",business
2,"[yukos, unit, buyer, face, loan, claim, the, o...",business
3,"[high, fuel, price, hit, ba, 's, profit, briti...",business
4,"[pernod, takeover, talk, lift, domecq, share, ...",business


In [143]:
nltk.download('stopwords')
from nltk.corpus import stopwords

#List of stop words
stop_words = set(stopwords.words('english'))

#Delete stopwords
df_lemmat['total_content'] = df_lemmat['total_content'].apply(
    lambda x: [word for word in x if word not in stop_words]
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [144]:
df_lemmat.head()

Unnamed: 0,total_content,category
0,"[ad, sale, boost, time, warner, profit, quarte...",business
1,"[dollar, gain, greenspan, speech, dollar, ha, ...",business
2,"[yukos, unit, buyer, face, loan, claim, owner,...",business
3,"[high, fuel, price, hit, ba, 's, profit, briti...",business
4,"[pernod, takeover, talk, lift, domecq, share, ...",business


In [145]:
# Transformer les labels en entiers
le = LabelEncoder()
y = le.fit_transform(df['category'])
num_classes = len(np.unique(y))

In [146]:
# Transforme en liste de chaine de caractere pour la vectorization
df_lemmat["total_content"] = df_lemmat["total_content"].apply(
    lambda x : " ".join(x)
)

In [147]:
liste_combi = []
list_acc = []

# TFIDF

In [148]:
vectorizer = TfidfVectorizer()
df_vector = vectorizer.fit_transform(df_lemmat["total_content"])

In [149]:
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(df_vector, y, test_size=0.3, random_state=42)

### LinearSVC

In [150]:
model_SVM = SVC(kernel='linear')  # kernel linéaire pour texte
model_SVM.fit(X_train, y_train)

In [151]:
# --- Prédiction ---
y_pred = model_SVM.predict(X_test)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("TF-IDF + Linear SVC")
list_acc.append(acc)

Accuracy: 0.9760479041916168

Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.96      0.97       165
entertainment       0.98      0.99      0.99       118
     politics       0.97      0.96      0.96       120
        sport       0.98      0.99      0.99       140
         tech       0.98      0.98      0.98       125

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668



### Logistic Regression

In [152]:
model_LogisticRegression = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)
model_LogisticRegression.fit(X_train, y_train)


In [153]:
# --- Prédiction ---
y_pred = model_SVM.predict(X_test)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("TF-IDF + Logistic Regression")
list_acc.append(acc)

Accuracy: 0.9760479041916168

Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.96      0.97       165
entertainment       0.98      0.99      0.99       118
     politics       0.97      0.96      0.96       120
        sport       0.98      0.99      0.99       140
         tech       0.98      0.98      0.98       125

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668



### MLP (NN)

In [154]:
model_MLP = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='logistic',
    solver='adam',
    max_iter=300,
    random_state=42
)
model_MLP.fit(X_train, y_train)

In [155]:
# --- Prédiction ---
y_pred = model_SVM.predict(X_test)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("TF-IDF + MLP(NN)")
list_acc.append(acc)

Accuracy: 0.9760479041916168

Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.96      0.97       165
entertainment       0.98      0.99      0.99       118
     politics       0.97      0.96      0.96       120
        sport       0.98      0.99      0.99       140
         tech       0.98      0.98      0.98       125

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668



### Dense model (Keras fully connected NN)

In [156]:
model_Dense = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(len(set(y_train)), activation='softmax')  # softmax pour classification multi-classes
])

model_Dense.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model_Dense.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 42ms/step - accuracy: 0.4799 - loss: 1.4939
Epoch 2/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 54ms/step - accuracy: 0.9758 - loss: 0.3832
Epoch 3/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - accuracy: 0.9993 - loss: 0.0351
Epoch 4/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 1.0000 - loss: 0.0118
Epoch 5/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - accuracy: 1.0000 - loss: 0.0055
Epoch 6/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 41ms/step - accuracy: 1.0000 - loss: 0.0033
Epoch 7/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 58ms/step - accuracy: 1.0000 - loss: 0.0025
Epoch 8/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - accuracy: 1.0000 - loss: 0.0017
Epoch 9/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x7c783e43e330>

In [157]:
# --- Prédiction ---
y_pred_probs = model_Dense.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("TF-IDF + Dense model (Keras fully connected NN")
list_acc.append(acc)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Accuracy: 0.9760479041916168

Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.95      0.96       165
entertainment       0.97      0.98      0.98       118
     politics       0.97      0.97      0.97       120
        sport       0.99      0.99      0.99       140
         tech       0.96      0.99      0.98       125

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668



# Count Vectorizer

In [158]:
# Vectorisation (sac de mots)
vectorizer = CountVectorizer()
df_vector = vectorizer.fit_transform(df_lemmat["total_content"])

In [159]:
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(df_vector, y, test_size=0.3, random_state=42)

### Logistic Regression

In [160]:
# Modèle
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

In [161]:
# --- Prédiction ---
y_pred = lr_model.predict(X_test)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("Count Vectorizer + Logistic Regression")
list_acc.append(acc)

Accuracy: 0.9700598802395209

Classification Report:
                precision    recall  f1-score   support

     business       0.96      0.95      0.95       165
entertainment       0.96      0.99      0.97       118
     politics       0.96      0.96      0.96       120
        sport       0.97      1.00      0.99       140
         tech       1.00      0.96      0.98       125

     accuracy                           0.97       668
    macro avg       0.97      0.97      0.97       668
 weighted avg       0.97      0.97      0.97       668



### Dense model (Keras fully connected NN)

In [162]:
model_Dense = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(len(set(y_train)), activation='softmax')  # softmax pour classification multi-classes
])

model_Dense.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Convertir en dense
X_train_dense = X_train.toarray()

model_Dense.fit(X_train_dense, y_train, epochs=10, batch_size=32, verbose=1)



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - accuracy: 0.7368 - loss: 0.9033
Epoch 2/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 44ms/step - accuracy: 0.9956 - loss: 0.0215
Epoch 3/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - accuracy: 0.9978 - loss: 0.0089
Epoch 4/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - accuracy: 1.0000 - loss: 0.0017
Epoch 5/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 62ms/step - accuracy: 1.0000 - loss: 8.5304e-04
Epoch 6/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - accuracy: 1.0000 - loss: 6.6925e-04
Epoch 7/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - accuracy: 1.0000 - loss: 5.2725e-04
Epoch 8/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - accuracy: 1.0000 - loss: 3.7920e-04
Epoch 9/10
[1m49/49[0m [32m━━

<keras.src.callbacks.history.History at 0x7c784ca9d970>

In [163]:
# --- Prédiction ---
X_test_dense = X_test.toarray()

y_pred_probs = model_Dense.predict(X_test_dense)
y_pred = np.argmax(y_pred_probs, axis=1)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("Count Vectorizer + Dense model (Keras fully connected NN")
list_acc.append(acc)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Accuracy: 0.9730538922155688

Classification Report:
                precision    recall  f1-score   support

     business       0.96      0.96      0.96       165
entertainment       0.97      0.98      0.98       118
     politics       0.96      0.96      0.96       120
        sport       0.99      0.99      0.99       140
         tech       0.99      0.97      0.98       125

     accuracy                           0.97       668
    macro avg       0.97      0.97      0.97       668
 weighted avg       0.97      0.97      0.97       668



# Word2Vec + BiLSTM

### Word2Vec

In [164]:
# Préparer les documents tokenisés (liste de mots par doc)
documents = [doc.split() for doc in df_lemmat["total_content"]]

In [165]:
w2v_model = Word2Vec(
    sentences=documents,
    vector_size=100,   # dimension des vecteurs
    window=5,
    min_count=1,
    workers=4,
    seed=42
)

In [166]:
# Créer un dictionnaire des mots
word_index = {word: i+1 for i, word in enumerate(w2v_model.wv.index_to_key)}

In [167]:
# Construire la matrice d'embeddings
embedding_matrix = np.zeros((len(word_index)+1, w2v_model.vector_size))
for word, i in word_index.items():
    embedding_matrix[i] = w2v_model.wv[word]

In [168]:
# Transformer documents en séquences d'indices
sequences = [[word_index.get(w, 0) for w in doc] for doc in documents]

In [169]:
# Padding pour que toutes les séquences aient la même longueur
max_len = 100  # longueur max des séquences
X_seq = pad_sequences(sequences, maxlen=max_len)

In [170]:
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_seq, y, test_size=0.3, random_state=42
)

### BiLSTM

In [171]:
model_BiLSTM = Sequential([
    Embedding(input_dim=len(word_index)+1,
              output_dim=w2v_model.vector_size,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),  # embeddings gelés
    Bidirectional(LSTM(64)),
    Dense(num_classes, activation='softmax')  # classification multiclass
])

model_BiLSTM.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',  # labels entiers
    metrics=['accuracy']
)

# Entraînement
model_BiLSTM.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=10, batch_size=32, verbose=1)

Epoch 1/10




[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 129ms/step - accuracy: 0.3157 - loss: 1.5048 - val_accuracy: 0.5614 - val_loss: 1.1890
Epoch 2/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 93ms/step - accuracy: 0.5392 - loss: 1.1731 - val_accuracy: 0.6003 - val_loss: 1.0369
Epoch 3/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 120ms/step - accuracy: 0.6584 - loss: 0.9430 - val_accuracy: 0.5629 - val_loss: 1.0813
Epoch 4/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 94ms/step - accuracy: 0.6215 - loss: 0.9624 - val_accuracy: 0.6901 - val_loss: 0.8117
Epoch 5/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 94ms/step - accuracy: 0.7026 - loss: 0.7696 - val_accuracy: 0.6811 - val_loss: 0.8434
Epoch 6/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 118ms/step - accuracy: 0.6858 - loss: 0.7648 - val_accuracy: 0.6437 - val_loss: 0.9118
Epoch 7/10
[1m49/49[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7c782c824260>

In [172]:
# --- Prédiction ---
y_pred_probs = model_BiLSTM.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("Word2Vec + BiLSTM")
list_acc.append(acc)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step
Accuracy: 0.7649700598802395

Classification Report:
                precision    recall  f1-score   support

     business       0.80      0.82      0.81       165
entertainment       0.72      0.58      0.64       118
     politics       0.87      0.68      0.77       120
        sport       0.71      0.89      0.79       140
         tech       0.75      0.81      0.78       125

     accuracy                           0.76       668
    macro avg       0.77      0.76      0.76       668
 weighted avg       0.77      0.76      0.76       668



# Résultat

In [173]:
df_results = pd.DataFrame({
    "Model": liste_combi,
    "Accuracy": list_acc
})

df_results = df_results.sort_values(by="Accuracy", ascending=False).reset_index(drop=True)

In [174]:
df_results.set_index("Model").head(len(df_results))

Unnamed: 0_level_0,Accuracy
Model,Unnamed: 1_level_1
TF-IDF + Linear SVC,0.976048
TF-IDF + Logistic Regression,0.976048
TF-IDF + MLP(NN),0.976048
TF-IDF + Dense model (Keras fully connected NN,0.976048
Count Vectorizer + Dense model (Keras fully connected NN,0.973054
Count Vectorizer + Logistic Regression,0.97006
Word2Vec + BiLSTM,0.76497


# Reports – Results of the Combinations



### 1. Objective:

The objective of this lab was to test different combinations of vectorizers and classification models to predict news categories. We systematically tested TF-IDF, CountVectorizer, and Word2Vec as text representations, combined with several models: Logistic Regression, Linear SVM, scikit-learn MLP (NN), Dense Keras NN, and BiLSTM Keras.

### 2. Results :

| Model                                                    |   Accuracy |
|:---------------------------------------------------------|-----------:|
| TF-IDF + Linear SVC                                      |   0.976048 |
| TF-IDF + Logistic Regression                             |   0.976048 |
| TF-IDF + MLP(NN)                                         |   0.976048 |
| TF-IDF + Dense model (Keras fully connected NN)          |   0.976048 |
| Count Vectorizer + Dense model (Keras fully connected NN)|   0.973054 |
| Count Vectorizer + Logistic Regression                   |   0.97006  |
| Word2Vec + BiLSTM                                        |   0.76497  |

### 3. Analysis :
The results show that TF-IDF consistently performs best across all tested models, with Linear SVM, Logistic Regression, MLP, and Dense Keras NN all achieving the same accuracy of 0.9760. This indicates that for this dataset, the simplicity and effectiveness of TF-IDF are sufficient for strong classification performance.

The Count Vectorizer also performs well, particularly with a Dense Keras NN (0.973), but slightly below TF-IDF-based models, suggesting that while raw count representations are informative, TF-IDF provides a more discriminative feature set for news classification.

In contrast, Word2Vec combined with a BiLSTM significantly underperforms (0.765), highlighting that either the BiLSTM architecture is not optimal for this small dataset, or that Word2Vec embeddings alone do not capture enough semantic detail for this classification task.

Overall, adding dense layers in Keras does not yield a noticeable gain over simpler linear models when using TF-IDF, but may still be useful when experimenting with different embeddings like Word2Vec.

### 4. Preliminary Conclusion :
For this dataset and the tested configurations, TF-IDF combined with linear and Dense Keras models consistently achieves the highest performance with an accuracy of 0.976. Count Vectorizer also performs well, particularly with a Dense Keras NN (0.973), but slightly below TF-IDF. Word2Vec combined with a BiLSTM remains significantly less effective (0.765), suggesting that this architecture is not optimal for the current dataset and embedding setup.

Adding dense layers does not provide a substantial improvement over linear models when using TF-IDF, but Dense architectures could still be beneficial for alternative embeddings such as Word2Vec or pre-trained embeddings.

# Next experiments

Next steps will focus on exploring new combinations that may improve performance. Specifically, we will test Word2Vec embeddings with a Dense Keras model to evaluate whether simpler fully connected architectures can leverage Word2Vec more effectively than a BiLSTM.

Additionally, we will experiment with pre-trained FastText embeddings combined with Dense architectures, as FastText can better handle rare words, proper nouns, and morphological variations common in our news dataset.

These tests aim to provide a deeper understanding of how different embeddings interact with neural network architectures and to evaluate whether non-linear models can enhance generalization beyond TF-IDF and Count Vectorizer.

### Word2Vec + Dense Model

In [175]:
# Préparer les documents tokenisés (liste de mots par doc)
documents = [doc.split() for doc in df_lemmat["total_content"]]

In [176]:
w2v_model = Word2Vec(
    sentences=documents,
    vector_size=100,   # dimension des vecteurs
    window=5,
    min_count=1,
    workers=4,
    seed=42
)

In [177]:
# Créer un dictionnaire des mots
word_index = {word: i+1 for i, word in enumerate(w2v_model.wv.index_to_key)}

In [178]:
# Construire la matrice d'embeddings
embedding_matrix = np.zeros((len(word_index)+1, w2v_model.vector_size))
for word, i in word_index.items():
    embedding_matrix[i] = w2v_model.wv[word]

In [179]:
# Transformer documents en séquences d'indices
sequences = [[word_index.get(w, 0) for w in doc] for doc in documents]

In [180]:
# Padding pour que toutes les séquences aient la même longueur
max_len = 100  # longueur max des séquences
X_seq = pad_sequences(sequences, maxlen=max_len)

In [181]:
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_seq, y, test_size=0.3, random_state=42
)

In [182]:
model_Dense_W2V = Sequential([
    Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        input_length=X_train.shape[1],
        trainable=False  # mettre True si tu veux fine-tuner les vecteurs Word2Vec
    ),
    Flatten(),  # on aplati la séquence en un seul vecteur
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(len(set(y_train)), activation='softmax')
])

model_Dense_W2V.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

history = model_Dense_W2V.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

Epoch 1/10




[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.2663 - loss: 1.9910 - val_accuracy: 0.3077 - val_loss: 1.5267
Epoch 2/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.4071 - loss: 1.3800 - val_accuracy: 0.4231 - val_loss: 1.3945
Epoch 3/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.5274 - loss: 1.1532 - val_accuracy: 0.5256 - val_loss: 1.2417
Epoch 4/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.6445 - loss: 0.9335 - val_accuracy: 0.4968 - val_loss: 1.2833
Epoch 5/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.7122 - loss: 0.7973 - val_accuracy: 0.5160 - val_loss: 1.3294
Epoch 6/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.7759 - loss: 0.6110 - val_accuracy: 0.5321 - val_loss: 1.2728
Epoch 7/10
[1m39/39[0m [32m━━━━━━━━━━━━━━━

In [183]:
# --- Prédiction ---
y_pred_probs = model_Dense_W2V.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

# --- Mettre à jour df_results ---
df_results = pd.concat([
    df_results,
    pd.DataFrame([{"Model": "Word2Vec + Dense --> Nouveau", "Accuracy": acc}])
], ignore_index=True)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Accuracy: 0.6212574850299402

Classification Report:
                precision    recall  f1-score   support

     business       0.65      0.71      0.68       165
entertainment       0.65      0.41      0.50       118
     politics       0.56      0.74      0.64       120
        sport       0.57      0.61      0.59       140
         tech       0.71      0.60      0.65       125

     accuracy                           0.62       668
    macro avg       0.63      0.61      0.61       668
 weighted avg       0.63      0.62      0.62       668



### FastText + Dense model

In [184]:
fasttext_model = FastText(sentences= df_lemmat["total_content"].tolist(), vector_size=100, window=5, min_count=1, workers=4, seed=42)



In [185]:
# Dictionnaire de mot
word_index = {word: i+1 for i, word in enumerate(fasttext_model.wv.index_to_key)}

In [186]:
# Matrice d'embedding
embedding_matrix = np.zeros((len(word_index)+1, fasttext_model.vector_size))
for word, i in word_index.items():
    embedding_matrix[i] = fasttext_model.wv[word]

In [187]:
# Transforme documents en séquences d'indices
sequences = [[word_index.get(w, 0) for w in doc] for doc in documents]
max_len = 100
X_seq = pad_sequences(sequences, maxlen=max_len)

In [188]:
X_train, X_test, y_train, y_test = train_test_split(X_seq, y, test_size=0.3, random_state=42)

In [189]:
model_FastText_Dense = Sequential([
    Embedding(input_dim=embedding_matrix.shape[0],
              output_dim=embedding_matrix.shape[1],
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(len(set(y_train)), activation='softmax')
])

model_FastText_Dense.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)



In [190]:
Hitsory = model_FastText_Dense.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=32,
    verbose=1
)

Epoch 1/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.2549 - loss: 1.6279 - val_accuracy: 0.3593 - val_loss: 1.5020
Epoch 2/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.5070 - loss: 1.2413 - val_accuracy: 0.3428 - val_loss: 1.4990
Epoch 3/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.6499 - loss: 0.9250 - val_accuracy: 0.3952 - val_loss: 1.5835
Epoch 4/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.7587 - loss: 0.6828 - val_accuracy: 0.3743 - val_loss: 1.6385
Epoch 5/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.8311 - loss: 0.5075 - val_accuracy: 0.3967 - val_loss: 1.8041
Epoch 6/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9037 - loss: 0.3271 - val_accuracy: 0.4027 - val_loss: 1.8937
Epoch 7/10
[1m49/49[0m [32m━━━━

In [191]:
# --- Prédiction ---
y_pred_probs = model_FastText_Dense.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

df_results = pd.concat([
    df_results,
    pd.DataFrame([{"Model": "FastText + Dense --> Nouveau", "Accuracy": acc}])
], ignore_index=True)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Accuracy: 0.4161676646706587

Classification Report:
                precision    recall  f1-score   support

     business       0.48      0.48      0.48       165
entertainment       0.31      0.33      0.32       118
     politics       0.35      0.22      0.27       120
        sport       0.44      0.59      0.50       140
         tech       0.44      0.40      0.42       125

     accuracy                           0.42       668
    macro avg       0.40      0.40      0.40       668
 weighted avg       0.41      0.42      0.41       668



### Reports - Final Result

In [192]:
df_results = df_results.sort_values(by="Accuracy", ascending=False).reset_index(drop=True)

In [193]:
df_results.set_index("Model").head(len(df_results))

Unnamed: 0_level_0,Accuracy
Model,Unnamed: 1_level_1
TF-IDF + Linear SVC,0.976048
TF-IDF + Logistic Regression,0.976048
TF-IDF + MLP(NN),0.976048
TF-IDF + Dense model (Keras fully connected NN,0.976048
Count Vectorizer + Dense model (Keras fully connected NN,0.973054
Count Vectorizer + Logistic Regression,0.97006
Word2Vec + BiLSTM,0.76497
Word2Vec + Dense --> Nouveau,0.621257
FastText + Dense --> Nouveau,0.416168


# Final Conclusion
### 1. Results

| Model                                                    |   Accuracy |
|:---------------------------------------------------------|-----------:|
| TF-IDF + Linear SVC                                      |   0.976048 |
| TF-IDF + Logistic Regression                             |   0.976048 |
| TF-IDF + MLP(NN)                                         |   0.976048 |
| TF-IDF + Dense model (Keras fully connected NN           |   0.976048 |
| Count Vectorizer + Dense model (Keras fully connected NN |   0.973054 |
| Count Vectorizer + Logistic Regression                   |   0.97006  |
| Word2Vec + BiLSTM                                        |   0.76497  |
| Word2Vec + Dense --> Nouveau                             |   0.621257 |
| FastText + Dense --> Nouveau                             |   0.416168 |


### 2. Conclusion

Based on the results obtained, TF-IDF proves to be the most effective representation for this news classification task, with all TF-IDF-based models achieving an accuracy of 0.976. Count Vectorizer with a Dense Keras NN also performs well (0.973) and can be considered a viable alternative.

In contrast, Word2Vec embeddings perform poorly, both with BiLSTM (0.765) and Dense NN (0.621), indicating that simple Word2Vec embeddings trained on this dataset do not capture sufficient semantic information. Pre-trained FastText embeddings also underperform (0.416), suggesting that without proper fine-tuning or alignment with the dataset, pre-trained embeddings may not provide meaningful advantages.

These results highlight that TF-IDF and traditional vectorizers remain the most reliable approach for this dataset. Future steps could focus on testing improved embeddings with fine-tuning, exploring hybrid approaches, or experimenting with larger datasets to determine if Word2Vec or FastText can be made competitive.