# Import

In [99]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report

from nltk.tokenize import word_tokenize

from tqdm import tqdm
import nltk
import re
import string

from nltk.stem import WordNetLemmatizer

# Model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Vectorizer
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Bidirectional, LSTM
from tensorflow.keras.optimizers import Adam

In [100]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Dataframe

In [101]:
df = pd.read_csv("bbc-news-data.csv", sep='\t')

In [102]:
df['total_content']=df['title'] + ' ' + df['content']
df = df.drop(columns=['filename','title','content'])
df.head()

Unnamed: 0,category,total_content
0,business,Ad sales boost Time Warner profit Quarterly p...
1,business,Dollar gains on Greenspan speech The dollar h...
2,business,Yukos unit buyer faces loan claim The owners ...
3,business,High fuel prices hit BA's profits British Air...
4,business,Pernod takeover talk lifts Domecq Shares in U...


In [103]:
df = df.map(lambda line : line.lower())
df.head()

Unnamed: 0,category,total_content
0,business,ad sales boost time warner profit quarterly p...
1,business,dollar gains on greenspan speech the dollar h...
2,business,yukos unit buyer faces loan claim the owners ...
3,business,high fuel prices hit ba's profits british air...
4,business,pernod takeover talk lifts domecq shares in u...


# Tokenization

In [104]:
# Listes pour stocker les tokens et les variétés
content_token = []
category_token = []

# Parcours des descriptions
for i, text_description in enumerate(tqdm(df["total_content"], desc="Tokenizing")):
    words_token = word_tokenize(text_description)
    content_token.append(words_token)
    category_token.append(df.loc[i, "category"])  # récupère la variété correspondante

# Création du DataFrame final
df_tokenise = pd.DataFrame({
    "total_content": content_token,
    "category": category_token
})

Tokenizing: 100%|██████████| 2225/2225 [00:06<00:00, 365.61it/s]


In [105]:
def clean_tokens(tokens):
    cleaned = [re.sub(f"[{string.punctuation}0-9]", "", w) for w in tokens]
    # Supprimer les tokens vides après nettoyage
    return [w for w in cleaned if w != ""]

# Appliquer sur la colonne
df_tokenise['total_content_clean'] = df_tokenise['total_content'].apply(clean_tokens)

In [106]:
df_tokenise.head()

Unnamed: 0,total_content,category,total_content_clean
0,"[ad, sales, boost, time, warner, profit, quart...",business,"[ad, sales, boost, time, warner, profit, quart..."
1,"[dollar, gains, on, greenspan, speech, the, do...",business,"[dollar, gains, on, greenspan, speech, the, do..."
2,"[yukos, unit, buyer, faces, loan, claim, the, ...",business,"[yukos, unit, buyer, faces, loan, claim, the, ..."
3,"[high, fuel, prices, hit, ba, 's, profits, bri...",business,"[high, fuel, prices, hit, ba, s, profits, brit..."
4,"[pernod, takeover, talk, lifts, domecq, shares...",business,"[pernod, takeover, talk, lifts, domecq, shares..."


# Lemmatization

In [107]:
lemmatizer = WordNetLemmatizer()

varieties_lemmat = []
description_lemmat = []

for i, list_token in enumerate(tqdm(df_tokenise["total_content"], desc="Lemmatizating")):
    # loop for stemming each word
    description_lemmat.append([lemmatizer.lemmatize(word) for word in list_token])

    varieties_lemmat.append(df_tokenise.loc[i, "category"])  # variety

# Création du DataFrame final
df_lemmat = pd.DataFrame({
    "total_content": description_lemmat,
    "category": varieties_lemmat
})

Lemmatizating: 100%|██████████| 2225/2225 [00:04<00:00, 516.46it/s]


In [108]:
df_lemmat.head()

Unnamed: 0,total_content,category
0,"[ad, sale, boost, time, warner, profit, quarte...",business
1,"[dollar, gain, on, greenspan, speech, the, dol...",business
2,"[yukos, unit, buyer, face, loan, claim, the, o...",business
3,"[high, fuel, price, hit, ba, 's, profit, briti...",business
4,"[pernod, takeover, talk, lift, domecq, share, ...",business


In [109]:
nltk.download('stopwords')
from nltk.corpus import stopwords

#List of stop words
stop_words = set(stopwords.words('english'))

#Delete stopwords
df_lemmat['total_content'] = df_lemmat['total_content'].apply(
    lambda x: [word for word in x if word not in stop_words]
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [110]:
df_lemmat.head()

Unnamed: 0,total_content,category
0,"[ad, sale, boost, time, warner, profit, quarte...",business
1,"[dollar, gain, greenspan, speech, dollar, ha, ...",business
2,"[yukos, unit, buyer, face, loan, claim, owner,...",business
3,"[high, fuel, price, hit, ba, 's, profit, briti...",business
4,"[pernod, takeover, talk, lift, domecq, share, ...",business


In [111]:
# Transformer les labels en entiers
le = LabelEncoder()
y = le.fit_transform(df['category'])
num_classes = len(np.unique(y))

In [112]:
# Transforme en liste de chaine de caractere pour la vectorization
df_lemmat["total_content"] = df_lemmat["total_content"].apply(
    lambda x : " ".join(x)
)

In [113]:
liste_combi = []
list_acc = []

# TFIDF

In [114]:
vectorizer = TfidfVectorizer()
df_vector = vectorizer.fit_transform(df_lemmat["total_content"])

In [115]:
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(df_vector, y, test_size=0.3, random_state=42)

### LinearSVC

In [116]:
model_SVM = SVC(kernel='linear')  # kernel linéaire pour texte
model_SVM.fit(X_train, y_train)

In [117]:
# --- Prédiction ---
y_pred = model_SVM.predict(X_test)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("TF-IDF + Linear SVC")
list_acc.append(acc)

Accuracy: 0.9760479041916168

Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.96      0.97       165
entertainment       0.98      0.99      0.99       118
     politics       0.97      0.96      0.96       120
        sport       0.98      0.99      0.99       140
         tech       0.98      0.98      0.98       125

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668



### Logistic Regression

In [118]:
model_LogisticRegression = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)
model_LogisticRegression.fit(X_train, y_train)


In [119]:
# --- Prédiction ---
y_pred = model_SVM.predict(X_test)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("TF-IDF + Logistic Regression")
list_acc.append(acc)

Accuracy: 0.9760479041916168

Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.96      0.97       165
entertainment       0.98      0.99      0.99       118
     politics       0.97      0.96      0.96       120
        sport       0.98      0.99      0.99       140
         tech       0.98      0.98      0.98       125

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668



### MLP (NN)

In [120]:
model_MLP = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='logistic',
    solver='adam',
    max_iter=300,
    random_state=42
)
model_MLP.fit(X_train, y_train)

In [121]:
# --- Prédiction ---
y_pred = model_SVM.predict(X_test)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("TF-IDF + MLP(NN)")
list_acc.append(acc)

Accuracy: 0.9760479041916168

Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.96      0.97       165
entertainment       0.98      0.99      0.99       118
     politics       0.97      0.96      0.96       120
        sport       0.98      0.99      0.99       140
         tech       0.98      0.98      0.98       125

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668



### Dense model (Keras fully connected NN)

In [122]:
model_Dense = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(len(set(y_train)), activation='softmax')  # softmax pour classification multi-classes
])

model_Dense.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model_Dense.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 45ms/step - accuracy: 0.5764 - loss: 1.4821
Epoch 2/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 47ms/step - accuracy: 0.9935 - loss: 0.3438
Epoch 3/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 51ms/step - accuracy: 0.9982 - loss: 0.0353
Epoch 4/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 45ms/step - accuracy: 1.0000 - loss: 0.0121
Epoch 5/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 45ms/step - accuracy: 1.0000 - loss: 0.0061
Epoch 6/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - accuracy: 1.0000 - loss: 0.0037
Epoch 7/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 49ms/step - accuracy: 1.0000 - loss: 0.0025
Epoch 8/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 59ms/step - accuracy: 1.0000 - loss: 0.0021
Epoch 9/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x79e93434a600>

In [123]:
# --- Prédiction ---
y_pred_probs = model_Dense.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("TF-IDF + Dense model (Keras fully connected NN")
list_acc.append(acc)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Accuracy: 0.9775449101796407

Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.95      0.97       165
entertainment       0.97      0.98      0.98       118
     politics       0.97      0.97      0.97       120
        sport       0.99      0.99      0.99       140
         tech       0.96      0.99      0.98       125

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668



# Count Vectorizer

In [124]:
# Vectorisation (sac de mots)
vectorizer = CountVectorizer()
df_vector = vectorizer.fit_transform(df_lemmat["total_content"])

In [125]:
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(df_vector, y, test_size=0.3, random_state=42)

### Logistic Regression

In [126]:
# Modèle
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

In [127]:
# --- Prédiction ---
y_pred = lr_model.predict(X_test)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("Count Vectorizer + Logistic Regression")
list_acc.append(acc)

Accuracy: 0.9700598802395209

Classification Report:
                precision    recall  f1-score   support

     business       0.96      0.95      0.95       165
entertainment       0.96      0.99      0.97       118
     politics       0.96      0.96      0.96       120
        sport       0.97      1.00      0.99       140
         tech       1.00      0.96      0.98       125

     accuracy                           0.97       668
    macro avg       0.97      0.97      0.97       668
 weighted avg       0.97      0.97      0.97       668



### Dense model (Keras fully connected NN)

In [128]:
model_Dense = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(len(set(y_train)), activation='softmax')  # softmax pour classification multi-classes
])

model_Dense.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Convertir en dense
X_train_dense = X_train.toarray()

model_Dense.fit(X_train_dense, y_train, epochs=10, batch_size=32, verbose=1)



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - accuracy: 0.7122 - loss: 0.9354
Epoch 2/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.9958 - loss: 0.0252
Epoch 3/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 50ms/step - accuracy: 1.0000 - loss: 0.0034
Epoch 4/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 1.0000 - loss: 0.0026
Epoch 5/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 67ms/step - accuracy: 1.0000 - loss: 7.0830e-04
Epoch 6/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 50ms/step - accuracy: 1.0000 - loss: 4.8761e-04
Epoch 7/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 1.0000 - loss: 5.3832e-04
Epoch 8/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 1.0000 - loss: 3.1521e-04
Epoch 9/10
[1m49/49[0m [32m━━

<keras.src.callbacks.history.History at 0x79e942196480>

In [129]:
# --- Prédiction ---
X_test_dense = X_test.toarray()

y_pred_probs = model_Dense.predict(X_test_dense)
y_pred = np.argmax(y_pred_probs, axis=1)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("Count Vectorizer + Dense model (Keras fully connected NN")
list_acc.append(acc)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Accuracy: 0.9790419161676647

Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.95      0.97       165
entertainment       0.98      1.00      0.99       118
     politics       0.96      0.97      0.97       120
        sport       0.99      0.99      0.99       140
         tech       0.98      0.98      0.98       125

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668



# Word2Vec + BiLSTM

### Word2Vec

In [130]:
# Préparer les documents tokenisés (liste de mots par doc)
documents = [doc.split() for doc in df_lemmat["total_content"]]

In [131]:
w2v_model = Word2Vec(
    sentences=documents,
    vector_size=100,   # dimension des vecteurs
    window=5,
    min_count=1,
    workers=4,
    seed=42
)

In [132]:
# Créer un dictionnaire des mots
word_index = {word: i+1 for i, word in enumerate(w2v_model.wv.index_to_key)}

In [133]:
# Construire la matrice d'embeddings
embedding_matrix = np.zeros((len(word_index)+1, w2v_model.vector_size))
for word, i in word_index.items():
    embedding_matrix[i] = w2v_model.wv[word]

In [134]:
# Transformer documents en séquences d'indices
sequences = [[word_index.get(w, 0) for w in doc] for doc in documents]

In [135]:
# Padding pour que toutes les séquences aient la même longueur
max_len = 100  # longueur max des séquences
X_seq = pad_sequences(sequences, maxlen=max_len)

In [136]:
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_seq, y, test_size=0.3, random_state=42
)

### BiLSTM

In [137]:
model_BiLSTM = Sequential([
    Embedding(input_dim=len(word_index)+1,
              output_dim=w2v_model.vector_size,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),  # embeddings gelés
    Bidirectional(LSTM(64)),
    Dense(num_classes, activation='softmax')  # classification multiclass
])

model_BiLSTM.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',  # labels entiers
    metrics=['accuracy']
)

# Entraînement
model_BiLSTM.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=10, batch_size=32, verbose=1)

Epoch 1/10




[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 111ms/step - accuracy: 0.3193 - loss: 1.5092 - val_accuracy: 0.5569 - val_loss: 1.1360
Epoch 2/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 127ms/step - accuracy: 0.5722 - loss: 1.1082 - val_accuracy: 0.6063 - val_loss: 1.0192
Epoch 3/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 100ms/step - accuracy: 0.6349 - loss: 0.9462 - val_accuracy: 0.5883 - val_loss: 1.0418
Epoch 4/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 102ms/step - accuracy: 0.6652 - loss: 0.9156 - val_accuracy: 0.6692 - val_loss: 0.8100
Epoch 5/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 130ms/step - accuracy: 0.7027 - loss: 0.7753 - val_accuracy: 0.6946 - val_loss: 0.8084
Epoch 6/10
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 139ms/step - accuracy: 0.7256 - loss: 0.7567 - val_accuracy: 0.6826 - val_loss: 0.8135
Epoch 7/10
[1m49/49[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x79e932ada6c0>

In [138]:
# --- Prédiction ---
y_pred_probs = model_BiLSTM.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("Word2Vec + BiLSTM")
list_acc.append(acc)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step
Accuracy: 0.6751497005988024

Classification Report:
                precision    recall  f1-score   support

     business       0.84      0.71      0.77       165
entertainment       0.61      0.31      0.41       118
     politics       0.62      0.86      0.72       120
        sport       0.56      0.94      0.70       140
         tech       0.91      0.51      0.66       125

     accuracy                           0.68       668
    macro avg       0.71      0.66      0.65       668
 weighted avg       0.72      0.68      0.66       668



# Résultat

In [139]:
df_results = pd.DataFrame({
    "Model": liste_combi,
    "Accuracy": list_acc
})

df_results = df_results.sort_values(by="Accuracy", ascending=False).reset_index(drop=True).set_index("Model")

In [140]:
df_results.head(len(df_results))

Unnamed: 0_level_0,Accuracy
Model,Unnamed: 1_level_1
Count Vectorizer + Dense model (Keras fully connected NN,0.979042
TF-IDF + Dense model (Keras fully connected NN,0.977545
TF-IDF + Linear SVC,0.976048
TF-IDF + MLP(NN),0.976048
TF-IDF + Logistic Regression,0.976048
Count Vectorizer + Logistic Regression,0.97006
Word2Vec + BiLSTM,0.67515


# Reports – Results of the Combinations

### 1. Objective:

The objective of this lab was to test different combinations of vectorizers and classification models to predict news categories. We systematically tested TF-IDF, CountVectorizer, and Word2Vec as text representations, combined with several models: Logistic Regression, Linear SVM, scikit-learn MLP (NN), Dense Keras NN, and BiLSTM Keras.

### 2. Results :

| Model                                         | Accuracy |
|-----------------------------------------------|----------|
| Count Vectorizer + Dense model (Keras fully connected NN) | 0.979042 |
| TF-IDF + Dense model (Keras fully connected NN) | 0.977545 |
| TF-IDF + Linear SVC                            | 0.976048 |
| TF-IDF + MLP (NN)                              | 0.976048 |
| TF-IDF + Logistic Regression                   | 0.976048 |
| Count Vectorizer + Logistic Regression        | 0.970060 |
| Word2Vec + BiLSTM                              | 0.675150 |

### 3. Analysis :
The results show that Count Vectorizer combined with a Dense Keras NN now achieves the highest accuracy at 0.9790, slightly outperforming TF-IDF with Dense Keras at 0.9775. This suggests that when a dense model is used, Count Vectorizer can even surpass TF-IDF, likely because the dense network can capture nonlinear relationships in raw count vectors.

Linear models and the scikit-learn MLP with TF-IDF remain very competitive around 0.9760, confirming the effectiveness of TF-IDF for news classification even with simpler models. In contrast, Word2Vec with a BiLSTM performs significantly worse (0.6751), suggesting that either the BiLSTM has not been fully optimized or that the Word2Vec embeddings do not sufficiently capture the relevant text structure for this dataset.

Adding dense layers in a Keras Dense NN seems to provide a notable advantage for traditional vectorizers (TF-IDF and Count Vectorizer), whereas Word2Vec would require improvements such as fine-tuning embeddings, using pre-trained embeddings, or optimizing BiLSTM hyperparameters.

### 4. Preliminary Conclusion :
For this dataset and the tested configurations, Count Vectorizer combined with a Dense Keras NN currently offers the best performance with an accuracy of 0.9790, followed closely by TF-IDF with Dense Keras. Linear models remain effective but slightly behind, while Word2Vec with BiLSTM does not reach competitive performance in the current setup. Next steps could include testing pre-trained embeddings, tuning BiLSTM hyperparameters, exploring other neural network architectures or vector combinations, and evaluating the effect of data augmentation and regularization on final performance.