# Import

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report

from nltk.tokenize import word_tokenize

from tqdm import tqdm
import nltk
import re
import string

from nltk.stem import WordNetLemmatizer

# Model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Vectorizer
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Bidirectional, LSTM
from tensorflow.keras.optimizers import Adam

In [51]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Dataframe

In [52]:
df = pd.read_csv("bbc-news-data.csv", sep='\t')

In [53]:
df['total_content']=df['title'] + ' ' + df['content']
df = df.drop(columns=['filename','title','content'])
df.head()

Unnamed: 0,category,total_content
0,business,Ad sales boost Time Warner profit Quarterly p...
1,business,Dollar gains on Greenspan speech The dollar h...
2,business,Yukos unit buyer faces loan claim The owners ...
3,business,High fuel prices hit BA's profits British Air...
4,business,Pernod takeover talk lifts Domecq Shares in U...


In [54]:
df = df.map(lambda line : line.lower())
df.head()

Unnamed: 0,category,total_content
0,business,ad sales boost time warner profit quarterly p...
1,business,dollar gains on greenspan speech the dollar h...
2,business,yukos unit buyer faces loan claim the owners ...
3,business,high fuel prices hit ba's profits british air...
4,business,pernod takeover talk lifts domecq shares in u...


# Tokenization

In [55]:
# Listes pour stocker les tokens et les variétés
content_token = []
category_token = []

# Parcours des descriptions
for i, text_description in enumerate(tqdm(df["total_content"], desc="Tokenizing")):
    words_token = word_tokenize(text_description)
    content_token.append(words_token)
    category_token.append(df.loc[i, "category"])  # récupère la variété correspondante

# Création du DataFrame final
df_tokenise = pd.DataFrame({
    "total_content": content_token,
    "category": category_token
})

Tokenizing: 100%|██████████| 2225/2225 [00:02<00:00, 764.54it/s]


In [None]:
def clean_tokens(tokens):
    cleaned = [re.sub(f"[{string.punctuation}0-9]", "", w) for w in tokens]
    # Supprimer les tokens vides après nettoyage
    return [w for w in cleaned if w != ""]

# Appliquer sur la colonne
df_tokenise['total_content_clean'] = df_tokenise['total_content'].apply(clean_tokens)

In [57]:
df_tokenise.head()

Unnamed: 0,total_content,category,total_content_clean
0,"[ad, sales, boost, time, warner, profit, quart...",business,"[ad, sales, boost, time, warner, profit, quart..."
1,"[dollar, gains, on, greenspan, speech, the, do...",business,"[dollar, gains, on, greenspan, speech, the, do..."
2,"[yukos, unit, buyer, faces, loan, claim, the, ...",business,"[yukos, unit, buyer, faces, loan, claim, the, ..."
3,"[high, fuel, prices, hit, ba, 's, profits, bri...",business,"[high, fuel, prices, hit, ba, s, profits, brit..."
4,"[pernod, takeover, talk, lifts, domecq, shares...",business,"[pernod, takeover, talk, lifts, domecq, shares..."


# Lemmatization

In [None]:
lemmatizer = WordNetLemmatizer()

varieties_lemmat = []
description_lemmat = []

for i, list_token in enumerate(tqdm(df_tokenise["total_content"], desc="Lemmatizating")):    
    # loop for stemming each word
    description_lemmat.append([lemmatizer.lemmatize(word) for word in list_token]) 
    
    varieties_lemmat.append(df_tokenise.loc[i, "category"])  # variety
    
# Création du DataFrame final
df_lemmat = pd.DataFrame({
    "total_content": description_lemmat,
    "category": varieties_lemmat
})

Lemmatizating: 100%|██████████| 2225/2225 [00:02<00:00, 1035.65it/s]


In [59]:
df_lemmat.head()

Unnamed: 0,total_content,category
0,"[ad, sale, boost, time, warner, profit, quarte...",business
1,"[dollar, gain, on, greenspan, speech, the, dol...",business
2,"[yukos, unit, buyer, face, loan, claim, the, o...",business
3,"[high, fuel, price, hit, ba, 's, profit, briti...",business
4,"[pernod, takeover, talk, lift, domecq, share, ...",business


In [60]:
nltk.download('stopwords')
from nltk.corpus import stopwords

#List of stop words
stop_words = set(stopwords.words('english'))

#Delete stopwords
df_lemmat['total_content'] = df_lemmat['total_content'].apply(
    lambda x: [word for word in x if word not in stop_words]
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [61]:
df_lemmat.head()

Unnamed: 0,total_content,category
0,"[ad, sale, boost, time, warner, profit, quarte...",business
1,"[dollar, gain, greenspan, speech, dollar, ha, ...",business
2,"[yukos, unit, buyer, face, loan, claim, owner,...",business
3,"[high, fuel, price, hit, ba, 's, profit, briti...",business
4,"[pernod, takeover, talk, lift, domecq, share, ...",business


In [None]:
# Transformer les labels en entiers
le = LabelEncoder()
y = le.fit_transform(df['category'])
num_classes = len(np.unique(y))

In [63]:
# Transforme en liste de chaine de caractere pour la vectorization 
df_lemmat["total_content"] = df_lemmat["total_content"].apply(
    lambda x : " ".join(x) 
)

In [64]:
liste_combi = []
list_acc = []

# TFIDF 

In [None]:
vectorizer = TfidfVectorizer()
df_vector = vectorizer.fit_transform(df_lemmat["total_content"])

In [66]:
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(df_vector, y, test_size=0.3, random_state=42)

### LinearSVC

In [None]:
model_SVM = SVC(kernel='linear')  # kernel linéaire pour texte
model_SVM.fit(X_train, y_train)

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [68]:
# --- Prédiction ---
y_pred = model_SVM.predict(X_test)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("TF-IDF + Linear SVC")
list_acc.append(acc)

Accuracy: 0.9760479041916168

Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.96      0.97       165
entertainment       0.98      0.99      0.99       118
     politics       0.97      0.96      0.96       120
        sport       0.98      0.99      0.99       140
         tech       0.98      0.98      0.98       125

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668



### Logistic Regression

In [None]:
model_LogisticRegression = LogisticRegression(
    solver='lbfgs', 
    max_iter=1000, 
    random_state=42
)
model_LogisticRegression.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [70]:
# --- Prédiction ---
y_pred = model_SVM.predict(X_test)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("TF-IDF + Logistic Regression")
list_acc.append(acc)

Accuracy: 0.9760479041916168

Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.96      0.97       165
entertainment       0.98      0.99      0.99       118
     politics       0.97      0.96      0.96       120
        sport       0.98      0.99      0.99       140
         tech       0.98      0.98      0.98       125

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668



### MLP (NN)

In [None]:
model_MLP = MLPClassifier(
    hidden_layer_sizes=(100,),  
    activation='logistic',
    solver='adam',
    max_iter=300,
    random_state=42
)
model_MLP.fit(X_train, y_train)

0,1,2
,hidden_layer_sizes,"(100,)"
,activation,'logistic'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,300
,shuffle,True


In [72]:
# --- Prédiction ---
y_pred = model_SVM.predict(X_test)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("TF-IDF + MLP(NN)")
list_acc.append(acc)

Accuracy: 0.9760479041916168

Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.96      0.97       165
entertainment       0.98      0.99      0.99       118
     politics       0.97      0.96      0.96       120
        sport       0.98      0.99      0.99       140
         tech       0.98      0.98      0.98       125

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668



### Dense model (Keras fully connected NN)

In [None]:
model_Dense = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(len(set(y_train)), activation='softmax')  # softmax pour classification multi-classes
])

model_Dense.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model_Dense.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)


In [None]:
# --- Prédiction ---
y_pred_probs = model_Dense.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("TF-IDF + Dense model (Keras fully connected NN")
list_acc.append(acc)

Accuracy: 0.9760479041916168

Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.96      0.97       165
entertainment       0.98      0.98      0.98       118
     politics       0.97      0.96      0.96       120
        sport       0.99      0.99      0.99       140
         tech       0.97      0.98      0.98       125

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668



# Count Vectorizer

In [None]:
# Vectorisation (sac de mots)
vectorizer = CountVectorizer()
df_vector = vectorizer.fit_transform(df_lemmat["total_content"])

In [74]:
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(df_vector, y, test_size=0.3, random_state=42)

### Logistic Regression

In [None]:
# Modèle
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [76]:
# --- Prédiction ---
y_pred = lr_model.predict(X_test)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("Count Vectorizer + Logistic Regression")
list_acc.append(acc)

Accuracy: 0.9700598802395209

Classification Report:
                precision    recall  f1-score   support

     business       0.96      0.95      0.95       165
entertainment       0.96      0.99      0.97       118
     politics       0.96      0.96      0.96       120
        sport       0.97      1.00      0.99       140
         tech       1.00      0.96      0.98       125

     accuracy                           0.97       668
    macro avg       0.97      0.97      0.97       668
 weighted avg       0.97      0.97      0.97       668



### Dense model (Keras fully connected NN)

# Word2Vec + BiLSTM

### Word2Vec

In [None]:
# Préparer les documents tokenisés (liste de mots par doc)
documents = [doc.split() for doc in df_lemmat["total_content"]]

In [None]:
w2v_model = Word2Vec(
    sentences=documents,
    vector_size=100,   # dimension des vecteurs
    window=5,
    min_count=1,
    workers=4,
    seed=42
)

In [None]:
# Créer un dictionnaire des mots 
word_index = {word: i+1 for i, word in enumerate(w2v_model.wv.index_to_key)}

In [None]:
# Construire la matrice d'embeddings
embedding_matrix = np.zeros((len(word_index)+1, w2v_model.vector_size))
for word, i in word_index.items():
    embedding_matrix[i] = w2v_model.wv[word]

In [None]:
# Transformer documents en séquences d'indices
sequences = [[word_index.get(w, 0) for w in doc] for doc in documents]

In [None]:
# Padding pour que toutes les séquences aient la même longueur
max_len = 100  # longueur max des séquences
X_seq = pad_sequences(sequences, maxlen=max_len)

In [None]:
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_seq, y, test_size=0.3, random_state=42
)

### BiLSTM

In [None]:
model_BiLSTM = Sequential([
    Embedding(input_dim=len(word_index)+1, 
              output_dim=w2v_model.vector_size,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),  # embeddings gelés
    Bidirectional(LSTM(64)),
    Dense(num_classes, activation='softmax')  # classification multiclass
])

model_BiLSTM.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',  # labels entiers
    metrics=['accuracy']
)

# Entraînement
model_BiLSTM.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=10, batch_size=32, verbose=1)

In [None]:
# --- Prédiction ---
y_pred_probs = model_BiLSTM.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# --- Évaluation ---
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

liste_combi.append("Word2Vec + BiLSTM")
list_acc.append(acc)

# Résultat

In [None]:
df_results = pd.DataFrame({
    "Model": liste_combi,
    "Accuracy": list_acc
})

In [None]:
df_results.head(len(df_results))