In [68]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import nltk
import re

In [69]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [70]:
df = pd.read_csv("bbc-news-data.csv", sep='\t')

In [71]:
df['total_content']=df['title'] + ' ' + df['content']
df = df.drop(columns=['filename','title','content'])
df.head()

Unnamed: 0,category,total_content
0,business,Ad sales boost Time Warner profit Quarterly p...
1,business,Dollar gains on Greenspan speech The dollar h...
2,business,Yukos unit buyer faces loan claim The owners ...
3,business,High fuel prices hit BA's profits British Air...
4,business,Pernod takeover talk lifts Domecq Shares in U...


In [72]:
df = df.map(lambda line : line.lower())
df.head()

Unnamed: 0,category,total_content
0,business,ad sales boost time warner profit quarterly p...
1,business,dollar gains on greenspan speech the dollar h...
2,business,yukos unit buyer faces loan claim the owners ...
3,business,high fuel prices hit ba's profits british air...
4,business,pernod takeover talk lifts domecq shares in u...


Tokenization

In [73]:
# Listes pour stocker les tokens et les variétés
content_token = []
category_token = []

# Parcours des descriptions
for i, text_description in enumerate(tqdm(df["total_content"], desc="Tokenizing")):
    words_token = word_tokenize(text_description)
    content_token.append(words_token)
    category_token.append(df.loc[i, "category"])  # récupère la variété correspondante

# Création du DataFrame final
df_tokenise = pd.DataFrame({
    "total_content": content_token,
    "category": category_token
})

Tokenizing: 100%|██████████| 2225/2225 [00:03<00:00, 617.29it/s]


In [74]:
import re
import string

def clean_tokens(tokens):
    cleaned = [re.sub(f"[{string.punctuation}0-9]", "", w) for w in tokens]
    # Supprimer les tokens vides après nettoyage
    return [w for w in cleaned if w != ""]

# Appliquer sur la colonne
df_tokenise['total_content_clean'] = df_tokenise['total_content'].apply(clean_tokens)

In [75]:
df_tokenise.head()

Unnamed: 0,total_content,category,total_content_clean
0,"[ad, sales, boost, time, warner, profit, quart...",business,"[ad, sales, boost, time, warner, profit, quart..."
1,"[dollar, gains, on, greenspan, speech, the, do...",business,"[dollar, gains, on, greenspan, speech, the, do..."
2,"[yukos, unit, buyer, faces, loan, claim, the, ...",business,"[yukos, unit, buyer, faces, loan, claim, the, ..."
3,"[high, fuel, prices, hit, ba, 's, profits, bri...",business,"[high, fuel, prices, hit, ba, s, profits, brit..."
4,"[pernod, takeover, talk, lifts, domecq, shares...",business,"[pernod, takeover, talk, lifts, domecq, shares..."


In [76]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

varieties_lemmat = []
description_lemmat = []

for i, list_token in enumerate(tqdm(df_tokenise["total_content"], desc="Lemmatizating")):    
    # loop for stemming each word
    description_lemmat.append([lemmatizer.lemmatize(word) for word in list_token]) 
    
    varieties_lemmat.append(df_tokenise.loc[i, "category"])  # variety
    
# Création du DataFrame final
df_lemmat = pd.DataFrame({
    "total_content": description_lemmat,
    "category": varieties_lemmat
})

Lemmatizating: 100%|██████████| 2225/2225 [00:02<00:00, 770.04it/s]


In [77]:
df_lemmat.head()

Unnamed: 0,total_content,category
0,"[ad, sale, boost, time, warner, profit, quarte...",business
1,"[dollar, gain, on, greenspan, speech, the, dol...",business
2,"[yukos, unit, buyer, face, loan, claim, the, o...",business
3,"[high, fuel, price, hit, ba, 's, profit, briti...",business
4,"[pernod, takeover, talk, lift, domecq, share, ...",business


In [78]:
nltk.download('stopwords')
from nltk.corpus import stopwords

#List of stop words
stop_words = set(stopwords.words('english'))

#Delete stopwords
df_lemmat['total_content'] = df_lemmat['total_content'].apply(
    lambda x: [word for word in x if word not in stop_words]
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [79]:
df_lemmat.head()

Unnamed: 0,total_content,category
0,"[ad, sale, boost, time, warner, profit, quarte...",business
1,"[dollar, gain, greenspan, speech, dollar, ha, ...",business
2,"[yukos, unit, buyer, face, loan, claim, owner,...",business
3,"[high, fuel, price, hit, ba, 's, profit, briti...",business
4,"[pernod, takeover, talk, lift, domecq, share, ...",business


In [80]:
# Transformer les labels en entiers
le = LabelEncoder()
y = le.fit_transform(df['category'])

In [81]:
# Transforme en liste de chaine de caractere pour la vectorization 
df_lemmat["total_content"] = df_lemmat["total_content"].apply(
    lambda x : " ".join(x) 
)

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
df_lemmat["total_content"] = vectorizer.fit_transform(df_lemmat["total_content"])

TypeError: sparse array length is ambiguous; use getnnz() or shape[0]

In [None]:
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(df_lemmat["total_content"], y, test_size=0.3, random_state=42)

In [None]:
from sklearn.svm import SVC

svm_model = SVC(kernel='linear')  # kernel linéaire pour texte
svm_model.fit(X_train, y_train)

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [None]:
# --- Prédiction ---
y_pred = svm_model.predict(X_test)

# --- Évaluation ---
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

ValueError: X has 15536 features, but SVC is expecting 22830 features as input.