In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from collections import Counter
from sklearn.preprocessing import LabelEncoder

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\youji\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\youji\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\youji\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
df= pd.read_csv('bbc-text.csv')

# Data processing:

In [4]:
#Import stop words and punctuation from nltk and String
stop_words = set(nltk.corpus.stopwords.words('english'))
punctuation = set(string.punctuation)

#Function for clean text
def clean_text(text):
    tokens = word_tokenize(text) #tokenize text, e.g. ("hello world") to ("hello","world") 
    filtered_tokens = [] 
    for word in tokens:
        if word.lower() not in stop_words and word not in punctuation:
            filtered_tokens.append(word)
    result = ' '.join(filtered_tokens)#Rejoin tokens back to text
    return result

#Replace value in text with value in cleaned_text
df['cleaned_text'] = df['text'].apply(clean_text)
df_filtered = df.drop(columns=["text"])
df_filtered = df_filtered.rename(columns={"cleaned_text": "text"})
df_filtered['text'] = df_filtered['text'].str.lower()

# Feature engineering:

### Adjectives and verbs frequencies:

In [5]:
def get_pos_frequencies(text):
    words = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(words) #get verbs and adjective from nltk
    adj_count = 0
    verb_count = 0

    #count verbs and adjectives, JJ is adjective and VB is verb
    for word, pos in pos_tags:
        if pos.startswith('JJ'):
            adj_count += 1
        if pos.startswith('VB'): 
            verb_count += 1

    #calculate frequencies
    total_words = len(words)
    adj_freq = adj_count / total_words if total_words > 0 else 0
    verb_freq = verb_count / total_words if total_words > 0 else 0

    return adj_freq, verb_freq

In [6]:
#Apply previous fequency function
adj_frequencies = []
verb_frequencies = []
for text in df_filtered['text']:
    adj_freq, verb_freq = get_pos_frequencies(text)
    adj_frequencies.append(adj_freq)
    verb_frequencies.append(verb_freq)

#Create adj_frequency and verb_frequency feature for df_filtered
df_filtered['adj_frequency'] = adj_frequencies
df_filtered['verb_frequency'] = verb_frequencies

### Bigram (2 words phrases)

In [7]:
#Use countVecorizer to get ngrams and get the top 10 features, (2,2) means only 2 words phrases counts(uper/lower limit 2)
vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=10)
bigram_matrix = vectorizer.fit_transform(df_filtered['text'])

#Place 2gram features back to oringinal dataset
bigram_df = pd.DataFrame(bigram_matrix.toarray(), columns=vectorizer.get_feature_names_out())
df_filtered = pd.concat([df_filtered, bigram_df], axis=1)

### TF-IDF

In [8]:
#get 1000 top frequence features 
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(df_filtered['text'])

#Add result back to originial dataset
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
df_filtered = pd.concat([df_filtered.drop(columns=["text"]), tfidf_df], axis=1)

In [9]:
# Encode the category column
label_encoder = LabelEncoder()
df_filtered['category_encoded'] = label_encoder.fit_transform(df_filtered['category'])

### Feature select with Select K Best(chi2)

In [10]:
# Features (X) and Target (y)
X = df_filtered.drop(columns=['category', 'category_encoded'])  # Use TF-IDF features
y = df_filtered['category_encoded']

# Apply SelectKBest with Chi-Square
k_best = 100
selector = SelectKBest(score_func=chi2, k=k_best)
X_new = selector.fit_transform(X, y)

# Get selected feature names
selected_features = X.columns[selector.get_support()]
# Create a DataFrame with selected features
selected_features_df = pd.DataFrame(X_new, columns=selected_features)

# Combine with target for further modeling
final_df = pd.concat([selected_features_df, df_filtered['category_encoded']], axis=1)

# Model training

### Split train and test datasets

In [13]:
#Split 20% of data as test set
train_val, test = train_test_split(final_df, test_size=0.2, random_state=10)
X_test = test.drop(columns=["category_encoded"])
y_test = test["category_encoded"]

In [14]:
#Split 10% of data as validation set and 70% of data as train set
train, validation = train_test_split(train_val, test_size=0.1, random_state=10)
X_train = train.drop(columns=["category_encoded"])
y_train = train["category_encoded"]
X_val = validation.drop(columns=["category_encoded"])
y_val = validation["category_encoded"]

### train LinearSVC model

In [18]:
#initial Linear SVC model
svc_model = LinearSVC()

In [19]:
#Kfold cross-validation on the train dataset
kfold = KFold(n_splits=5, shuffle=True, random_state=10)
cv_scores = cross_val_score(svc_model, X_train, y_train, cv=kfold, scoring='accuracy')

In [20]:
#train dataset
svc_model.fit(X_train, y_train)

In [24]:
#Evaluate train dataset with result from K-fold.
y_val_pred = svc_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"K-Fold Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean()}")

K-Fold Cross-Validation Scores: [0.9376947 0.9376947 0.925     0.95625   0.95625  ]
Mean CV Accuracy: 0.9425778816199376


# Evaluation

In [22]:
#Predict dataset with test set
y_pred = svc_model.predict(X_test)

In [23]:
# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Get the class names from the label encoder and print them
class_names = label_encoder.classes_
encoded_names = {}
for i in range(len(class_names)):
    encoded_names[i] = class_names[i]
print("Encoded to category mapping:", encoded_names)

Accuracy: 0.9191011235955057
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.87      0.86       102
           1       1.00      0.96      0.98        70
           2       0.93      0.87      0.90        94
           3       0.90      0.98      0.94        96
           4       0.95      0.93      0.94        83

    accuracy                           0.92       445
   macro avg       0.93      0.92      0.92       445
weighted avg       0.92      0.92      0.92       445

Encoded to category mapping: {0: 'business', 1: 'entertainment', 2: 'politics', 3: 'sport', 4: 'tech'}
