In [None]:
# Colab-specific: mount Google Drive so the dataset CSV read in a later cell
# is reachable under the /content/drive mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Set the path of the dataset in the cell below. The dataset is provided in the zip folder. If the dataset is stored in Google Drive, mount the drive first using the code cell above.**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import hstack

# Read the BBC news CSV from the mounted Drive folder (file is ISO-8859-1 encoded)
data = pd.read_csv(
    "/content/drive/MyDrive/bbc-text.csv",
    encoding="ISO-8859-1",
)

# Preview the first five rows (columns: category, text)
print(data.head(n=5))


        category                                               text
0           tech  tv future in the hands of viewers with home th...
1       business  worldcom boss  left books alone  former worldc...
2          sport  tigers wary of farrell  gamble  leicester say ...
3          sport  yeading face newcastle in fa cup premiership s...
4  entertainment  ocean s twelve raids box office ocean s twelve...


In [None]:
# Pull out the two columns of interest: the raw article text and its category label
articles, labels = data['text'], data['category']



In [None]:
import gensim
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import numpy as np

# Download the NLTK resources needed below: the punkt tokenizer data
# (both 'punkt' and 'punkt_tab') and the stop-word lists.
# The return value of the last call is what the cell displays as its output.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# English stop words, held in a set so the membership test in preprocess() is fast
stop_words = set(stopwords.words('english'))
# NOTE(review): `punctuations` is not referenced anywhere else in the visible notebook;
# punctuation is actually removed by the str.isalpha() filter inside preprocess().
punctuations = string.punctuation

def preprocess(text):
    """Turn a raw document string into a list of cleaned, lower-case tokens.

    The text is lower-cased and split with NLTK's ``word_tokenize``. A token is
    kept only if it is purely alphabetic (``str.isalpha()``, which discards
    punctuation, numbers and mixed alphanumeric tokens) and is not in the
    module-level ``stop_words`` set.

    Args:
        text: The document to tokenize (a string).

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip anything that is not made up of letters only
        if not token.isalpha():
            continue
        # Skip English stop words
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Run preprocess() over every article and keep the token lists in a new 'tokens' column
data['tokens'] = data['text'].map(preprocess)

# Preview the token lists of the first five articles
print(data['tokens'].head(n=5))


0    [tv, future, hands, viewers, home, theatre, sy...
1    [worldcom, boss, left, books, alone, former, w...
2    [tigers, wary, farrell, gamble, leicester, say...
3    [yeading, face, newcastle, fa, cup, premiershi...
4    [ocean, twelve, raids, box, office, ocean, twe...
Name: tokens, dtype: object


In [None]:
# Word2Vec hyper-parameters
vector_size = 100  # dimensionality of each word embedding
window = 5         # number of context words considered on each side of the target word
min_count = 2      # words that appear fewer than 2 times in the corpus are ignored
workers = 4        # number of worker threads used for training

# Create the model WITHOUT passing `sentences`. When a corpus is handed to the
# constructor, gensim already builds the vocabulary and trains the model, so the
# explicit build_vocab()/train() calls below would repeat all of that work.
w2v_model = Word2Vec(vector_size=vector_size,
                     window=window,
                     min_count=min_count,
                     workers=workers)

# Build the vocabulary from the tokenized articles, then train exactly once for
# the model's configured number of epochs (gensim's default is 5).
w2v_model.build_vocab(data['tokens'])
w2v_model.train(data['tokens'], total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)




(2243054, 2343295)

In [None]:
# TF-IDF features over the raw article text; max_features=1000 caps the
# vocabulary size to keep the dimensionality down.
# NOTE(review): the vectorizer is fitted on the full corpus, i.e. before the
# train/dev/test split performed later in the notebook — confirm this is intended.
vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(articles)

In [None]:
# Bigram TF-IDF features (ngram_range=(2, 2)), capped at 1000 features
ngram_vectorizer = TfidfVectorizer(ngram_range=(2, 2), max_features=1000)
# Fit and transform with the bigram vectorizer itself. Calling the unigram
# `vectorizer` here would only reproduce X_tfidf and leave ngram_vectorizer unused.
X_ngrams = ngram_vectorizer.fit_transform(data['text'])


In [None]:
# Below function is used to make vector of tokens
def get_document_vector(tokens, model):
    """Represent a document as the average of its known word vectors.

    Args:
        tokens: Iterable of word tokens making up the document.
        model: Object exposing ``vector_size`` and a ``wv`` mapping that
            supports ``word in model.wv`` and ``model.wv[word]``
            (e.g. a trained Word2Vec model).

    Returns:
        numpy.ndarray: Array of length ``model.vector_size`` holding the mean
        of the vectors of all tokens found in ``model.wv``. If no token is
        found, the array is all zeros.
    """
    embeddings = model.wv
    # Look up only the tokens the model has a vector for
    known = [embeddings[token] for token in tokens if token in embeddings]

    total = np.zeros(model.vector_size)
    for vec in known:
        total += vec
    # Only divide when at least one token was found, to avoid dividing by zero
    if known:
        total /= len(known)
    return total

# Embed every tokenized article as the mean of its Word2Vec word vectors
data['doc_vector'] = data['tokens'].apply(get_document_vector, args=(w2v_model,))

# Stack the per-document vectors row-wise into a single 2-D feature matrix
X = np.vstack(data['doc_vector'].tolist())
y = data['category']  # target labels come from the 'category' column

# Report the (n_documents, vector_size) shape of the feature matrix
print("Feature matrix shape:", X.shape)


Feature matrix shape: (2225, 100)


In [None]:
from scipy.sparse import csr_matrix

# Wrap the dense Word2Vec matrix in CSR form so it can be stacked with the sparse TF-IDF matrices
X_w2v_sparse = csr_matrix(X)
# Column-wise concatenation of the three feature sets: Word2Vec, TF-IDF and n-gram TF-IDF
# NOTE(review): the train/dev/test split later in the notebook is performed on X
# (Word2Vec only), so X_combined is not consumed by any model in the visible code.
X_combined = hstack((X_w2v_sparse, X_tfidf, X_ngrams))


In [None]:
from sklearn.model_selection import train_test_split

# First split: hold out 20% as the test set; the remaining 80% (temp) is split again below.
# Both splits are stratified on the labels and use a fixed random_state for repeatability.
# NOTE(review): X is the Word2Vec-only matrix (100 columns). X_combined, which also holds
# the TF-IDF and n-gram features, is NOT used here, so those features never reach the
# models. The new-text prediction cell also builds a Word2Vec-only vector, so switching
# to X_combined would require updating that cell as well — confirm the intended features.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Second split: 25% of the 80% temp set becomes the dev set, giving 60/20/20 train/dev/test overall
X_train, X_dev, y_train, y_dev = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp)

# Report the shape of each split
print("Training set size:", X_train.shape)
print("Development set size:", X_dev.shape)
print("Test set size:", X_test.shape)


Training set size: (1335, 100)
Development set size: (445, 100)
Test set size: (445, 100)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Logistic Regression classifier with manually chosen parameters:
#  - C is the INVERSE of the regularization strength, so C=10 regularizes less
#    strongly than scikit-learn's default of C=1.0
#  - solver='saga' supports multiclass problems; it is a stochastic solver, so
#    random_state is fixed to make the fitted model (and the scores) reproducible
#  - max_iter caps the number of iterations the solver runs before stopping
log_reg = LogisticRegression(C=10, solver='saga', max_iter=200, random_state=42)

# Fit the Logistic Regression model on the training data
log_reg.fit(X_train, y_train)

# Predict on the Development set
y_dev_pred = log_reg.predict(X_dev)

# Evaluate on the Development set
print("Development Set Accuracy:", accuracy_score(y_dev, y_dev_pred))
print("Classification Report:\n", classification_report(y_dev, y_dev_pred))




Development Set Accuracy: 0.9325842696629213
Classification Report:
                precision    recall  f1-score   support

     business       0.88      0.94      0.91       102
entertainment       0.95      0.91      0.93        77
     politics       0.99      0.90      0.94        83
        sport       0.98      0.98      0.98       102
         tech       0.88      0.91      0.90        81

     accuracy                           0.93       445
    macro avg       0.93      0.93      0.93       445
 weighted avg       0.93      0.93      0.93       445





In [None]:
# Final check of the logistic regression model on the held-out test split
# (its dev-set accuracy was about 93%)
y_test_pred = log_reg.predict(X_test)
print("Test Set Accuracy:", accuracy_score(y_true=y_test, y_pred=y_test_pred))
print("Test Classification Report:\n", classification_report(y_true=y_test, y_pred=y_test_pred))

Test Set Accuracy: 0.9617977528089887
Test Classification Report:
                precision    recall  f1-score   support

     business       0.94      0.97      0.96       102
entertainment       0.95      0.97      0.96        77
     politics       0.97      0.92      0.94        84
        sport       1.00      0.98      0.99       102
         tech       0.94      0.96      0.95        80

     accuracy                           0.96       445
    macro avg       0.96      0.96      0.96       445
 weighted avg       0.96      0.96      0.96       445



In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt

# Support vector classifier configuration:
#  - kernel='linear': separate the classes with linear hyperplanes in feature space
#  - C=1: regularization parameter
#  - probability=False: predict labels only, skipping the extra cost of probability estimates
svm_clf = SVC(kernel='linear', C=1, probability=False)

# Train on the same training split used for the logistic regression model
svm_clf.fit(X=X_train, y=y_train)

# Label the development split
y_dev_pred_svm = svm_clf.predict(X_dev)

# Score the development-set predictions
dev_accuracy_svm = accuracy_score(y_true=y_dev, y_pred=y_dev_pred_svm)
print("Development Set Accuracy (SVM):", dev_accuracy_svm)
print("Classification Report (SVM - Dev Set):\n", classification_report(y_true=y_dev, y_pred=y_dev_pred_svm))




Development Set Accuracy (SVM): 0.9325842696629213
Classification Report (SVM - Dev Set):
                precision    recall  f1-score   support

     business       0.88      0.95      0.92       102
entertainment       0.93      0.92      0.93        77
     politics       0.99      0.90      0.94        83
        sport       0.98      0.98      0.98       102
         tech       0.89      0.89      0.89        81

     accuracy                           0.93       445
    macro avg       0.93      0.93      0.93       445
 weighted avg       0.93      0.93      0.93       445



In [None]:
# Predict and check accuracy on the Test set (the SVM's dev-set accuracy above is about 93.3%)
y_test_pred_svm = svm_clf.predict(X_test)
test_accuracy_svm = accuracy_score(y_test, y_test_pred_svm)
print("Test Set Accuracy (SVM):", test_accuracy_svm)
print("Classification Report (SVM - Test Set):\n", classification_report(y_test, y_test_pred_svm))

Test Set Accuracy (SVM): 0.9640449438202248
Classification Report (SVM - Test Set):
                precision    recall  f1-score   support

     business       0.95      0.96      0.96       102
entertainment       0.97      0.99      0.98        77
     politics       0.97      0.93      0.95        84
        sport       0.98      0.98      0.98       102
         tech       0.94      0.96      0.95        80

     accuracy                           0.96       445
    macro avg       0.96      0.96      0.96       445
 weighted avg       0.96      0.96      0.96       445



**Below, we can enter new text to check the models trained above.**

In [None]:
# Free-form sample sentence to classify
new_text = "The Share Market has been increasing day by day"

# Same pipeline as for the training data: tokenize/filter, then average the Word2Vec vectors
tokens = preprocess(new_text)
# predict() expects a 2-D (n_samples, n_features) array, so add a leading axis for the single sample
new_text_vector = get_document_vector(tokens, w2v_model)[np.newaxis, :]

# Classify the vector with the logistic regression model trained above
predicted_category = log_reg.predict(X=new_text_vector)

# Show the result of the prediction
print("Predicted Category:", predicted_category[0])


Predicted Category: business
