In [1]:
import numpy as np
import pandas as pd
import math

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## Loading Dataset

In [2]:
df_train=pd.read_csv("../dataset/BBC News Train.csv")
df_test=pd.read_csv("../dataset/BBC News Test.csv")

## PreProcessing

In [3]:
df_train.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [4]:
Category_class=sorted(df_train["Category"].unique())
Category_class

['business', 'entertainment', 'politics', 'sport', 'tech']

In [5]:
mapping={'business':0, 'entertainment':1, 'politics':2, 'sport':3, 'tech':4}
df_train['CategoryId']=df_train['Category'].map(mapping)

In [1]:
df_train.groupby('Category').CategoryId.count()

NameError: name 'df_train' is not defined

## Preprocessing Text

In [7]:
def preprocess_text(text):
    text=str(text)
    #lowercasing
    text=text.lower()
    #Remove Stop Words
    stop_words=set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_list = [w for w in word_tokens if not w in stop_words]
    
    
    #Remove numbers and special Symbols
    #words like 100m 2m were not removed so using this
    num=['0','1','2','3','4','5','6','7','8','9']
    num_filter=[]
    for i in range(0,len(filtered_list)):
        for j in range(0,len(num)):
            if num[j] in filtered_list[i]:
                num_filter.append(filtered_list[i])
                break
    
    for filter in num_filter:
        filtered_list.remove(filter)
                
    filtered_list = [w for w in filtered_list if w.isalnum()]
    filtered_list=  [w for w in filtered_list if not w.isdigit()]
    
    
    
    #Lematizing
    wordnet_lemmatizer=WordNetLemmatizer()
    lemmatized_list=[wordnet_lemmatizer.lemmatize(w,wordnet.VERB) for w in filtered_list]
    lemmatized_string=' '.join(lemmatized_list)
    
    return lemmatized_string

In [8]:
print(preprocess_text("Hey studys car's my, mountaining guidings going went better$ 45d"))

hey study car mountaining guide go go better


### Analyzing Processed Text

In [9]:
def rand():
    return np.random.binomial(n=1,p=0.01,size=[1])
def count_word(word):
    cnt=word.count(" ")+1
    return cnt
count=0
for i in range(0,len(df_train)):
    if rand():
        txt=df_train['Text'][i]
        processed_txt=preprocess_text(df_train['Text'][i])
        
        print(txt+'\n \n'+processed_txt)
        print(f"\nMain Text Word Count : {count_word(txt)}\nProcessed Text Word Count : {count_word( processed_txt)}")
        count+=1
    if count==1:
        break
    

aragones angered by racism fine spain coach luis aragones is furious after being fined by the spanish football federation for his comments about thierry henry.  the 66-year-old criticised his 3000 euros (£2 060) punishment even though it was far below the maximum penalty.  i am not guilty  nor do i accept being judged for actions against the image of the sport   he said.  i m not a racist and i ve never lacked sporting decorum. i ve never done that and i have medals for sporting merit.  aragones was handed the fine on tuesday after making racist remarks about henry to arsenal team-mate and spanish international jose reyes last october.  the spanish football federation at first declined to take action against aragones  but was then requested to do so by spain s anti-violence commission. the fine was far less than the expected amount of about £22 000 or even the suspension of his coaching licence. arsenal boss arsene wenger  who was fined £15 000 in december for accusing manchester unite

### Apply Process Text in our Column

In [10]:
df_train['Text']=df_train['Text'].apply(preprocess_text)

### Transforming Text to Vectors for MultiClassification

In [11]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,ngram_range=(1,2))
features = tfidf.fit_transform(df_train['Text']).toarray()
features.shape

(1490, 9663)

In [12]:
print(df_train['Text'][0])
print(df_train['Category'][0])

worldcom launch defence lawyers defend former worldcom chief bernie ebbers battery fraud charge call company whistleblower first witness cynthia cooper worldcom internal account alert directors irregular account practice us telecoms giant warn lead collapse firm follow discovery account fraud mr ebbers plead guilty charge fraud conspiracy prosecution lawyers argue mr ebbers orchestrate series account trick worldcom order employees hide expense inflate revenues meet wall street earn estimate ms cooper run consult business tell jury new york wednesday external auditors arthur andersen approve worldcom account early say andersen give green light procedures practice use worldcom mr ebber lawyers say unaware fraud argue auditors alert problems ms cooper also say shareholder meet mr ebbers often pass technical question company finance chief give brief answer prosecution star witness former worldcom financial chief scott sullivan say mr ebbers order account adjustments firm tell hit book howe

In [13]:
print(tfidf.vocabulary_['lawyers'])
print(tfidf.vocabulary_['account'])

4597
56


In [14]:
print(features[0][4597])
print(features[0][56])

0.15094653115575096
0.14644847942451855


### Displaying highly correlated words in categories

In [15]:
N = 20
category_id_df = df_train[['Category', 'CategoryId']].drop_duplicates()
category_id=dict(category_id_df.values)
labels=df_train['CategoryId']

for Category, category_id in sorted(category_id.items()):
  features_chi2 = chi2(features, labels == category_id)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(tfidf.get_feature_names_out())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print(f"Category : {Category}")
  print(f"    Most Correlated Unigrams  :  {(', '.join(unigrams[-N:]))}")
  print(f"    Most Correlated Bigrams   :  {(', '.join(bigrams[-N:]))}")
  print("")

Category : business
    Most Correlated Unigrams  :  trade, shareholders, financial, export, stock, investment, investors, rise, analysts, price, company, share, firm, market, economic, economy, profit, oil, growth, bank
    Most Correlated Bigrams   :  price rise, securities exchange, exchange commission, oil giant, share price, tell reuters, year earlier, us dollar, central bank, interest rate, fourth quarter, bank england, stock exchange, consumer spend, chief executive, interest rat, oil price, stock market, economic growth, analysts say

Category : entertainment
    Most Correlated Unigrams  :  nominate, theatre, movie, drama, nominations, hollywood, musical, song, chart, oscar, comedy, festival, actress, band, singer, album, actor, award, star, film
    Most Correlated Bigrams   :  film include, nominate best, us tv, single chart, film star, best support, award best, vera drake, make film, best actor, best director, million dollar, dollar baby, best actress, academy award, film f

### Train Test Split

In [16]:
x=features
y=df_train.loc[:,'CategoryId'].values
train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=0.2,random_state=100)

## Custom Code

In [17]:
class BinarySVM:
    def __init__(self, C=1.0, learning_rate=0.001, n_iters=1000):
        self.C = C
        self.lr = learning_rate
        self.n_iters = n_iters
        self.w = None
        self.b = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y[idx] * (np.dot(x_i, self.w) + self.b) >= 1
                if condition:
                    self.w -= self.lr * (2 * self.C * self.w)
                else:
                    self.w -= self.lr * (2 * self.C * self.w - np.dot(x_i, y[idx]))
                    self.b -= self.lr * y[idx]

    def predict(self, X):
        linear_output = np.dot(X, self.w) + self.b
        return np.sign(linear_output)


In [18]:
class OneVsOneSVM:
    def __init__(self, C=1.0, learning_rate=0.001, n_iters=1000):
        self.C = C
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.classifiers = {}

    def fit(self, X, y):
        self.classes = np.unique(y)
        for i in range(len(self.classes)):
            for j in range(i + 1, len(self.classes)):
                class_i = self.classes[i]
                class_j = self.classes[j]

                # Filter the dataset to include only class_i and class_j
                X_ij = X[(y == class_i) | (y == class_j)]
                y_ij = y[(y == class_i) | (y == class_j)]
                y_ij = np.where(y_ij == class_i, 1, -1)  # Assign labels +1 and -1

                # Train a binary SVM on this subset
                svm = BinarySVM(C=self.C, learning_rate=self.learning_rate, n_iters=self.n_iters)
                svm.fit(X_ij, y_ij)

                # Store the classifier
                self.classifiers[(class_i, class_j)] = svm

    def predict(self, X):
        votes = np.zeros((X.shape[0], len(self.classes)))

        for (class_i, class_j), svm in self.classifiers.items():
            predictions = svm.predict(X)
            for idx, prediction in enumerate(predictions):
                if prediction == 1:
                    votes[idx, np.where(self.classes == class_i)] += 1
                else:
                    votes[idx, np.where(self.classes == class_j)] += 1

        # Determine the final prediction by majority vote
        return self.classes[np.argmax(votes, axis=1)]


In [19]:
ovo_svm = OneVsOneSVM(C=1.0, learning_rate=0.001, n_iters=1000)


In [20]:
ovo_svm.fit(train_x, train_y)

In [23]:
test_predict = ovo_svm.predict(test_x)
train_predict = ovo_svm.predict(train_x)
accuracy = accuracy_score(test_y, test_predict)
print(f"Accuracy : {accuracy}")
print(f"Train Accuracy : {accuracy}")

Accuracy : 0.17785234899328858
Train Accuracy : 0.17785234899328858


### Making Model

In [27]:
model=SVC(kernel='linear',C=10,gamma=1,probability=True)

In [28]:
train_x.shape

(1192, 9663)

In [29]:
model.fit(train_x,train_y)

In [30]:
test_predict = model.predict(test_x)
train_accuracy = round(model.score(train_x,train_y)*100)
test_accuracy =round(accuracy_score(test_predict, test_y)*100)

print(f"Train Accuracy Score : {train_accuracy}")
print(f"Test Accuracy Score  : {test_accuracy}")
print()
print(classification_report(test_predict, test_y, target_names=Category_class))

Train Accuracy Score : 100
Test Accuracy Score  : 98

               precision    recall  f1-score   support

     business       1.00      0.98      0.99        65
entertainment       0.95      0.98      0.97        57
     politics       0.95      0.98      0.97        59
        sport       1.00      1.00      1.00        61
         tech       1.00      0.95      0.97        56

     accuracy                           0.98       298
    macro avg       0.98      0.98      0.98       298
 weighted avg       0.98      0.98      0.98       298



In [32]:
test_x

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.10895123,
        0.11398407],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

## Custom Data

In [41]:
news = 'City were the first club to win four successive English top-flight championships last term and begin their defence on Sunday when they travel to Chelsea.'

news = preprocess_text(news)
news = tfidf.transform([news]).toarray()

pred = model.predict(news)

proba = model.predict_proba(news)

# Print the probability estimates
print("Probability Estimates:", proba)

# Print the predicted class
print("Predicted Class Index:", pred)
print("Predicted Class:", Category_class[pred[0]])
print(proba)
print(pred)
print(Category_class[pred[0]])


# Optionally, print probabilities for each class in a more readable format
for idx, category in enumerate(Category_class):
    print(f"Class '{category}': Probability {proba[0][idx]}")

Probability Estimates: [[0.00359498 0.00426828 0.00261561 0.98841376 0.00110737]]
Predicted Class Index: [3]
Predicted Class: sport
[[0.00359498 0.00426828 0.00261561 0.98841376 0.00110737]]
[3]
sport
Class 'business': Probability 0.003594977816546315
Class 'entertainment': Probability 0.004268280933809973
Class 'politics': Probability 0.0026156093109433883
Class 'sport': Probability 0.9884137638271514
Class 'tech': Probability 0.0011073681115495158


## Save the model

In [31]:
import joblib
joblib.dump(model, 'model/svm.pkl')

['svm.pkl']

In [35]:
import joblib
joblib.dump(tfidf, 'tfidf.pkl')

['tfidf.pkl']