In [1]:
import numpy as np
import pandas as pd
import math

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
# ASCII digit characters; any token containing one of them is discarded
# (this is what drops mixed tokens such as "100m" or "2m").
_ASCII_DIGITS = frozenset("0123456789")

# Lazily-filled cache so the stop-word corpus is read once, not once per document.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english"))
    return _STOP_WORDS


def preprocess_text(text):
    """
    Normalise one document into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. coerce to ``str`` and lowercase;
      2. tokenize with ``word_tokenize``;
      3. drop English stop words, tokens containing any ASCII digit, tokens
         that are not purely alphanumeric, and tokens that ``str.isdigit``
         reports as all digits;
      4. lemmatize every remaining token as a verb (``wordnet.VERB``).

    Parameters
    ----------
    text : any
        The raw document; it is passed through ``str()`` first, so non-string
        values (e.g. NaN cells) are accepted.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ("" if nothing survives).
    """
    tokens = word_tokenize(str(text).lower())
    stop_words = _english_stop_words()

    # One pass replaces the former nested digit scan + repeated list.remove().
    kept_tokens = [
        tok
        for tok in tokens
        if tok not in stop_words
        and _ASCII_DIGITS.isdisjoint(tok)
        and tok.isalnum()
        and not tok.isdigit()
    ]

    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(tok, wordnet.VERB) for tok in kept_tokens)

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the BBC News training split
df = pd.read_csv("../dataset/BBC News Train.csv")

# Encode each category name as an integer id
mapping = {"business": 0, "entertainment": 1, "politics": 2, "sport": 3, "tech": 4}
df["CategoryId"] = df["Category"].map(mapping)

# Series.map leaves NaN for any label that is not a key of `mapping`; stop here
# instead of silently training on NaN class ids.
assert df["CategoryId"].notna().all(), (
    "Categories missing from mapping: "
    f"{sorted(set(df['Category'].astype(str)) - set(mapping))}"
)

# Replace the raw article text with its preprocessed form
df["Text"] = df["Text"].apply(preprocess_text)

# Display the first few rows of the dataset
print(df.head())


   ArticleId                                               Text  Category  \
0       1833  worldcom launch defence lawyers defend former ...  business   
1        154  german business confidence slide german busine...  business   
2       1101  bbc poll indicate economic gloom citizens majo...  business   
3       1976  lifestyle govern mobile choice faster better f...      tech   
4        917  enron boss payout eighteen former enron direct...  business   

   CategoryId  
0           0  
1           0  
2           0  
3           4  
4           0  


In [4]:
# Fit a TF-IDF vectorizer on the preprocessed training text (vocabulary limited
# via max_features=5000) and convert the result to a dense array with .toarray().
# The fitted `vectorizer` is reused later to transform new text.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
features = vectorizer.fit_transform(df["Text"]).toarray()

In [5]:
# Training inputs: the TF-IDF feature matrix and the matching category ids
X = features
y = df["CategoryId"].values

In [6]:
from sklearn.preprocessing import LabelEncoder
import MySVM
import numpy as np


class OneVsOneSVM:
    """
    Multi-class classifier assembled from pairwise (one-vs-one) binary SVMs.

    ``fit`` trains one binary model per unordered pair of classes using
    ``MySVM.SVC``; ``predict`` lets every pairwise model vote and returns the
    class with the most votes (the lowest class index wins ties).

    Parameters
    ----------
    C : float, optional (default=1.0)
        Penalty parameter C of the error term; passed to ``MySVM.SVC``.

    max_iter : int, optional (default=6000)
        Passed to ``MySVM.SVC.fit`` as ``max_itr``.

    learning_rate : float, optional (default=0.00001)
        Passed to ``MySVM.SVC.fit`` as ``learning_rate``.

    Attributes
    ----------
    svm_classifiers : dict
        ``svm_classifiers[i][j]`` (with ``i < j``) holds the ``(weights, bias)``
        pair of the model separating encoded class ``i`` (+1) from ``j`` (-1).

    label_encoder_ : LabelEncoder or None
        Encoder fitted on the training labels; ``None`` until ``fit`` is called.
        It is stored on the instance so that a pickled/unpickled model can
        still translate predictions back to the original labels.

    Callable Functions
    ------------------

    fit(X,Y) : take data as X, target_labels as Y as input and trains the SVM model

    predict(X) : take one sample or a 2-D batch of samples and returns the predicted labels

    score(X,Y) : take data as X, target_labels as Y as input and returns the accuracy of the model

    """

    def __init__(self, C=1.0, max_iter=6000, learning_rate=0.00001):
        self.max_iter = max_iter
        self.learning_rate = learning_rate
        self.C = C
        self.svm_classifiers = {}
        self.label_encoder_ = None

    def generateClasswiseData(self, X, Y):
        """
        Group the rows of ``X`` by class.

        ``Y`` is expected to hold encoded labels ``0 .. n_classes-1`` (which is
        what ``fit`` passes in); any other label raises ``KeyError``.

        Returns a dict mapping each class index to an array of its samples.
        """
        n_classes = len(np.unique(Y))
        buckets = {k: [] for k in range(n_classes)}

        for sample, label in zip(X, Y):
            buckets[label].append(sample)

        return {k: np.array(rows) for k, rows in buckets.items()}

    def getPairData(self, d1, d2):
        """
        Stack two 2-D class sample arrays into one binary training set.

        Returns ``(data, labels)`` where ``data`` is ``d1`` followed by ``d2``
        as float64 and ``labels`` is +1 for rows of ``d1`` and -1 for rows of
        ``d2``.
        """
        l1 = d1.shape[0]
        l2 = d2.shape[0]

        data = np.concatenate((d1, d2), axis=0).astype(np.float64)
        labels = np.concatenate((np.ones(l1), -np.ones(l2)))

        return data, labels

    def fit(self, X, Y):
        """
        Train one binary SVM for every pair of classes found in ``Y``.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        Y : array-like of shape (n_samples,)
            Target labels; they are encoded to ``0 .. n_classes-1`` internally.
        """
        # Keep the encoder on the instance rather than in a module-level
        # global: a global is not saved with the model and is overwritten by
        # any other instance that gets fitted.
        self.label_encoder_ = LabelEncoder()
        encoded = self.label_encoder_.fit_transform(Y)

        data = self.generateClasswiseData(X, encoded)
        n_classes = len(data)

        # Start from an empty table so a refit with fewer classes cannot leave
        # stale entries behind (predict derives the class count from it).
        self.svm_classifiers = {}

        # NOTE(review): one MySVM.SVC object is reused for every pair, as
        # before; this assumes its fit() starts from fresh weights — confirm.
        svc = MySVM.SVC(self.C)
        for i in range(n_classes):
            self.svm_classifiers[i] = {}
            for j in range(i + 1, n_classes):
                x, y = self.getPairData(data[i], data[j])
                wts, b, _losses = svc.fit(
                    x, y, learning_rate=self.learning_rate, max_itr=self.max_iter
                )
                self.svm_classifiers[i][j] = (wts, b)

    def predict(self, X):
        """
        Predict labels by majority vote over all pairwise classifiers.

        Parameters
        ----------
        X : array-like
            Either a single sample of shape (n_features,) or a batch of shape
            (n_samples, n_features).

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels in the original label space (a single sample
            yields a length-1 array).

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        encoder = getattr(self, "label_encoder_", None)
        n_classes = len(self.svm_classifiers)
        if encoder is None or n_classes == 0:
            raise ValueError("OneVsOneSVM is not fitted yet; call fit() first.")

        samples = np.atleast_2d(np.asarray(X))
        row_idx = np.arange(samples.shape[0])
        votes = np.zeros((samples.shape[0], n_classes))

        for i in range(n_classes):
            for j in range(i + 1, n_classes):
                wts, b = self.svm_classifiers[i][j]
                # ravel() accepts weights stored as (n_features,) or (1, n_features)
                # and a bias stored as a scalar or a 1-element array.
                decision = samples @ np.ravel(wts) + np.ravel(b)[0]
                winners = np.where(decision >= 0, i, j)
                votes[row_idx, winners] += 1

        return encoder.inverse_transform(np.argmax(votes, axis=1))

    def score(self, X, Y):
        """
        Return the fraction of samples in ``X`` whose prediction equals ``Y``.

        Raises ``ZeroDivisionError`` when ``X`` has no rows.
        """
        X = np.asarray(X)
        expected = np.ravel(np.asarray(Y))
        hits = int(np.count_nonzero(self.predict(X) == expected))

        return hits / X.shape[0]

In [7]:
# Build the one-vs-one classifier and fit it on the training features/labels
ovo_SVM = OneVsOneSVM(
    C=10,
    max_iter=6000,
    learning_rate=1e-5,
)
ovo_SVM.fit(X, y)

In [18]:
# Example custom news texts
business_news = "Global market trends indicate a significant rise in e-commerce investments. Major companies are expanding their online presence to capture growing digital consumer demand."
entertainment_news = "The latest blockbuster movie, directed by renowned filmmaker Jane Doe, has shattered box office records and received rave reviews from critics. Red carpet"
politics_news = "In a recent speech, leader michael, down street, council tax, tory leader, election campaign, mr kennedy Leader Government the President outlined new policies and constitutions aimed at improving national healthcare and addressing income inequality in the country."
sports_news = "The local soccer team secured a dramatic victory in the championship final, with a last-minute goal clinching the title against their long-time rivals."
tech_news = "Tech giants Computer are unveiling their newest smart phones featuring advanced AI capabilities and enhanced security features that promise to revolutionize the industry."

# Category id -> display name. Kept under its own name so the name -> id
# `mapping` built when loading the training data is not overwritten.
id_to_category = {0: "business", 1: "entertainment", 2: "politics", 3: "sports", 4: "tech"}


def predict_category(text):
    """Return the display name of the category predicted for a raw news text."""
    # Same pipeline as training: preprocess, TF-IDF transform, then classify
    vectorized = vectorizer.transform([preprocess_text(text)]).toarray()
    return id_to_category[ovo_SVM.predict(vectorized)[0]]


# Display label -> sample text, in the order the results are printed
sample_news = {
    "Business": business_news,
    "Entertainment": entertainment_news,
    "Politics": politics_news,
    "Sports": sports_news,
    "Tech": tech_news,
}

# Print the predicted categories
for label, text in sample_news.items():
    print(f"{label} news predicted as: {predict_category(text)}")


Business news predicted as: business
Entertainment news predicted as: entertainment
Politics news predicted as: politics
Sports news predicted as: sports
Tech news predicted as: tech


In [9]:
# Evaluate the model
# NOTE: X and y are the same arrays that were passed to ovo_SVM.fit, so this
# number is accuracy on the training data, not on held-out data.
accuracy = ovo_SVM.score(X, y)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 99.46%


In [None]:
# Load the test split and push it through the same preprocessing and the
# already-fitted TF-IDF vectorizer used for training
testing_data = pd.read_csv("../dataset/BBC News Test.csv")
testing_data["Text"] = testing_data["Text"].apply(preprocess_text)
X_test = vectorizer.transform(testing_data["Text"]).toarray()

# The test CSV previewed in this notebook only has ArticleId and Text columns,
# so indexing "Category" unconditionally raises KeyError. Use the labels when
# the column exists, otherwise leave y_test as None.
y_test = (
    testing_data["Category"].values if "Category" in testing_data.columns else None
)


In [12]:
# Preview the raw BBC News test split. It gets its own name so the
# preprocessed training frame `df` (used to fit the vectorizer and build y)
# is not replaced by unlabelled test data.
test_df = pd.read_csv("../dataset/BBC News Test.csv")

print(test_df.head())

   ArticleId                                               Text
0       1018  qpr keeper day heads for preston queens park r...
1       1319  software watching while you work software that...
2       1138  d arcy injury adds to ireland woe gordon d arc...
3        459  india s reliance family feud heats up the ongo...
4       1020  boro suffer morrison injury blow middlesbrough...


import 

In [20]:
import joblib

# Save the model to disk
# NOTE(review): loading this file elsewhere presumably requires the
# OneVsOneSVM class definition to be available there — confirm before reuse.
filename = "OneVsOneSVM_model.pkl"
joblib.dump(ovo_SVM, filename)

# Save the fitted vectorizer (`filename` is reassigned for the second file)
filename = "vectorizer.pkl"
joblib.dump(vectorizer, filename)



['vectorizer.pkl']