### Import Libraries

In [1]:
import numpy as np
import pandas as pd

### Define the Linear SVM Classifier

In [2]:
class BinarySVM:
    def __init__(self, C=1.0, learning_rate=0.001, n_iters=1000):
        self.C = C
        self.lr = learning_rate
        self.n_iters = n_iters
        self.w = None
        self.b = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y[idx] * (np.dot(x_i, self.w) + self.b) >= 1
                if condition:
                    self.w -= self.lr * (2 * self.C * self.w)
                else:
                    self.w -= self.lr * (2 * self.C * self.w - np.dot(x_i, y[idx]))
                    self.b -= self.lr * y[idx]

            

    def predict(self, X):
        linear_output = np.dot(X, self.w) + self.b
        return np.sign(linear_output)


### OneVsOneSVM

In [3]:
class OneVsOneSVM:
    def __init__(self, C=1.0, learning_rate=0.001, n_iters=1000):
        self.C = C
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.classifiers = {}

    def fit(self, X, y):
        self.classes = np.unique(y)
        for i in range(len(self.classes)):
            for j in range(i + 1, len(self.classes)):
                class_i = self.classes[i]
                class_j = self.classes[j]

                # Filter the dataset to include only class_i and class_j
                X_ij = X[(y == class_i) | (y == class_j)]
                y_ij = y[(y == class_i) | (y == class_j)]
                y_ij = np.where(y_ij == class_i, 1, -1)  # Assign labels +1 and -1

                # Train a binary SVM on this subset
                svm = BinarySVM(C=self.C, learning_rate=self.learning_rate, n_iters=self.n_iters)
                svm.fit(X_ij, y_ij)

                # Store the classifier
                self.classifiers[(class_i, class_j)] = svm

    def predict(self, X):
        votes = np.zeros((X.shape[0], len(self.classes)))

        for (class_i, class_j), svm in self.classifiers.items():
            predictions = svm.predict(X)
            for idx, prediction in enumerate(predictions):
                if prediction == 1:
                    votes[idx, np.where(self.classes == class_i)] += 1
                else:
                    votes[idx, np.where(self.classes == class_j)] += 1

        # Determine the final prediction by majority vote
        return self.classes[np.argmax(votes, axis=1)]


### Testing

In [4]:
# Example dataset
data = {
    'feature1': [2, 4, 4, 6, 6, 8],
    'feature2': [3, 3, 6, 5, 8, 7],
    'label': [0, 0, 1, 1, 2, 2]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Separate features and labels
X = df[['feature1', 'feature2']].values
y = df['label'].values

# Initialize and train the One-vs-One SVM
ovo_svm = OneVsOneSVM(C=1.0, learning_rate=0.001, n_iters=1000)
ovo_svm.fit(X, y)

# Test the classifier
predictions = ovo_svm.predict(X)
print("Predictions:", predictions)
print("Actual Labels:", y)


Predictions: [2 2 2 2 2 2]
Actual Labels: [0 0 1 1 2 2]


In [5]:
import numpy as np
import pandas as pd
from collections import Counter
import math

def compute_tf(text):
    tf_text = Counter(text)
    for i in tf_text:
        tf_text[i] = tf_text[i] / float(len(text))
    return tf_text

def compute_idf(word, corpus):
    return math.log10(len(corpus) / (1 + sum([1 for text in corpus if word in text])))

def compute_tfidf(corpus):
    corpus = [doc.split() for doc in corpus]
    vocabulary = list(set(word for doc in corpus for word in doc))
    idf = {word: compute_idf(word, corpus) for word in vocabulary}
    tfidf = []

    for text in corpus:
        tf = compute_tf(text)
        tfidf.append([tf.get(word, 0) * idf[word] for word in vocabulary])
    
    return np.array(tfidf), vocabulary

# Example usage:
corpus = ["this is a sample", "this is another example example", "sample example of text"]
tfidf_matrix, vocabulary = compute_tfidf(corpus)
print("TF-IDF Matrix:\n", tfidf_matrix)
print("Vocabulary:\n", vocabulary)


TF-IDF Matrix:
 [[0.         0.04402281 0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.03521825 0.        ]
 [0.         0.         0.         0.04402281 0.         0.04402281
  0.         0.        ]]
Vocabulary:
 ['sample', 'a', 'this', 'text', 'is', 'of', 'another', 'example']


In [6]:
# Your dataset (replace this with actual data)
news_articles = [
    "government passes new law",
    "football match ends in draw",
    "new technology in smartphones",
    "politician gives a speech",
    "sports event attracts large crowd"
]
labels = [1, 2, 3, 1, 2]  # Example labels: 1=Politics, 2=Sports, 3=Technology

# Convert news articles to TF-IDF features
X, vocabulary = compute_tfidf(news_articles)
y = np.array(labels)

# Initialize and train the One-vs-One SVM
ovo_svm = OneVsOneSVM(C=1.0, learning_rate=0.001, n_iters=1000)
ovo_svm.fit(X, y)

# Test the classifier on the training data
predictions = ovo_svm.predict(X)
print("Predictions:", predictions)
print("Actual Labels:", y)


Predictions: [3 3 3 3 3]
Actual Labels: [1 2 3 1 2]


In [7]:
df_train=pd.read_csv("../dataset/BBC News Train.csv")


In [8]:
df_train.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [9]:
Category_class=sorted(df_train["Category"].unique())
Category_class

['business', 'entertainment', 'politics', 'sport', 'tech']

In [10]:
mapping={'business':0, 'entertainment':1, 'politics':2, 'sport':3, 'tech':4}
df_train['CategoryId']=df_train['Category'].map(mapping)

In [11]:
X, vocabulary = compute_tfidf(df_train['Text'])

In [12]:
y = df_train['CategoryId'].values

In [13]:
# Initialize and train the One-vs-One SVM
ovo_svm = OneVsOneSVM(C=1.0, learning_rate=0.001, n_iters=1000)
ovo_svm.fit(X, y)

In [14]:
# Test the classifier on the training data
predictions = ovo_svm.predict(X)
print("Predictions:", predictions)
print("Actual Labels:", y)


Predictions: [4 4 4 ... 4 4 4]
Actual Labels: [0 0 0 ... 0 4 4]


In [15]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y, predictions)
print(accuracy)

0.17516778523489934


In [17]:
import numpy as np
import pandas as pd

# Helper function to compute the TF-IDF (simplified)
def compute_tfidf(corpus):
    from collections import Counter
    from math import log

    vocabulary = list(set(word for document in corpus for word in document.split()))
    vocabulary.sort()
    tfidf_matrix = np.zeros((len(corpus), len(vocabulary)))

    for i, document in enumerate(corpus):
        word_counts = Counter(document.split())
        for word, count in word_counts.items():
            tf = count / len(document.split())
            idf = log(len(corpus) / sum(1 for doc in corpus if word in doc.split()))
            tfidf_matrix[i, vocabulary.index(word)] = tf * idf

    return tfidf_matrix, vocabulary

# Binary SVM Classifier
class BinarySVM:
    def __init__(self, C=1.0, learning_rate=0.001, n_iters=1000):
        self.C = C
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for iteration in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y[idx] * (np.dot(x_i, self.weights) - self.bias) >= 1
                if condition:
                    self.weights -= self.learning_rate * (2 * self.C * self.weights)
                else:
                    self.weights -= self.learning_rate * (2 * self.C * self.weights - np.dot(x_i, y[idx]))
                    self.bias -= self.learning_rate * y[idx]

            # Debugging: print weights and bias at every 100 iterations
            if iteration % 100 == 0:
                print(f"Iteration {iteration}: Weights: {self.weights}, Bias: {self.bias}")

    def predict(self, X):
        return np.sign(np.dot(X, self.weights) - self.bias)

# One-vs-One SVM Classifier
class OneVsOneSVM:
    def __init__(self, C=1.0, learning_rate=0.001, n_iters=1000):
        self.C = C
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.classifiers = []

    def fit(self, X, y):
        unique_classes = np.unique(y)
        for i in range(len(unique_classes)):
            for j in range(i + 1, len(unique_classes)):
                class_i = unique_classes[i]
                class_j = unique_classes[j]

                # Filter the data for the two classes
                idx = np.where((y == class_i) | (y == class_j))
                X_filtered = X[idx]
                y_filtered = y[idx]

                # Convert class labels to +1 and -1
                y_filtered = np.where(y_filtered == class_i, 1, -1)

                # Train the binary classifier
                clf = BinarySVM(C=self.C, learning_rate=self.learning_rate, n_iters=self.n_iters)
                clf.fit(X_filtered, y_filtered)
                self.classifiers.append((clf, (class_i, class_j)))

    def predict(self, X):
        votes = np.zeros((X.shape[0], len(np.unique([c for pair in [x[1] for x in self.classifiers] for c in pair]))))

        for clf, class_labels in self.classifiers:
            predictions = clf.predict(X)
            print(f"Classifier {class_labels} predictions: {predictions}")  # Debugging line
            for idx, pred in enumerate(predictions):
                if pred == 1:
                    votes[idx, class_labels[0]] += 1
                else:
                    votes[idx, class_labels[1]] += 1

        # Return the class with the most votes
        final_predictions = np.argmax(votes, axis=1)
        print(f"Votes: {votes}")  # Debugging line
        print(f"Final predictions: {final_predictions}")  # Debugging line
        return final_predictions

# Example usage:

# Sample dataset
news_articles = [
    "government passes new law",                     # Politics
    "football match ends in draw",                    # Sports
    "new technology in smartphones",                  # Technology
    "politician gives a speech",                      # Politics
    "sports event attracts large crowd",              # Sports
    "new tech gadgets released this year",            # Technology
    "election results announced",                     # Politics
    "soccer team wins championship",                  # Sports
    "AI advancements in healthcare",                  # Technology
    "local council meeting updates",                  # Politics
    "basketball game highlights",                     # Sports
    "innovations in artificial intelligence",         # Technology
    "politician's new policy proposal",               # Politics
    "swimming competition results",                   # Sports
    "latest trends in smartphone design",             # Technology
    "government budget allocation review",            # Politics
    "volleyball tournament concludes",                # Sports
    "breakthrough in renewable energy",               # Technology
    "senator's speech on climate change",             # Politics
    "baseball team training camp",                    # Sports
    "tech company announces new software",            # Technology
    "international relations summit",                 # Politics
    "world cup qualifying matches",                   # Sports
    "smart home devices market growth",               # Technology
    "legislative bill discussion",                    # Politics
    "rugby game results",                            # Sports
    "technology in education sector",                # Technology
    "mayoral election debate",                        # Politics
    "national soccer league season start",            # Sports
    "advancements in quantum computing",              # Technology
    "press conference on new laws",                   # Politics
    "hockey match final scores",                      # Sports
    "virtual reality applications",                   # Technology
    "parliamentary debate on economy",                # Politics
    "college basketball championship",                # Sports
    "latest trends in gadget development",            # Technology
    "congressional committee meeting",                # Politics
    "tennis tournament highlights",                   # Sports
    "emerging technologies in fintech",               # Technology
    "state of the union address",                     # Politics
    "motorsports event results",                      # Sports
    "new innovations in medical tech",                # Technology
    "political rally speeches",                       # Politics
    "community sports league updates",                # Sports
    "tech industry conference news",                  # Technology
    "government response to natural disaster",        # Politics
    "annual sports awards ceremony",                  # Sports
    "technological impacts on job market"             # Technology
]

labels = [
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3   # Technology
]


# Convert news articles to TF-IDF features
X, vocabulary = compute_tfidf(news_articles)
y = np.array(labels)

# Initialize and train the One-vs-One SVM
ovo_svm = OneVsOneSVM(C=1.0, learning_rate=0.001, n_iters=1000)
ovo_svm.fit(X, y)

# Test the classifier on the training data
predictions = ovo_svm.predict(X)
print("Predictions:", predictions)
print("Actual Labels:", y)


Iteration 0: Weights: [ 0.00000000e+00  9.16875453e-04  7.69604046e-04  0.00000000e+00
  9.31678376e-04  1.22740532e-03  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -7.34970303e-04  0.00000000e+00
 -9.37290881e-04 -2.06040819e-03  1.25724960e-03  0.00000000e+00
  9.31678376e-04 -9.37290881e-04  0.00000000e+00 -1.80394368e-03
  7.48333039e-04  7.48333039e-04 -1.27499275e-03  1.27754785e-03
 -9.67800253e-04  0.00000000e+00 -1.23975336e-03  0.00000000e+00
 -1.24472729e-03  6.24260874e-04  1.27754785e-03  9.24247279e-04
 -7.34970303e-04 -9.41051322e-04  1.81973400e-03  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.25724960e-03
 -7.32033362e-04  9.54332074e-04  0.00000000e+00  2.04391287e-03
  0.00000000e+00 -7.32033362e-04  0.00000000e+00 -1.65849034e-03
 -9.52423410e-04  0.00000000e+00 -7.32033362e-04  0.00000000e+00
  0.00000000e+00 -2.04791039e-03  9.16875453e-04  1.32132668e-03
  0.00000000e+00  0.00000000e+00 -2.06460759e-03 -9.52423410e-04
  0

IndexError: index 3 is out of bounds for axis 1 with size 3

In [28]:
import numpy as np
import pandas as pd

# Helper function to compute the TF-IDF (simplified)
def compute_tfidf(corpus):
    from collections import Counter
    from math import log

    vocabulary = list(set(word for document in corpus for word in document.split()))
    vocabulary.sort()
    tfidf_matrix = np.zeros((len(corpus), len(vocabulary)))

    for i, document in enumerate(corpus):
        word_counts = Counter(document.split())
        for word, count in word_counts.items():
            tf = count / len(document.split())
            idf = log(len(corpus) / sum(1 for doc in corpus if word in doc.split()))
            tfidf_matrix[i, vocabulary.index(word)] = tf * idf

    return tfidf_matrix, vocabulary

# Binary SVM Classifier
class BinarySVM:
    def __init__(self, C=1.0, learning_rate=0.001, n_iters=1000):
        self.C = C
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for iteration in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y[idx] * (np.dot(x_i, self.weights) - self.bias) >= 1
                if condition:
                    self.weights -= self.learning_rate * (2 * self.C * self.weights)
                else:
                    self.weights -= self.learning_rate * (2 * self.C * self.weights - np.dot(x_i, y[idx]))
                    self.bias -= self.learning_rate * y[idx]

            # Debugging: print weights and bias at every 100 iterations
            if iteration % 100 == 0:
                print(f"Iteration {iteration}: Weights: {self.weights}, Bias: {self.bias}")

    def predict(self, X):
        return np.sign(np.dot(X, self.weights) - self.bias)

# One-vs-One SVM Classifier
class OneVsOneSVM:
    def __init__(self, C=1.0, learning_rate=0.001, n_iters=1000):
        self.C = C
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.classifiers = []

    def fit(self, X, y):
        unique_classes = np.unique(y)
        n_classes = len(unique_classes)
        print(f"Unique classes: {unique_classes}")  # Debugging line
        for i in range(n_classes):
            for j in range(i + 1, n_classes):
                class_i = unique_classes[i]
                class_j = unique_classes[j]

                # Filter the data for the two classes
                idx = np.where((y == class_i) | (y == class_j))
                X_filtered = X[idx]
                y_filtered = y[idx]

                # Convert class labels to +1 and -1
                y_filtered = np.where(y_filtered == class_i, 1, -1)

                # Train the binary classifier
                clf = BinarySVM(C=self.C, learning_rate=self.learning_rate, n_iters=self.n_iters)
                clf.fit(X_filtered, y_filtered)
                self.classifiers.append((clf, (class_i, class_j)))

    def predict(self, X):
        n_classes = len(np.unique(y))
        votes = np.zeros((X.shape[0], n_classes))

        for clf, class_labels in self.classifiers:
            predictions = clf.predict(X)
            print(f"Classifier {class_labels} predictions: {predictions}")  # Debugging line
            for idx, pred in enumerate(predictions):
                if pred == 1:
                    votes[idx, class_labels[0] - 1] += 1  # Adjust index for zero-based array
                else:
                    votes[idx, class_labels[1] - 1] += 1  # Adjust index for zero-based array

        # Return the class with the most votes
        final_predictions = np.argmax(votes, axis=1)
        print(f"Votes: {votes}")  # Debugging line
        print(f"Final predictions: {final_predictions}")  # Debugging line
        return final_predictions

# Example usage:

# Sample dataset
news_articles = [
    "government passes new law",                     # Politics
    "football match ends in draw",                    # Sports
    "new technology in smartphones",                  # Technology
    "politician gives a speech",                      # Politics
    "sports event attracts large crowd",              # Sports
    "new tech gadgets released this year",            # Technology
    "election results announced",                     # Politics
    "soccer team wins championship",                  # Sports
    "AI advancements in healthcare",                  # Technology
    "local council meeting updates",                  # Politics
    "basketball game highlights",                     # Sports
    "innovations in artificial intelligence",         # Technology
    "politician's new policy proposal",               # Politics
    "swimming competition results",                   # Sports
    "latest trends in smartphone design",             # Technology
    "government budget allocation review",            # Politics
    "volleyball tournament concludes",                # Sports
    "breakthrough in renewable energy",               # Technology
    "senator's speech on climate change",             # Politics
    "baseball team training camp",                    # Sports
    "tech company announces new software",            # Technology
    "international relations summit",                 # Politics
    "world cup qualifying matches",                   # Sports
    "smart home devices market growth",               # Technology
    "legislative bill discussion",                    # Politics
    "rugby game results",                            # Sports
    "technology in education sector",                # Technology
    "mayoral election debate",                        # Politics
    "national soccer league season start",            # Sports
    "advancements in quantum computing",              # Technology
    "press conference on new laws",                   # Politics
    "hockey match final scores",                      # Sports
    "virtual reality applications",                   # Technology
    "parliamentary debate on economy",                # Politics
    "college basketball championship",                # Sports
    "latest trends in gadget development",            # Technology
    "congressional committee meeting",                # Politics
    "tennis tournament highlights",                   # Sports
    "emerging technologies in fintech",               # Technology
    "state of the union address",                     # Politics
    "motorsports event results",                      # Sports
    "new innovations in medical tech",                # Technology
    "political rally speeches",                       # Politics
    "community sports league updates",                # Sports
    "tech industry conference news",                  # Technology
    "government response to natural disaster",        # Politics
    "annual sports awards ceremony",                  # Sports
    "technological impacts on job market"             # Technology
]


labels = [
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
     2, # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,   # Technology
    1,  # Politics
    2,  # Sports
    3,   # Technology
]

# Convert news articles to TF-IDF features
X, vocabulary = compute_tfidf(news_articles)
y = np.array(labels)

# Initialize and train the One-vs-One SVM
ovo_svm = OneVsOneSVM(C=1.0, learning_rate=0.001, n_iters=1000)
ovo_svm.fit(X, y)

# Test the classifier on the training data
predictions = ovo_svm.predict(X)
print("Predictions:", predictions)
for i in range(len(predictions)):
    predictions[i]+=1
print("Actual Labels:", y)



Unique classes: [1 2 3]
Iteration 0: Weights: [ 0.00000000e+00  9.13211619e-04  7.66528708e-04  0.00000000e+00
  9.27955389e-04  1.22250060e-03  0.00000000e+00 -9.67800253e-04
  0.00000000e+00  0.00000000e+00 -7.32033362e-04 -9.67800253e-04
 -9.33545467e-04 -2.05217480e-03  1.25222563e-03  0.00000000e+00
  9.27955389e-04 -9.33545467e-04 -9.67800253e-04 -1.79673512e-03
  7.45342701e-04  7.45342701e-04 -1.26989788e-03  1.27244277e-03
 -9.63932923e-04  0.00000000e+00 -1.23479930e-03  0.00000000e+00
 -1.23975336e-03  6.21766328e-04  1.27244277e-03  9.20553987e-04
 -7.32033362e-04 -9.37290881e-04  1.81246234e-03  0.00000000e+00
  0.00000000e+00  0.00000000e+00  7.72691722e-04  1.25222563e-03
 -7.29108157e-04  9.50518563e-04  0.00000000e+00  2.03574539e-03
  0.00000000e+00 -7.29108157e-04  0.00000000e+00 -1.65186301e-03
 -9.48617526e-04  0.00000000e+00 -7.29108157e-04  0.00000000e+00
  0.00000000e+00 -2.03972694e-03  9.13211619e-04  1.86945537e-03
  0.00000000e+00  0.00000000e+00 -2.05635742

In [34]:
New_Text = 'national soccer league season start'
X_new, vocabulary = compute_tfidf(New_Text)
ovo_svm.predict(X_new)

ValueError: shapes (35,12) and (142,) not aligned: 12 (dim 1) != 142 (dim 0)

In [29]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y, predictions)
print(accuracy)

1.0


In [36]:
import numpy as np
import pandas as pd
from collections import Counter
from math import log
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder

# Preprocess text
def preprocess_text(text):
    text = str(text).lower()
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    
    # Remove stop words
    filtered_list = [w for w in word_tokens if not w in stop_words]
    
    # Remove numbers and special symbols
    filtered_list = [w for w in filtered_list if w.isalnum() and not w.isdigit()]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_list = [lemmatizer.lemmatize(w, 'v') for w in filtered_list]
    
    return ' '.join(lemmatized_list)

# Compute TF-IDF
def compute_tfidf(corpus):
    vocabulary = list(set(word for document in corpus for word in document.split()))
    vocabulary.sort()
    tfidf_matrix = np.zeros((len(corpus), len(vocabulary)))

    for i, document in enumerate(corpus):
        word_counts = Counter(document.split())
        for word, count in word_counts.items():
            tf = count / len(document.split())
            idf = log(len(corpus) / sum(1 for doc in corpus if word in doc.split()))
            tfidf_matrix[i, vocabulary.index(word)] = tf * idf

    return tfidf_matrix, vocabulary

# Binary SVM Classifier
class BinarySVM:
    def __init__(self, C=1.0, learning_rate=0.001, n_iters=1000):
        self.C = C
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for iteration in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y[idx] * (np.dot(x_i, self.weights) - self.bias) >= 1
                if condition:
                    self.weights -= self.learning_rate * (2 * self.C * self.weights)
                else:
                    self.weights -= self.learning_rate * (2 * self.C * self.weights - np.dot(x_i, y[idx]))
                    self.bias -= self.learning_rate * y[idx]

            # Debugging: print weights and bias at every 100 iterations
            if iteration % 100 == 0:
                print(f"Iteration {iteration}: Weights: {self.weights}, Bias: {self.bias}")

    def predict(self, X):
        return np.sign(np.dot(X, self.weights) - self.bias)

# One-vs-One SVM Classifier
class OneVsOneSVM:
    def __init__(self, C=1.0, learning_rate=0.001, n_iters=1000):
        self.C = C
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.classifiers = []

    def fit(self, X, y):
        unique_classes = np.unique(y)
        n_classes = len(unique_classes)
        print(f"Unique classes: {unique_classes}")  # Debugging line
        for i in range(n_classes):
            for j in range(i + 1, n_classes):
                class_i = unique_classes[i]
                class_j = unique_classes[j]

                # Filter the data for the two classes
                idx = np.where((y == class_i) | (y == class_j))
                X_filtered = X[idx]
                y_filtered = y[idx]

                # Convert class labels to +1 and -1
                y_filtered = np.where(y_filtered == class_i, 1, -1)

                # Train the binary classifier
                clf = BinarySVM(C=self.C, learning_rate=self.learning_rate, n_iters=self.n_iters)
                clf.fit(X_filtered, y_filtered)
                self.classifiers.append((clf, (class_i, class_j)))

    def predict(self, X):
        n_classes = len(np.unique(y))
        votes = np.zeros((X.shape[0], n_classes))

        for clf, class_labels in self.classifiers:
            predictions = clf.predict(X)
            print(f"Classifier {class_labels} predictions: {predictions}")  # Debugging line
            for idx, pred in enumerate(predictions):
                if pred == 1:
                    votes[idx, class_labels[0] - 1] += 1  # Adjust index for zero-based array
                else:
                    votes[idx, class_labels[1] - 1] += 1  # Adjust index for zero-based array

        # Return the class with the most votes
        final_predictions = np.argmax(votes, axis=1)
        print(f"Votes: {votes}")  # Debugging line
        print(f"Final predictions: {final_predictions}")  # Debugging line
        return final_predictions

# Sample dataset
news_articles = [
    "government passes new law",                     # Politics
    "football match ends in draw",                    # Sports
    "new technology in smartphones",                  # Technology
    "politician gives a speech",                      # Politics
    "sports event attracts large crowd",              # Sports
    "new tech gadgets released this year",            # Technology
    "election results announced",                     # Politics
    "soccer team wins championship",                  # Sports
    "AI advancements in healthcare",                  # Technology
    "local council meeting updates",                  # Politics
    "basketball game highlights",                     # Sports
    "innovations in artificial intelligence",         # Technology
    "politician's new policy proposal",               # Politics
    "swimming competition results",                   # Sports
    "latest trends in smartphone design",             # Technology
    "government budget allocation review",            # Politics
    "volleyball tournament concludes",                # Sports
    "breakthrough in renewable energy",               # Technology
    "senator's speech on climate change",             # Politics
    "baseball team training camp",                    # Sports
    "tech company announces new software",            # Technology
    "international relations summit",                 # Politics
    "world cup qualifying matches",                   # Sports
    "smart home devices market growth",               # Technology
    "legislative bill discussion",                    # Politics
    "rugby game results",                            # Sports
    "technology in education sector",                # Technology
    "mayoral election debate",                        # Politics
    "national soccer league season start",            # Sports
    "advancements in quantum computing",              # Technology
    "press conference on new laws",                   # Politics
    "hockey match final scores",                      # Sports
    "virtual reality applications",                   # Technology
    "parliamentary debate on economy",                # Politics
    "college basketball championship",                # Sports
    "latest trends in gadget development",            # Technology
    "congressional committee meeting",                # Politics
    "tennis tournament highlights",                   # Sports
    "emerging technologies in fintech",               # Technology
    "state of the union address",                     # Politics
    "motorsports event results",                      # Sports
    "new innovations in medical tech",                # Technology
    "political rally speeches",                       # Politics
    "community sports league updates",                # Sports
    "tech industry conference news",                  # Technology
    "government response to natural disaster",        # Politics
    "annual sports awards ceremony",                  # Sports
    "technological impacts on job market"             # Technology
]

labels = [
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
]

# Convert news articles to TF-IDF features
preprocessed_articles = [preprocess_text(article) for article in news_articles]
X, vocabulary = compute_tfidf(preprocessed_articles)
y = np.array(labels)

# Initialize and train the One-vs-One SVM
ovo_svm = OneVsOneSVM(C=1.0, learning_rate=0.001, n_iters=1000)
ovo_svm.fit(X, y)

# Test the classifier on the training data
predictions = ovo_svm.predict(X)
print("Predictions:", predictions)
print("Actual Labels:", y)


Unique classes: [1 2 3]
Iteration 0: Weights: [ 0.00128782  0.          0.          0.00093542  0.00101168  0.
  0.          0.         -0.00073792  0.         -0.00094105 -0.00206867
  0.00126229  0.          0.00093542 -0.00094105  0.         -0.00181118
  0.00093917  0.00093917 -0.00128011  0.00128267  0.          0.
 -0.00124473  0.         -0.00124972  0.00078346  0.00128267  0.00092796
 -0.00073792 -0.00094483  0.00208923  0.          0.          0.
  0.          0.00126229 -0.00091871  0.00127755  0.          0.00205211
  0.         -0.00091871  0.         -0.00166514 -0.00095624  0.
 -0.00091871  0.          0.         -0.00205613  0.00122741  0.00132663
  0.          0.         -0.00207289 -0.00095624  0.          0.
  0.          0.          0.          0.00125725  0.         -0.00073792
  0.          0.00091688  0.00095433 -0.00062551  0.00126229  0.00092796
  0.         -0.00201955  0.00126736  0.          0.00181481 -0.0012904
 -0.00076194  0.          0.00139398  0.      

In [37]:
import numpy as np
import pandas as pd
from collections import Counter
from math import log
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder

# Preprocess text
def preprocess_text(text):
    text = str(text).lower()
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    
    # Remove stop words
    filtered_list = [w for w in word_tokens if not w in stop_words]
    
    # Remove numbers and special symbols
    filtered_list = [w for w in filtered_list if w.isalnum() and not w.isdigit()]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_list = [lemmatizer.lemmatize(w, 'v') for w in filtered_list]
    
    return ' '.join(lemmatized_list)

# Compute TF-IDF
def compute_tfidf(corpus):
    vocabulary = list(set(word for document in corpus for word in document.split()))
    vocabulary.sort()
    tfidf_matrix = np.zeros((len(corpus), len(vocabulary)))

    for i, document in enumerate(corpus):
        word_counts = Counter(document.split())
        for word, count in word_counts.items():
            tf = count / len(document.split())
            idf = log(len(corpus) / sum(1 for doc in corpus if word in doc.split()))
            tfidf_matrix[i, vocabulary.index(word)] = tf * idf

    return tfidf_matrix, vocabulary

# Binary SVM Classifier
class BinarySVM:
    def __init__(self, C=1.0, learning_rate=0.001, n_iters=1000):
        self.C = C
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for iteration in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y[idx] * (np.dot(x_i, self.weights) - self.bias) >= 1
                if condition:
                    self.weights -= self.learning_rate * (2 * self.C * self.weights)
                else:
                    self.weights -= self.learning_rate * (2 * self.C * self.weights - np.dot(x_i, y[idx]))
                    self.bias -= self.learning_rate * y[idx]

            # Debugging: print weights and bias at every 100 iterations
            if iteration % 100 == 0:
                print(f"Iteration {iteration}: Weights: {self.weights}, Bias: {self.bias}")

    def predict(self, X):
        return np.sign(np.dot(X, self.weights) - self.bias)

# One-vs-One SVM Classifier
class OneVsOneSVM:
    def __init__(self, C=1.0, learning_rate=0.001, n_iters=1000):
        self.C = C
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.classifiers = []

    def fit(self, X, y):
        unique_classes = np.unique(y)
        n_classes = len(unique_classes)
        print(f"Unique classes: {unique_classes}")  # Debugging line
        for i in range(n_classes):
            for j in range(i + 1, n_classes):
                class_i = unique_classes[i]
                class_j = unique_classes[j]

                # Filter the data for the two classes
                idx = np.where((y == class_i) | (y == class_j))
                X_filtered = X[idx]
                y_filtered = y[idx]

                # Convert class labels to +1 and -1
                y_filtered = np.where(y_filtered == class_i, 1, -1)

                # Train the binary classifier
                clf = BinarySVM(C=self.C, learning_rate=self.learning_rate, n_iters=self.n_iters)
                clf.fit(X_filtered, y_filtered)
                self.classifiers.append((clf, (class_i, class_j)))

    def predict(self, X):
        n_classes = len(np.unique(y))
        votes = np.zeros((X.shape[0], n_classes))

        for clf, class_labels in self.classifiers:
            predictions = clf.predict(X)
            print(f"Classifier {class_labels} predictions: {predictions}")  # Debugging line
            for idx, pred in enumerate(predictions):
                if pred == 1:
                    votes[idx, class_labels[0] - 1] += 1  # Adjust index for zero-based array
                else:
                    votes[idx, class_labels[1] - 1] += 1  # Adjust index for zero-based array

        # Return the class with the most votes
        final_predictions = np.argmax(votes, axis=1) + 1  # Adjust index to match class labels starting from 1
        print(f"Votes: {votes}")  # Debugging line
        print(f"Final predictions: {final_predictions}")  # Debugging line
        return final_predictions

# Sample dataset
news_articles = [
    "government passes new law",                     # Politics
    "football match ends in draw",                    # Sports
    "new technology in smartphones",                  # Technology
    "politician gives a speech",                      # Politics
    "sports event attracts large crowd",              # Sports
    "new tech gadgets released this year",            # Technology
    "election results announced",                     # Politics
    "soccer team wins championship",                  # Sports
    "AI advancements in healthcare",                  # Technology
    "local council meeting updates",                  # Politics
    "basketball game highlights",                     # Sports
    "innovations in artificial intelligence",         # Technology
    "politician's new policy proposal",               # Politics
    "swimming competition results",                   # Sports
    "latest trends in smartphone design",             # Technology
    "government budget allocation review",            # Politics
    "volleyball tournament concludes",                # Sports
    "breakthrough in renewable energy",               # Technology
    "senator's speech on climate change",             # Politics
    "baseball team training camp",                    # Sports
    "tech company announces new software",            # Technology
    "international relations summit",                 # Politics
    "world cup qualifying matches",                   # Sports
    "smart home devices market growth",               # Technology
    "legislative bill discussion",                    # Politics
    "rugby game results",                            # Sports
    "technology in education sector",                # Technology
    "mayoral election debate",                        # Politics
    "national soccer league season start",            # Sports
    "advancements in quantum computing",              # Technology
    "press conference on new laws",                   # Politics
    "hockey match final scores",                      # Sports
    "virtual reality applications",                   # Technology
    "parliamentary debate on economy",                # Politics
    "college basketball championship",                # Sports
    "latest trends in gadget development",            # Technology
    "congressional committee meeting",                # Politics
    "tennis tournament highlights",                   # Sports
    "emerging technologies in fintech",               # Technology
    "state of the union address",                     # Politics
    "motorsports event results",                      # Sports
    "new innovations in medical tech",                # Technology
    "political rally speeches",                       # Politics
    "community sports league updates",                # Sports
    "tech industry conference news",                  # Technology
    "government response to natural disaster",        # Politics
    "annual sports awards ceremony",                  # Sports
    "technological impacts on job market"             # Technology
]

labels = [
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3   # Technology
]

# Convert news articles to TF-IDF features
preprocessed_articles = [preprocess_text(article) for article in news_articles]
X, vocabulary = compute_tfidf(preprocessed_articles)
y = np.array(labels)

# Initialize and train the One-vs-One SVM
ovo_svm = OneVsOneSVM(C=1.0, learning_rate=0.001, n_iters=1000)
ovo_svm.fit(X, y)

# Test the classifier on the training data
predictions = ovo_svm.predict(X)
print("Predictions:", predictions)
print("Actual Labels:", y)


Unique classes: [1 2 3]
Iteration 0: Weights: [ 1.28267341e-03  0.00000000e+00  0.00000000e+00  9.31678376e-04
  1.00763566e-03  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -7.34970303e-04  0.00000000e+00 -9.37290881e-04 -2.06040819e-03
  1.25724960e-03  0.00000000e+00  9.31678376e-04 -9.37290881e-04
  0.00000000e+00 -1.80394368e-03  9.35416299e-04  9.35416299e-04
 -1.27499275e-03  1.27754785e-03 -9.67800253e-04  0.00000000e+00
 -1.23975336e-03  0.00000000e+00 -1.24472729e-03  7.80326093e-04
  1.27754785e-03  9.24247279e-04 -7.34970303e-04 -9.41051322e-04
  2.08088626e-03  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.25724960e-03 -9.15041702e-04  1.27244277e-03
  0.00000000e+00  2.04391287e-03  0.00000000e+00 -9.15041702e-04
  0.00000000e+00 -1.65849034e-03 -9.52423410e-04  0.00000000e+00
 -9.15041702e-04  0.00000000e+00  0.00000000e+00 -2.04791039e-03
  1.22250060e-03  1.32132668e-03  0.00000000e+00  0.00000000e+00
 -2.06460759e-03 -9.52423410e-04  0.00000000

In [42]:
import numpy as np
import pandas as pd
from collections import Counter
from math import log
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder

# Preprocess text
def preprocess_text(text):
    text = str(text).lower()
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    
    # Remove stop words
    filtered_list = [w for w in word_tokens if not w in stop_words]
    
    # Remove numbers and special symbols
    filtered_list = [w for w in filtered_list if w.isalnum() and not w.isdigit()]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_list = [lemmatizer.lemmatize(w, 'v') for w in filtered_list]
    
    return ' '.join(lemmatized_list)

# Compute TF-IDF
def compute_tfidf(corpus):
    vocabulary = list(set(word for document in corpus for word in document.split()))
    vocabulary.sort()
    tfidf_matrix = np.zeros((len(corpus), len(vocabulary)))

    for i, document in enumerate(corpus):
        word_counts = Counter(document.split())
        for word, count in word_counts.items():
            tf = count / len(document.split())
            idf = log(len(corpus) / sum(1 for doc in corpus if word in doc.split()))
            tfidf_matrix[i, vocabulary.index(word)] = tf * idf

    return tfidf_matrix, vocabulary

# Binary SVM Classifier
class BinarySVM:
    def __init__(self, C=1.0, learning_rate=0.001, n_iters=1000):
        self.C = C
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for iteration in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y[idx] * (np.dot(x_i, self.weights) - self.bias) >= 1
                if condition:
                    self.weights -= self.learning_rate * (2 * self.C * self.weights)
                else:
                    self.weights -= self.learning_rate * (2 * self.C * self.weights - np.dot(x_i, y[idx]))
                    self.bias -= self.learning_rate * y[idx]

            # Debugging: print weights and bias at every 100 iterations
            if iteration % 100 == 0:
                print(f"Iteration {iteration}: Weights: {self.weights}, Bias: {self.bias}")

    def predict(self, X):
        return np.sign(np.dot(X, self.weights) - self.bias)

# One-vs-One SVM Classifier
class OneVsOneSVM:
    def __init__(self, C=1.0, learning_rate=0.001, n_iters=1000):
        self.C = C
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.classifiers = []

    def fit(self, X, y):
        unique_classes = np.unique(y)
        n_classes = len(unique_classes)
        print(f"Unique classes: {unique_classes}")  # Debugging line
        for i in range(n_classes):
            for j in range(i + 1, n_classes):
                class_i = unique_classes[i]
                class_j = unique_classes[j]

                # Filter the data for the two classes
                idx = np.where((y == class_i) | (y == class_j))
                X_filtered = X[idx]
                y_filtered = y[idx]

                # Convert class labels to +1 and -1
                y_filtered = np.where(y_filtered == class_i, 1, -1)

                # Train the binary classifier
                clf = BinarySVM(C=self.C, learning_rate=self.learning_rate, n_iters=self.n_iters)
                clf.fit(X_filtered, y_filtered)
                self.classifiers.append((clf, (class_i, class_j)))

    def predict(self, X):
        n_classes = len(np.unique(y))
        votes = np.zeros((X.shape[0], n_classes))

        for clf, class_labels in self.classifiers:
            predictions = clf.predict(X)
            print(f"Classifier {class_labels} predictions: {predictions}")  # Debugging line
            for idx, pred in enumerate(predictions):
                if pred == 1:
                    votes[idx, class_labels[0] - 1] += 1  # Adjust index for zero-based array
                else:
                    votes[idx, class_labels[1] - 1] += 1  # Adjust index for zero-based array

        # Return the class with the most votes
        final_predictions = np.argmax(votes, axis=1) + 1  # Adjust index to match class labels starting from 1
        print(f"Votes: {votes}")  # Debugging line
        print(f"Final predictions: {final_predictions}")  # Debugging line
        return final_predictions

# Sample dataset
news_articles = [
    "government passes new law",                     # Politics
    "football match ends in draw",                    # Sports
    "new technology in smartphones",                  # Technology
    "politician gives a speech",                      # Politics
    "sports event attracts large crowd",              # Sports
    "new tech gadgets released this year",            # Technology
    "election results announced",                     # Politics
    "soccer team wins championship",                  # Sports
    "AI advancements in healthcare",                  # Technology
    "local council meeting updates",                  # Politics
    "basketball game highlights",                     # Sports
    "innovations in artificial intelligence",         # Technology
    "politician's new policy proposal",               # Politics
    "swimming competition results",                   # Sports
    "latest trends in smartphone design",             # Technology
    "government budget allocation review",            # Politics
    "volleyball tournament concludes",                # Sports
    "breakthrough in renewable energy",               # Technology
    "senator's speech on climate change",             # Politics
    "baseball team training camp",                    # Sports
    "tech company announces new software",            # Technology
    "international relations summit",                 # Politics
    "world cup qualifying matches",                   # Sports
    "smart home devices market growth",               # Technology
    "legislative bill discussion",                    # Politics
    "rugby game results",                            # Sports
    "technology in education sector",                # Technology
    "mayoral election debate",                        # Politics
    "national soccer league season start",            # Sports
    "advancements in quantum computing",              # Technology
    "press conference on new laws",                   # Politics
    "hockey match final scores",                      # Sports
    "virtual reality applications",                   # Technology
    "parliamentary debate on economy",                # Politics
    "college basketball championship",                # Sports
    "latest trends in gadget development",            # Technology
    "congressional committee meeting",                # Politics
    "tennis tournament highlights",                   # Sports
    "emerging technologies in fintech",               # Technology
    "state of the union address",                     # Politics
    "motorsports event results",                      # Sports
    "new innovations in medical tech",                # Technology
    "political rally speeches",                       # Politics
    "community sports league updates",                # Sports
    "tech industry conference news",                  # Technology
    "government response to natural disaster",        # Politics
    "annual sports awards ceremony",                  # Sports
    "technological impacts on job market"             # Technology
]

labels = [
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3,  # Technology
    1,  # Politics
    2,  # Sports
    3   # Technology
]

# Preprocess and convert to TF-IDF
preprocessed_articles = [preprocess_text(article) for article in news_articles]
X, vocabulary = compute_tfidf(preprocessed_articles)
y = np.array(labels)

# Initialize and train the One-vs-One SVM
ovo_svm = OneVsOneSVM(C=1.0, learning_rate=0.001, n_iters=1000)
ovo_svm.fit(X, y)

# Test the classifier on the training data
predictions = ovo_svm.predict(X)
print("Predictions:", predictions)
print("Actual Labels:", y)

Unique classes: [1 2 3]
Iteration 0: Weights: [ 0.          0.          0.          0.00093917  0.00101574  0.
  0.          0.         -0.00074088  0.         -0.00094483 -0.00207697
  0.00126736  0.          0.00093917 -0.00094483  0.         -0.00181845
  0.00094294  0.00094294 -0.00128524  0.00128782  0.          0.
 -0.00124972  0.         -0.00125474  0.0007866   0.00128782  0.00093168
 -0.00074088 -0.00094862  0.00209762  0.          0.          0.
  0.          0.00126736 -0.0009224   0.00128267  0.          0.00206035
  0.         -0.0009224   0.         -0.00060822 -0.00096008  0.
 -0.0009224   0.          0.         -0.00206438  0.00123233  0.00133195
  0.          0.         -0.00208121 -0.00096008  0.          0.
  0.          0.          0.          0.00126229  0.         -0.00074088
  0.          0.00092055  0.00095816 -0.00062802  0.00126736  0.00093168
  0.         -0.00202766  0.00127244  0.          0.00182209  0.
 -0.000765    0.          0.00139957  0.          0.0