Import Libraries

In [13]:
#import libraries
import pandas as pd
import numpy as np
import linecache
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from decimal import Decimal, getcontext

Parse the mtx data into dataX and dataY where dataX is the term and dataY is the type of article

In [3]:

#for the emtpy matrix above, the rows represent the type of article and matrix[row][column] is the number of occurences of term #column in articles of type row
# row 0 <=> business, row 1 <=> entertainment, row 2 <=> politics, row 3 <=> sport, row 4 <=> tech
mtx = 'bbc.mtx'
docs = 'bbc.docs'

docs_file = open(docs,'r')
docs_content = docs_file.read()
docs_content = docs_content.split('\n')
dataX = []
dataY = []
with open(mtx,'r') as file:
    #skip first two metadata lines
    file.readline()
    file.readline()

    #now read rest of the file
 
    for line in file:
        '''Each row in bbc.mtx, except the first two, represents the frequency of a term in a given
        article. For example, row 812 (“2 528
        5.0”) indicates that term 2 (“sale”) occurs 5 times
        in article 528 (entertainment.018).'''
        words = line.split()
        term = int(words[0])
        article = int(words[1])

        article_type = docs_content[article-1].split('.')[0]
        article_type_lower = article_type.lower()  # Convert to lowercase for case-insensitivity

        if article_type_lower == "business":
            dataY.append(0)
        elif article_type_lower == "entertainment":
            dataY.append(1)
        elif article_type_lower == "politics":
            dataY.append(2)
        elif article_type_lower == "sport":
            dataY.append(3)
        elif article_type_lower == "tech":
            dataY.append(4)
        else:
            raise ValueError("Invalid article type: {}".format(article_type))
        dataX.append(term)

dataX = np.array(dataX)
dataY = np.array(dataY)

print(dataX.shape)
print(dataY.shape)
print(dataY)


(286774,)
(286774,)
[0 0 0 ... 4 4 4]


After having parsed the data from the files, split the data into training and test sets

In [25]:
x_train,x_test,y_train,y_test = train_test_split(dataX, dataY, test_size=0.2, random_state=1)

print("x_train.shape:", x_train.shape, "x_test:",x_test.shape, "y_train:",y_train.shape, "y_test:",y_test.shape)

print(y_train)
print(y_test)

x_train.shape: (229419,) x_test: (57355,) y_train: (229419,) y_test: (57355,)
[3 0 0 ... 3 3 0]
[4 1 3 ... 4 2 1]


Next, calculate the priors

In [26]:
occurences = np.array([0,0,0,0,0])
#index: 0 <=> business, 1 <=> entertainment, 2 <=> politics, 3 <=> sport, 4 <=> tech
for i in range(len(y_test)):
    occurences[y_test[i]] += 1
    
priors = occurences

print (priors)
        



[12062  8976 12196 11651 12470]


Next calculate P(x|Ci) with regularization where alpha = 1

In [27]:
probability_matrix = np.zeros((5,9635))

for i in range(len(x_train)):
    probability_matrix[y_train[i]][x_train[i]-1] += 1

for i in range(len(probability_matrix)):
    print(len(y_train[y_train == i]))
    probability_matrix[i]  = (probability_matrix[i]+1)/(len(y_train[y_train == i]) + 2)

print(np.sum(probability_matrix,axis = 1))



48056
35677
48468
46990
50228
[1.2004453  1.26999075 1.19874149 1.20499234 1.19177782]


Next predict the x_test_set

In [28]:
y_pred = np.zeros((x_test.shape))

for i in range(len(x_test)):
    probs = np.array([0,0,0,0,0])

    for j in range(5):
        probs[j] = priors[j] * probability_matrix[j][x_test[i]-1]
    y_pred[i] = np.argmax(probs)

accuracy = accuracy_score(y_test,y_pred)

print(accuracy)

0.4267631418359341


Next predict the x_train_set

In [29]:
y_pred = np.zeros((x_train.shape))

for i in range(len(x_train)):
    probs = np.array([0,0,0,0,0])

    for j in range(5):
        probs[j] = priors[j] * probability_matrix[j][x_train[i]-1]
    
    y_pred[i] = np.argmax(probs)

accuracy = accuracy_score(y_train,y_pred)

print(accuracy)

0.458131192272654
