In [152]:
import os
import math
import random
import sys
from nltk.corpus import stopwords

In [153]:
# Function to read in the data from the directory
def read_data(data_dir):
    data = []
    for category in os.listdir(data_dir):
        category_path = os.path.join(data_dir, category)
        for document in os.listdir(category_path):
            with open(os.path.join(category_path, document), 'rb') as f:
                content = f.read().decode(errors='ignore')
                data.append((content, category))
    return data

In [154]:
# Function to split the data into training and testing sets
def split_data(data):
    l = len(data)
    random.shuffle(data)
    train_data = data[:round(0.5*l)]
    test_data = data[round(0.5*l):]
    return train_data, test_data

In [155]:
# Function to preprocess the text data
def preprocess_data(data):
    preprocessed_data = {}
    for document, category in data:
        words = [i.lower() for i in document.split() if i.isalnum() and i not in s]
        if category not in preprocessed_data.keys():
            preprocessed_data[category] = []
        preprocessed_data[category].extend(words)
    return preprocessed_data

In [156]:
# Function to compute the vocabulary from the preprocessed data
def compute_vocabulary(data):
    vocabulary = set()
    for category, words in data.items():
        for word in words:
            vocabulary.add(word)
    return vocabulary

In [157]:
# Function to compute the prior probabilities
def compute_prior_probabilities(preprocessed_train_data):
    category_counts = dict.fromkeys(preprocessed_train_data.keys(),0)
    for category, words in preprocessed_train_data.items():
        category_counts[category] = len([j for i,j in train_data if j==category])
    num_documents = len(train_data)
    prior_probabilities = {}
    for category in category_counts:
        prior_probabilities[category] = category_counts[category] / num_documents
    return prior_probabilities

In [158]:
# Function to compute the conditional probabilities
def compute_conditional_probabilities(preprocessed_train_data, prior_probabilities, vocabulary):
    lv = len(vocabulary)
    conditional_probabilities = {}
    for category, words in preprocessed_train_data.items():
        total_words = len(words)
        conditional_probabilities[category] = {}
        word_set = set(words)
        for word in word_set:
            conditional_probabilities[category][word] =  prior_probabilities[category]*(words.count(word) + 1) / (lv + total_words)
    return conditional_probabilities

In [159]:
# Function to predict the category of a document
def predict_document_category(words, prior_probabilities, conditional_probabilities):
    probabilities = {}
    for category in prior_probabilities.keys():
        probabilities[category] = math.log(prior_probabilities[category])
        for word in words:
            if word in conditional_probabilities[category]:
                probabilities[category] += math.log(conditional_probabilities[category][word])
    # print(probabilities)
    return min(probabilities, key=probabilities.get)

In [160]:
# Set the path to the data directory
data_dir = "./20_newsgroups"
# print(os.getcwd())

# Stop words set
s = set(stopwords.words('english'))

# Read the data
data = read_data(data_dir)

# Split the data into training and testing datasets
train_data, test_data = split_data(data)
# print(len(train_data), len(test_data))

# Preprocess the training data
preprocessed_train_data = preprocess_data(train_data)

# Compute the vocabulary
vocabulary = compute_vocabulary(preprocessed_train_data)


In [167]:
# Compute the prior probabilities
prior_probabilities = compute_prior_probabilities(preprocessed_train_data)
print(prior_probabilities)

{'talk.politics.misc': 0.049109821964392876, 'comp.windows.x': 0.05091018203640728, 'comp.sys.ibm.pc.hardware': 0.05001000200040008, 'sci.med': 0.04960992198439688, 'rec.motorcycles': 0.04490898179635927, 'rec.sport.hockey': 0.05181036207241448, 'rec.sport.baseball': 0.05251050210042008, 'misc.forsale': 0.04980996199239848, 'comp.graphics': 0.05091018203640728, 'rec.autos': 0.04990998199639928, 'comp.os.ms-windows.misc': 0.05081016203240648, 'talk.religion.misc': 0.04920984196839368, 'comp.sys.mac.hardware': 0.04940988197639528, 'sci.electronics': 0.05151030206041208, 'talk.politics.guns': 0.052410482096419284, 'soc.religion.christian': 0.05001000200040008, 'sci.space': 0.04860972194438888, 'sci.crypt': 0.05211042208441689, 'talk.politics.mideast': 0.04830966193238648, 'alt.atheism': 0.04810962192438488}


In [162]:
# Compute the conditional probabilities
conditional_probabilities = compute_conditional_probabilities(preprocessed_train_data, prior_probabilities, vocabulary)


In [None]:
# for i in conditional_probabilities.keys():
#     print(conditional_probabilities[i])

In [168]:
# Preprocess the testing data
preprocessed_test_data = []
for i in range(len(test_data)):
    preprocessed_test_data.append((test_data[i][1], [word.lower() for word in test_data[i][0].split() if word.isalnum() and word not in s]))

In [170]:
# Compute the accuracy on the testing data
num_correct = 0
for category, words in preprocessed_test_data:
    predicted_category = predict_document_category(words, prior_probabilities, conditional_probabilities)
    # print(category, predicted_category)
    if predicted_category == category:
        num_correct += 1
accuracy = num_correct / len(preprocessed_test_data)
print("Accuracy:{0:.2f}%".format(accuracy*100))

Accuracy:74.36%
