In [1]:
import math
import re
from collections import defaultdict

# Sample data: labelled training sentences written as (text, label) pairs and
# expanded into the {"content": ..., "class": ...} records used by later cells.
labelled_sentences = [
    ("Nvidia GPU is the best in the world.", "computer graphics"),
    ("Nvidia is giving tough competition to AMD.", "computer graphics"),
    ("We were running our application with GTX 1050 (High-end GPU) still it didn't work then we realized the problem was with the OS.", "computer graphics"),
    ("GPU, Ganpat Pandey University, is located in Maharashtra.", "not computer graphics"),
]
documents = [{"content": text, "class": label} for text, label in labelled_sentences]

# Unlabelled sentence that the classifier is asked to label at the end.
test_document = "Please buy GPU from our store."

In [2]:
def preprocess(text):
    """Turn raw text into a list of lowercase, letters-only tokens.

    The text is lowercased, every character that is not ``a``-``z`` or
    whitespace is deleted (so digits vanish and "high-end" becomes "highend"),
    and the remainder is split on whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings; empty when nothing but removed characters
        or whitespace is left.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return letters_only.split()

# Tally how many training documents carry each class label.
total_docs = len(documents)
class_counts = {}
for doc in documents:
    label = doc["class"]
    class_counts[label] = class_counts.get(label, 0) + 1

print("class: counts - ",class_counts)

# Prior probability of a class = its share of the training documents.
priors = {cls: clscnt / total_docs for cls, clscnt in class_counts.items()}

print('priors: ',priors)

# `priors` now holds the prior probability of each class.



class: counts -  {'computer graphics': 3, 'not computer graphics': 1}
priors:  {'computer graphics': 0.75, 'not computer graphics': 0.25}


In [3]:
# Per-class word frequencies and per-class token totals.
# word_counts[cls][word] -> number of times `word` occurs in documents of class `cls`
# total_words_per_class[cls] -> number of tokens across all documents of class `cls`
word_counts = defaultdict(lambda: defaultdict(int))
total_words_per_class = defaultdict(int)  # unseen classes read as 0

for doc in documents:
    cls = doc["class"]
    class_word_counts = word_counts[cls]  # touching the key registers the class even if the document has no tokens
    for word in preprocess(doc["content"]):
        class_word_counts[word] += 1
        total_words_per_class[cls] += 1

print("words count: ", word_counts)

# Vocabulary: every distinct word seen in any class.
vocab = {word for counts in word_counts.values() for word in counts}
vocab_size = len(vocab)
alpha = 1  # additive (Laplace) smoothing strength

# likelihoods[cls][word] = P(word | cls) with additive smoothing:
#     (count(word, cls) + alpha) / (tokens in cls + alpha * |vocab|)
# The table is keyed class-first because that is how predict_class looks it up,
# and it is filled for EVERY vocabulary word of every class: a word that never
# occurs in a class must still get its smoothed, non-zero probability instead
# of silently falling back to the defaultdict's 0.0.
likelihoods = defaultdict(lambda: defaultdict(float))
for cls, counts in word_counts.items():
    denominator = total_words_per_class[cls] + alpha * vocab_size
    for word in vocab:
        likelihoods[cls][word] = (counts.get(word, 0) + alpha) / denominator




words count:  defaultdict(<function <lambda> at 0x000001F96B020B80>, {'computer graphics': defaultdict(<class 'int'>, {'nvidia': 2, 'gpu': 2, 'is': 2, 'the': 4, 'best': 1, 'in': 1, 'world': 1, 'giving': 1, 'tough': 1, 'competition': 1, 'to': 1, 'amd': 1, 'we': 2, 'were': 1, 'running': 1, 'our': 1, 'application': 1, 'with': 2, 'gtx': 1, 'highend': 1, 'still': 1, 'it': 1, 'didnt': 1, 'work': 1, 'then': 1, 'realized': 1, 'problem': 1, 'was': 1, 'os': 1}), 'not computer graphics': defaultdict(<class 'int'>, {'gpu': 1, 'ganpat': 1, 'pandey': 1, 'university': 1, 'is': 1, 'located': 1, 'in': 1, 'maharashtra': 1})})


In [4]:
def predict_class(test_doc):
    """Return the most probable class label for ``test_doc`` (multinomial Naive Bayes).

    Each class is scored as ``log P(class) + sum(log P(word | class))`` over the
    document's in-vocabulary words, where ``P(word | class)`` is the additively
    smoothed estimate
    ``(count(word, class) + alpha) / (tokens in class + alpha * vocab_size)``.
    The estimate is computed here from the raw counts so that every vocabulary
    word has a non-zero probability for every class, including classes in which
    the word never occurred.

    Reads the module-level ``priors``, ``word_counts``, ``total_words_per_class``,
    ``vocab``, ``vocab_size`` and ``alpha``. Assumes ``alpha > 0`` and non-zero
    priors, otherwise ``math.log`` can be handed 0.

    Args:
        test_doc: Raw text to classify; it is tokenized with ``preprocess``.

    Returns:
        The key of ``priors`` with the highest score. On a tie the class that
        comes first in ``priors`` wins.

    Raises:
        ValueError: If ``priors`` is empty (there is no class to choose).
    """
    # Words never seen in training carry no evidence for any class, so they are
    # dropped rather than penalized.
    known_words = [word for word in preprocess(test_doc) if word in vocab]

    class_scores = {}
    for cls, prior in priors.items():
        counts = word_counts.get(cls, {})
        denominator = total_words_per_class.get(cls, 0) + alpha * vocab_size

        # Sum logs instead of multiplying probabilities: the ranking is the
        # same, but long documents no longer underflow every score to 0.0.
        score = math.log(prior)
        for word in known_words:
            score += math.log((counts.get(word, 0) + alpha) / denominator)
        class_scores[cls] = score

    # Choose the class with the highest score.
    return max(class_scores, key=class_scores.get)


# Classify the sample sentence held in `test_document` and print the predicted label.
predicted_class = predict_class(test_document)
print(f"Predicted class: {predicted_class}")


Predicted class: computer graphics
