In [1]:
import math
from collections import defaultdict
import re
import pandas as pd

In [2]:
# Load the labelled training set; the relative path 'train.csv' is resolved
# against the kernel's current working directory.
training_data = pd.read_csv('train.csv')

In [3]:
# Preview the first two rows to see which columns the CSV provides.
training_data.head(2)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative


In [4]:
# The textID column is not read by any of the counting/classification code
# below, so it is removed here.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError for
# the already-missing column.
training_data = training_data.drop(columns='textID', errors='ignore')

In [5]:
# Show the first five rows again to confirm that textID is gone.
training_data.head()

Unnamed: 0,text,selected_text,sentiment
0,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,my boss is bullying me...,bullying me,negative
3,what interview! leave me alone,leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [6]:
# Preprocessing: tokenization and cleaning
def preprocess_text(text):
    """Turn a raw sentence into a list of lower-case alphabetic tokens.

    The text is lower-cased, every character that is not ``a``-``z`` or
    whitespace is deleted (so "don't" becomes "dont" and digits/punctuation
    vanish), and the result is split on runs of whitespace.

    Args:
        text: The sentence to tokenize. Non-string values, such as ``None`` or
            the float NaN pandas uses for an empty CSV field, are treated as
            an empty sentence.

    Returns:
        list[str]: The tokens in order of appearance; ``[]`` when nothing
        alphabetic remains or when ``text`` is not a string.
    """
    # Missing values read from a CSV arrive as float NaN, which has no
    # .lower(); treat anything that is not a string as "no words".
    if not isinstance(text, str):
        return []
    text = text.lower()
    # Deletes (rather than replaces with a space) every non a-z, non-whitespace char.
    text = re.sub(r'[^a-z\s]', '', text)
    return text.split()


In [7]:
# Empty accumulators that the counting pass over the training data fills in.
word_counts = defaultdict(lambda: defaultdict(int))  # label -> word -> occurrences (missing keys read as 0)
class_counts = defaultdict(int)  # label -> tally (missing keys read as 0)
vocab = set()  # distinct words

In [8]:
# Step 1: Populate word and class counts from training data
# Start from empty accumulators so that re-running this cell does not add the
# same sentences to the counts a second time.
vocab.clear()
class_counts.clear()
word_counts.clear()

# Walk the two needed columns directly; iterrows() would build a full Series
# object for every row just to read these two fields.
for sentence, sentiment in zip(training_data['text'], training_data['sentiment']):
    # Only include "positive" or "negative" sentiments for classification (adjust as needed)
    if sentiment not in ('positive', 'negative'):
        continue

    words = preprocess_text(sentence)
    class_counts[sentiment] += 1  # one more sentence seen for this label
    for word in words:
        word_counts[sentiment][word] += 1
        vocab.add(word)

In [9]:
# Step 2: Compute prior probabilities
# Normalise by the number of sentences that were actually counted (the labels
# present in class_counts), not by len(training_data): rows whose sentiment was
# skipped during counting would otherwise leave the priors summing to less than 1.
total_sentences = sum(class_counts.values())
prior_prob = {label: count / total_sentences for label, count in class_counts.items()}

In [10]:
# Step 3: Compute likelihood with add-1 smoothing
# Vocabulary size is fixed here once and reused as the smoothing term.
vocab_size = len(vocab)
# label -> word -> probability; keys that were never assigned read as 0.0.
likelihoods = defaultdict(lambda: defaultdict(float))

In [11]:
for label in class_counts:
    label_counts = word_counts[label]
    total_words_in_class = sum(label_counts.values())
    # The add-1 denominator is identical for every word of this class.
    smoothed_total = total_words_in_class + vocab_size
    for word in vocab:
        # Add-1 smoothing.
        # .get() rather than label_counts[word]: indexing the defaultdict would
        # insert a zero entry for every vocabulary word this class never saw.
        likelihoods[label][word] = (label_counts.get(word, 0) + 1) / smoothed_total

In [12]:
# Step 4: Define function to classify new sentences
def classify_sentence(sentence):
    """Return the label with the highest Naive Bayes log-posterior for ``sentence``.

    The sentence is tokenized with ``preprocess_text``. For every label in
    ``class_counts`` the score is ``log(prior_prob[label])`` plus the sum of
    ``log(likelihoods[label][word])`` over the tokens. A token missing from
    ``likelihoods[label]`` is scored with the add-1 probability of a word
    that occurred zero times in that class.

    Args:
        sentence: Raw text to classify.

    Returns:
        The label whose log-posterior is largest. On a tie, the label that
        comes first when iterating ``class_counts`` wins.

    Raises:
        ValueError: If ``class_counts`` is empty (there is nothing to choose from).
    """
    words = preprocess_text(sentence)
    posterior_prob = {}

    for label in class_counts:
        label_likelihoods = likelihoods[label]
        # Denominator of the unseen-word probability. It is constant per label,
        # so it is summed once here; written inline as the .get() default it was
        # re-summed for every single word because defaults are evaluated eagerly.
        unseen_denominator = sum(word_counts[label].values()) + vocab_size

        log_prob = math.log(prior_prob[label])  # Start with log(prior)
        for word in words:
            # Work in log space so many small probabilities do not underflow.
            log_prob += math.log(label_likelihoods.get(word, 1 / unseen_denominator))
        posterior_prob[label] = log_prob

    # Classify the sentence based on maximum posterior probability
    return max(posterior_prob, key=posterior_prob.get)

In [13]:
# Try the classifier on a hand-written sentence. preprocess_text deletes the
# apostrophe and "!" before lookup, so "don't" is scored as the single token "dont".
predicted_class = classify_sentence("I don't kill this product! but i love this and love more")

In [14]:
# Display the predicted label as this cell's output.
predicted_class

'positive'