In [38]:
import math
import re
from collections import defaultdict, Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.util import ngrams
# Load the train / validation splits.
# Each file is read with header=None, so its first row lands in the frame as
# data; iloc[1:] then discards that row and the index is renumbered from 0.
train_data = (
    pd.read_csv('train.tsv', sep='\t', header=None, names=["Text", "Label"])
    .iloc[1:]
    .reset_index(drop=True)
)
valid_data = (
    pd.read_csv('valid.tsv', sep='\t', header=None, names=["Text", "Label"])
    .iloc[1:]
    .reset_index(drop=True)
)

# English stop-word set from NLTK; preprocess_text drops any token found in it.
# The stop-word corpus is a separate NLTK download. Fetch it on demand so the
# notebook still runs top-to-bottom on a machine where it was never installed,
# instead of failing with LookupError.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Preprocess text function with stop word removal
def preprocess_text(text, n=1):
    """Normalize a raw document and return its stop-word-filtered word n-grams.

    Processing order: lowercase the text, strip literal backslash-u escape
    sequences (a backslash, ``u`` and four hex digits), drop words that start
    with ``@`` or ``#``, keep only purely alphabetic tokens, remove tokens
    present in the module-level ``stop_words`` set, then build n-grams.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example the NaN pandas
        produces for an empty cell) yields an empty list instead of raising
        ``AttributeError`` on ``.lower()``.
    n : int, default 1
        Size of the n-grams to emit. The default of 1 returns the filtered
        tokens themselves, which is what the rest of the notebook expects.

    Returns
    -------
    list of str
        One string per n-gram, with the n-gram's tokens joined by a space.
    """
    if not isinstance(text, str):
        return []

    text = text.lower()
    # Remove Unicode escape sequences that survived as literal text
    text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)
    # Remove @mentions and #hashtags (\B keeps things like "user@host" intact)
    text = re.sub(r'\B[@#]\w+', '', text)
    # Tokenize: runs of ASCII letters only, so digits and punctuation are dropped
    tokens = re.findall(r'\b[a-z]+\b', text)
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # ngrams() yields tuples; join each one into a single readable string
    return [' '.join(gram) for gram in ngrams(filtered_tokens, n)]

# Tokenize both splits with the same preprocessing function so training and
# validation documents end up in an identical token representation.
for split in (train_data, valid_data):
    split['Tokens'] = split['Text'].apply(preprocess_text)



In [39]:
# unique_tokens_negative = len(word_counts['negative'])
# print(f"Number of unique tokens in 'negative' class: {unique_tokens_negative}")

In [40]:
# Documents per class and the total number of training documents
class_counts = train_data['Label'].value_counts()
total_docs = train_data.shape[0]

# Class priors P(class): each class's share of the training documents
class_priors = {label: n_docs / total_docs for label, n_docs in class_counts.items()}

print('class counts:', class_counts)
print('total docs:', total_docs)
print('class priors:', class_priors)
# Per-class token frequencies and per-class total token counts
word_counts = defaultdict(Counter)
total_words_in_class = defaultdict(int)

# Walk the two needed columns in lockstep; iterrows() would construct a full
# Series for every row only to read these two fields.
for label, tokens in zip(train_data['Label'], train_data['Tokens']):
    word_counts[label].update(tokens)
    total_words_in_class[label] += len(tokens)

# Print a bounded preview only: dumping the complete Counters writes every
# distinct token of every class into the cell output.
print('word counts (top 5 per class):',
      {label: counts.most_common(5) for label, counts in word_counts.items()})
print('total words in class:', total_words_in_class)

# Vocabulary: every distinct token seen in any class (union of the Counter keys)
vocabulary = set().union(*word_counts.values())
vocab_size = len(vocabulary)

# Show a small sorted sample rather than the entire set, which would flood
# the cell output with one entry per vocabulary word.
print('vocabulary sample:', sorted(vocabulary)[:10])
print('vocab size:', vocab_size)

# Word likelihoods P(word | class) with Laplace (add-one) smoothing:
#   (count of word in class + 1) / (total tokens in class + vocabulary size)
word_likelihoods = defaultdict(dict)
for cls, counts in word_counts.items():
    # The denominator is the same for every word of a class, so compute it once
    denominator = total_words_in_class[cls] + vocab_size
    # Counter returns 0 for words absent from this class, so they get 1 / denominator
    word_likelihoods[cls] = {word: (counts[word] + 1) / denominator for word in vocabulary}

# Report table sizes only; printing every likelihood would emit one entry per
# (class, vocabulary word) pair.
print('word likelihoods (entries per class):',
      {cls: len(likelihoods) for cls, likelihoods in word_likelihoods.items()})

class counts: Label
negative    613
neutral     613
positive    613
Name: count, dtype: int64
total docs: 1839
class priors: {'negative': 0.3333333333333333, 'neutral': 0.3333333333333333, 'positive': 0.3333333333333333}
total words in class: defaultdict(<class 'int'>, {'negative': 6692, 'neutral': 6419, 'positive': 6606})
vocab size: 6221


In [41]:
# Naive Bayes classification
def classify_naive_bayes(tokens):
    """Return the most probable class label for a tokenized document.

    Each class is scored as ``log P(class) + sum(log P(token | class))`` using
    the module-level ``class_priors``, ``word_likelihoods``,
    ``total_words_in_class`` and ``vocab_size``. Working in log space matters:
    multiplying the raw probabilities underflows to 0.0 for every class once a
    document has enough tokens, after which the argmax degenerates to "first
    class in ``class_priors``" regardless of the evidence.

    Parameters
    ----------
    tokens : iterable of str
        Preprocessed tokens of one document. An empty iterable is scored by
        the class priors alone.

    Returns
    -------
    The key of ``class_priors`` with the highest score. Ties go to the class
    that appears first when iterating ``class_priors``.
    """
    class_scores = {}
    for cls, prior in class_priors.items():
        likelihoods = word_likelihoods[cls]
        # Smoothed probability for a token with no entry in the likelihood
        # table (zero observed count): 1 / (tokens in class + vocabulary size)
        unseen_likelihood = 1 / (total_words_in_class[cls] + vocab_size)

        score = math.log(prior)
        for token in tokens:
            score += math.log(likelihoods.get(token, unseen_likelihood))
        class_scores[cls] = score

    # Return the class with the highest log-probability score
    return max(class_scores, key=class_scores.get)

# Predict a label for every validation document from its token list
predicted_labels = valid_data['Tokens'].apply(classify_naive_bayes)
valid_data['Predicted'] = predicted_labels


In [42]:
# Accuracy: fraction of validation rows whose predicted label equals the true label
is_correct = valid_data['Predicted'].eq(valid_data['Label'])
accuracy = is_correct.mean()
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 57.41%


In [43]:
# For each class, list the ten words with the largest smoothed likelihood
for cls in class_priors:
    print(f"Sample Word Likelihoods for '{cls}':")
    ranked = sorted(word_likelihoods[cls].items(), key=lambda pair: pair[1], reverse=True)
    top_words = ranked[:10]
    for term, likelihood in top_words:
        print(f"  {term}: {likelihood:.5f}")
    print()


Sample Word Likelihoods for 'negative':
  may: 0.00991
  tomorrow: 0.00681
  like: 0.00449
  going: 0.00356
  one: 0.00310
  get: 0.00263
  night: 0.00263
  day: 0.00263
  sunday: 0.00256
  go: 0.00256

Sample Word Likelihoods for 'neutral':
  may: 0.00696
  tomorrow: 0.00680
  going: 0.00380
  friday: 0.00356
  go: 0.00293
  sunday: 0.00277
  like: 0.00269
  get: 0.00269
  saturday: 0.00269
  night: 0.00269

Sample Word Likelihoods for 'positive':
  tomorrow: 0.01068
  may: 0.00717
  day: 0.00460
  see: 0.00437
  night: 0.00374
  sunday: 0.00335
  good: 0.00312
  friday: 0.00304
  like: 0.00288
  get: 0.00288

