In [1]:
import numpy as np 
import pandas as pd
import os 
from nltk.tokenize import word_tokenize 
from nltk import bigrams
from sklearn.model_selection import train_test_split

In [2]:
file_path_sst = r'D:\IIT(지난학기)\03_2023_Fall\Natural Language Processing\Homework\HW03\Data\SST-2'
full_path = os.path.join(file_path_sst, "train.tsv")

df_sst = pd.read_csv(full_path, delimiter='\t')

df_train, df_temp = train_test_split(df_sst, test_size=0.4, random_state=42)
df_validation, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)


positive_prob = df_train['label'].mean()
negative_prob = 1 - positive_prob

print(f"Prior probability of positive sentiment (label=1): {positive_prob:.4f}")
print(f"Prior probability of negative sentiment (label=0): {negative_prob:.4f}")

Prior probability of positive sentiment (label=1): 0.5550
Prior probability of negative sentiment (label=0): 0.4450


In [3]:
def tokenize_sentence(sentence):
    tokens = word_tokenize(sentence)
    return ['<s>'] + tokens + ['</s>']

df_train = df_train.copy()
df_train['tokenized'] = df_train['sentence'].apply(tokenize_sentence)
print("Tokenization of the first sentence in the training set:", df_train['tokenized'].iloc[0])

vocabulary = set()
for tokens in df_train['tokenized']:
    vocabulary.update(tokens) 
vocabulary_size = len(vocabulary)
print("Vocabulary size of the training set:", vocabulary_size) 


Tokenization of the first sentence in the training set: ['<s>', 'indulges', 'in', 'the', 'worst', 'elements', 'of', 'all', 'of', 'them', '.', '</s>']
Vocabulary size of the training set: 14383


In [4]:
def count_bigrams(tokenized_sequences):
    bigram_counts = {}
    for sequence in tokenized_sequences:
        for w1, w2 in bigrams(sequence):
            if w1 not in bigram_counts:
                bigram_counts[w1] = {}
            bigram_counts[w1][w2] = bigram_counts[w1].get(w2, 0) + 1
    return bigram_counts

bigram_counts = count_bigrams(df_train['tokenized'])

start_with_the_count = bigram_counts['<s>'].get('the', 0)
print(f"The bigram ('<s>', 'the') appears {start_with_the_count} times in the training set.")

The bigram ('<s>', 'the') appears 2599 times in the training set.


In [5]:
def smoothed_probability(w2, w1, bigram_counts, alpha, vocab_size):
    if w1 not in bigram_counts:
        return alpha / vocab_size
    bigram_count = bigram_counts[w1].get(w2, 0)
    unigram_count = sum(bigram_counts[w1].values())
    prob = (bigram_count + alpha) / (unigram_count + vocab_size * alpha)
    return prob

alpha_values = [0.001, 0.5]
for alpha in alpha_values:
    prob = smoothed_probability("award", "academy", bigram_counts, alpha, vocabulary_size)
    log_prob = np.log(prob)
    print(f"Log probability of 'award' following 'academy' with alpha={alpha}: {log_prob:.4f}")

Log probability of 'award' following 'academy' with alpha=0.001: -1.0526
Log probability of 'award' following 'academy' with alpha=0.5: -6.3577


In [6]:
def sentence_log_probability(sentence, bigram_counts, alpha, vocab_size):
    tokens = ['<s>'] + word_tokenize(sentence) + ['</s>']
    log_prob = 0
    for i in range(len(tokens) - 1):
        w1, w2 = tokens[i], tokens[i+1]
        prob = smoothed_probability(w2, w1, bigram_counts, alpha, vocab_size)
        log_prob += np.log(prob)
    return log_prob

sentences = [
    "this was a really great movie but it was a little too long.",
    "long too little a was it but movie great really a was this."
]

alpha = 0.5
for sentence in sentences:
    log_prob = sentence_log_probability(sentence, bigram_counts, alpha, vocabulary_size)
    print(f"Log probability of the sentence '{sentence}': {log_prob:.4f}")

Log probability of the sentence 'this was a really great movie but it was a little too long.': -88.7084
Log probability of the sentence 'long too little a was it but movie great really a was this.': -124.9414


In [7]:
alpha_values = [0.001, 0.01, 0.1, 0.05, 0.005]
log_likelihoods = {}

for alpha in alpha_values:
    total_log_prob = sum(sentence_log_probability(sentence, bigram_counts, alpha, vocabulary_size) for sentence in df_validation['sentence'])
    log_likelihoods[alpha] = total_log_prob

for alpha, log_likelihood in log_likelihoods.items():
    print(f"Log likelihood estimate of the validation set for alpha={alpha}: {log_likelihood:.4f}")

selected_alpha = max(log_likelihoods, key=log_likelihoods.get)
print(f"\nThe selected alpha value is: {selected_alpha}")

Log likelihood estimate of the validation set for alpha=0.001: -605525.6633
Log likelihood estimate of the validation set for alpha=0.01: -675013.5222
Log likelihood estimate of the validation set for alpha=0.1: -809397.7779
Log likelihood estimate of the validation set for alpha=0.05: -762345.2684
Log likelihood estimate of the validation set for alpha=0.005: -647333.4097

The selected alpha value is: 0.001


In [8]:
df_positive = df_train[df_train['label'] == 1]
df_negative = df_train[df_train['label'] == 0]

bigram_counts_positive = count_bigrams(df_positive['tokenized'])
bigram_counts_negative = count_bigrams(df_negative['tokenized'])

vocab_size_positive = len(set(token for sequences in df_positive['tokenized'] for token in sequences))
vocab_size_negative = len(set(token for sequences in df_negative['tokenized'] for token in sequences))

predicted_labels = []
for sentence in df_test['sentence']:
    positive_score = sentence_log_probability(sentence, bigram_counts_positive, selected_alpha, vocab_size_positive) + np.log(positive_prob)
    negative_score = sentence_log_probability(sentence, bigram_counts_negative, selected_alpha, vocab_size_negative) + np.log(negative_prob) 
    
    predicted_label = 1 if positive_score > negative_score else 0 
    predicted_labels.append(predicted_label) 

positive_predictions = sum(predicted_labels) 
negative_predictions = len(predicted_labels) - positive_predictions 

print(f"Number of positive sentiment predictions: {positive_predictions}") 
print(f"Number of negative sentiment predictions: {negative_predictions}") 

accuracy = sum(true == pred for true, pred in zip(df_test['label'], predicted_labels)) / len(df_test) 
print(f"\nAccuracy of the sentiment prediction on the test set: {accuracy:.4f}") 

Number of positive sentiment predictions: 7348
Number of negative sentiment predictions: 6122

Accuracy of the sentiment prediction on the test set: 0.8808
