Sentiment Example from SLP Section 5.1.1

# Importing Libraries


In [223]:
import numpy as np

# Loading the Lexicon and Review Text Files

In [224]:
# Read the negative-word lexicon; each line of the file becomes one list entry.
with open('./negative-words.txt') as negative_words_file:
    negative_words_text = negative_words_file.read()
negative_words = negative_words_text.splitlines()

In [225]:
# Read the positive-word lexicon; each line of the file becomes one list entry.
with open('./positive-words.txt') as positive_words_file:
    positive_words_text = positive_words_file.read()
positive_words = positive_words_text.splitlines()

In [226]:
# Read the positive training reviews, one raw line per list entry.
with open('./hotelPosT-train.txt') as positive_reviews_file:
  positive_reviews_text = positive_reviews_file.read()
positive_reviews = positive_reviews_text.splitlines()

In [227]:
# Read the negative training reviews, one raw line per list entry.
with open('./hotelNegT-train.txt') as negative_reviews_file:
  negative_reviews_text = negative_reviews_file.read()
negative_reviews = negative_reviews_text.splitlines()

## List of Pronouns

In [228]:
# Pronouns counted by the pronoun feature (all lower case, because review
# tokens are lower-cased before the features are computed).
pronouns = [
  "i", "me", "mine", "my",
  "you", "your", "yours",
  "we", "us", "ours",
]

## Storing in Dictionary

In [229]:
# Index each word list as {word: position} so the feature functions can test
# membership with a dictionary lookup. If a word occurs more than once in a
# list, the position of its last occurrence is kept.
positive_words_dict = {word: idx for idx, word in enumerate(positive_words)}
negative_words_dict = {word: idx for idx, word in enumerate(negative_words)}
pronouns_dict = {word: idx for idx, word in enumerate(pronouns)}


## Word counter

In [230]:
def compute_counter_feature(document,source_dict):
  """Count the tokens of ``document`` that are present in ``source_dict``.

  Every run of trailing '!', '.' and ',' characters is removed from a token
  before the lookup, so "great!", "great!!" and "great..." all match the
  entry "great". Tokens that consist only of such punctuation are skipped;
  otherwise a bare "!" would be reduced to the empty string and counted as a
  hit whenever ``source_dict`` contains an empty key.

  Parameters:
    document: iterable of string tokens.
    source_dict: container supporting ``in`` (here: a dict keyed by word).

  Returns:
    int: number of tokens whose stripped form is in ``source_dict``.
  """
  counter = 0
  for token in document:
    stripped = token.rstrip('!.,')
    if stripped and stripped in source_dict:
      counter += 1
  return counter

## Is No present?

In [231]:
def is_no_present(document):
  """Return 1 if the word 'no' occurs in ``document``, otherwise 0.

  Trailing '!', '.' and ',' characters are stripped before the comparison so
  that "no," or "no." at a clause boundary is still recognised. Words that
  merely start with "no" (e.g. "not", "nothing") do not match.

  Parameters:
    document: iterable of string tokens.

  Returns:
    int: 1 when 'no' is present, else 0.
  """
  return int(any(word.rstrip('!.,') == 'no' for word in document))

## Compute the ln of number of words

In [232]:
def compute_ln_number_of_words(document):
  """Return the natural log of the number of tokens, rounded to 2 decimals.

  An empty document is treated as having length 1 and therefore yields 0.0;
  taking ``np.log(0)`` would instead produce ``-inf`` (plus a runtime
  warning), and a non-finite feature value propagates into every dot product
  that uses it.

  Parameters:
    document: sized collection of tokens.

  Returns:
    NumPy float: ``ln(max(len(document), 1))`` rounded to two decimals.
  """
  number_of_words = max(len(document), 1)
  return np.round(np.log(number_of_words),decimals = 2)

## Is ! present ?

In [233]:
def is_exclamation_present(document):
  """Return 1 if any token of ``document`` contains a '!', otherwise 0.

  The whole token is searched rather than only its last character, so an
  exclamation mark followed by other punctuation ("wow!)", 'great!"') is
  detected as well.

  Parameters:
    document: iterable of string tokens.

  Returns:
    int: 1 when a '!' is present, else 0.
  """
  return int(any('!' in word for word in document))

## Loading Positive and Negative reviews

In [234]:
def _index_reviews_by_id(review_lines):
  """Map each review's first token (its ID) to its remaining tokens, lower-cased.

  Lines that contain no tokens (blank or whitespace-only) are skipped: they
  have no ID token, so indexing the first token would raise IndexError.
  """
  reviews_by_id = dict()
  for line in review_lines:
    tokens = line.split()
    if not tokens:
      continue
    reviews_by_id[tokens[0]] = [token.lower() for token in tokens[1:]]
  return reviews_by_id


# {review ID: list of lower-cased tokens} for each polarity.
reviews_positive_dict = _index_reviews_by_id(positive_reviews)
reviews_negative_dict = _index_reviews_by_id(negative_reviews)

In [235]:
def compute_bias_column(dataset, row_len):
  """Return a copy of ``dataset`` with a column of ones inserted at index 7.

  Columns 0-6 keep their position, the new all-ones column becomes column 7,
  and any original columns from index 7 onwards shift one place to the right.

  Parameters:
    dataset: 2-D NumPy array.
    row_len: number of rows of ``dataset`` (length of the ones column).

  Returns:
    A new 2-D array with one more column than ``dataset``.
  """
  ones_column = np.full((row_len, 1), 1)
  left_part, right_part = dataset[:, :7], dataset[:, 7:]
  return np.concatenate((left_part, ones_column, right_part), axis=1)

In [236]:
def review_array_init(doc_len,columns):
  """Allocate an object-dtype array of shape ``(doc_len, columns)``.

  The object dtype lets a single row hold a string ID next to numeric values.
  """
  return np.empty(shape=(doc_len, columns), dtype=object)

In [237]:
def compute_six_features(document):
    """Compute the six features of one tokenised review.

    Returns a 6-tuple, in this order:
      1. number of tokens found in ``positive_words_dict``
      2. number of tokens found in ``negative_words_dict``
      3. 1 if 'no' is present, else 0
      4. number of tokens found in ``pronouns_dict``
      5. 1 if an exclamation mark is present, else 0
      6. natural log of the token count
    """
    positive_count = compute_counter_feature(document, positive_words_dict)
    negative_count = compute_counter_feature(document, negative_words_dict)
    has_no = is_no_present(document)
    pronoun_count = compute_counter_feature(document, pronouns_dict)
    has_exclamation = is_exclamation_present(document)
    log_length = compute_ln_number_of_words(document)
    return (
        positive_count,
        negative_count,
        has_no,
        pronoun_count,
        has_exclamation,
        log_length,
    )

In [238]:
def construct_review_dataset(D,reviews_array,polarity):
  """Fill ``reviews_array`` in place with one row per entry of ``D``.

  Row layout: column 0 holds the document ID, columns 1-6 hold the six
  features returned by ``compute_six_features``, and column 7 holds
  ``polarity`` when it is not None (for unlabeled data column 7 is not
  written at all).

  Parameters:
    D: dict mapping document ID -> list of tokens.
    reviews_array: pre-allocated 2-D array with at least ``len(D)`` rows.
    polarity: label to store for every row, or None for unlabeled data.

  Returns:
    The same ``reviews_array`` object, now populated.
  """
  for row_number, (doc_id, document) in enumerate(D.items()):
    features = compute_six_features(document)
    record = reviews_array[row_number]
    record[0] = doc_id
    # Assign element by element so each feature keeps its own Python/NumPy type.
    for column, value in enumerate(features, start=1):
      record[column] = value
    if polarity is not None:
      record[7] = polarity
  return reviews_array

In [239]:
def compute_review_data_set_positive_negative():
  """Build the labelled feature table for all training reviews.

  Positive reviews (label 1) come first, followed by negative reviews
  (label 0). Each row has 8 columns: ID, six features, label.
  """
  labelled_parts = []
  for reviews_by_id, polarity in ((reviews_positive_dict, 1), (reviews_negative_dict, 0)):
    blank_table = review_array_init(len(reviews_by_id), 8)
    labelled_parts.append(construct_review_dataset(reviews_by_id, blank_table, polarity))
  return np.concatenate(labelled_parts, axis=0)

## Class Probability Sigmoid function

In [240]:
def classprob(score):
  """Logistic sigmoid ``1 / (1 + exp(-score))`` in extended precision.

  The score is promoted to ``np.longdouble`` *before* the exponential is
  evaluated. Casting only the finished result cannot help: by then a large
  negative score has already overflowed ``exp`` in double precision and the
  probability has collapsed to exactly 0, which makes a later ``np.log``
  return ``-inf``. ``np.longdouble`` is used instead of ``np.float128``
  because the latter name does not exist on every platform; where
  ``np.float128`` exists, the two are the same type.

  Parameters:
    score: real-valued scalar (or array) of raw scores.

  Returns:
    ``np.longdouble`` probability (or array of them) in [0, 1].
  """
  score = np.longdouble(score)
  return 1/(1+np.exp(-score))

## Computing the SGD

In [241]:
def compute_sgd(data,learning_rate,bias,epochs,replacement=False):
  """Fit logistic-regression weights with single-sample gradient steps.

  Each step draws one random row of ``data``, computes the sigmoid of the dot
  product between the weights and columns 1-7 of that row, and moves the
  weights against the gradient ``(prediction - label) * features``.

  Parameters:
    data: 2-D array; columns 1-7 are the inputs (the last of them is the
      constant bias input) and column 8 is the 0/1 label.
    learning_rate: step size applied to every gradient.
    bias: initial value of the weight that multiplies the bias input.
    epochs: number of single-sample updates to perform (not passes over
      the data).
    replacement: forwarded to ``np.random.choice``; since exactly one index
      is drawn per call it does not change which index can be drawn.

  Returns:
    Array of 7 weights (six feature weights followed by the bias weight).
  """
  # The weights must start out as floats: in an integer array the assignment
  # below would silently truncate a fractional bias such as 0.1 to 0.
  weights = np.zeros(7, dtype=float)
  weights[6] = bias
  for _ in range(epochs):
    rand_idx = np.random.choice(len(data), size=1, replace=replacement)[0]
    random_sample = data[rand_idx]
    inputs = random_sample[1:8]
    score = classprob(np.dot(weights, inputs))
    correct = data[rand_idx,8]
    gradient = (score - correct) * inputs
    weights = weights - (learning_rate * gradient)
  return weights

In [242]:
def compute_y_predicted(test_data,weights):
  """Score every row of ``test_data`` and threshold the result at 0.5.

  Parameters:
    test_data: iterable of rows whose elements 1-7 are the model inputs.
    weights: weight vector used in the dot product with those inputs.

  Returns:
    (y_predicted, y_proba_scores): a list of 0/1 labels (1 only when the
    probability is strictly greater than 0.5) and the list of probabilities.
  """
  y_proba_scores = [classprob(np.dot(weights, row[1:8])) for row in test_data]
  y_predicted = [1 if probability > 0.5 else 0 for probability in y_proba_scores]
  return y_predicted, y_proba_scores

## Computing accuracy and log loss

In [243]:
def compute_accuracy(test_data,weights):
  """Evaluate ``weights`` on labelled data.

  Parameters:
    test_data: 2-D array; columns 1-7 are the model inputs and column 8 is
      the true 0/1 label.
    weights: weight vector passed on to ``compute_y_predicted``.

  Returns:
    (accuracy, log_loss): accuracy as a percentage rounded to two decimals,
    and the mean negative log-likelihood of the true labels.
  """
  y_predicted,y_proba = compute_y_predicted(test_data,weights)
  N = len(test_data)

  correct_count = sum(1 for idx in range(N) if test_data[idx][8] == y_predicted[idx])
  accuracy = round(correct_count/N*100,2)

  # Keep probabilities strictly inside (0, 1). A probability of exactly 0 or 1
  # makes one of the two terms below evaluate to 0 * log(0) = 0 * -inf = nan,
  # which would turn the whole loss into nan. The bounds are the tightest ones
  # representable, so every other probability is left unchanged.
  limits = np.finfo(np.longdouble)
  lowest, highest = limits.tiny, 1 - limits.epsneg

  log_loss = 0
  for idx in range(N):
    label = test_data[idx][8]
    probability = np.clip(np.longdouble(y_proba[idx]), lowest, highest)
    first_term = label * np.log(probability)
    second_term = (1-label) * np.log(1-probability)
    log_loss = log_loss + (first_term + second_term)

  log_loss = (-1/N)*log_loss
  return accuracy,log_loss

# Train Test data split

In [244]:
def train_test_data_split(reviews_array):
  """Randomly split the rows of ``reviews_array`` 80% / 20%.

  The rows are reordered through a random permutation of the row indices,
  which produces a shuffled copy; the array passed in by the caller is left
  in its original order (an in-place shuffle would silently reorder the
  caller's data as a side effect).

  Parameters:
    reviews_array: NumPy array whose first axis indexes the samples.

  Returns:
    (train_data, test_data): the first ``int(0.8 * n)`` shuffled rows and
    the remaining rows.
  """
  shuffled = reviews_array[np.random.permutation(len(reviews_array))]
  eighty_percent_index = int(len(shuffled)*0.8)
  train_data = shuffled[:eighty_percent_index]
  test_data = shuffled[eighty_percent_index:]
  return train_data,test_data

In [245]:
# Build the labelled feature table and export it as CSV.
# Column formats: ID as text, five integer features, the log-length feature
# with two decimals, then the integer label.
formatter_types = ['%s'] + ['%d'] * 5 + ['%.2f', '%d']
dataset = compute_review_data_set_positive_negative()
np.savetxt(
  "./VenkataramanRavisankar-Ragul-assgn2-part1.csv",
  dataset,
  fmt=formatter_types,
  delimiter=",",
)

In [246]:
# Add the bias input under a NEW name. Re-assigning `dataset` to its own
# bias-augmented version would make this cell non-idempotent: every re-run
# would insert one more bias column and shift the label out of column 8.
dataset_with_bias = compute_bias_column(dataset,len(dataset))
# NOTE: no random seed is set in this cell, so the split and the learned
# weights (and therefore the numbers printed below) vary between runs.
train_data, test_data = train_test_data_split(dataset_with_bias)
# Arguments: learning rate 0.01, initial bias weight 0.1, 15000 single-sample updates.
weights = compute_sgd(train_data,0.01,0.1,15000,False)

print('Performance of Train Data\n')
train_accuracy,log_loss = compute_accuracy(train_data,weights)

print('The train accuracy is ',train_accuracy,'%')
print(log_loss)


print('\nPerformance of test Data\n')
test_accuracy,log_loss = compute_accuracy(test_data,weights)

print('The test accuracy is ',test_accuracy,'%')
print(log_loss)

Performance of Train Data

The train accuracy is  94.04 %
0.21243003937910974509

Performance of test Data

The test accuracy is  86.84 %
0.7979531551957257609


## Testing the given data file 

In [247]:
# Read the unlabeled test reviews, one raw line per list entry.
with open('./HW2-testset.txt') as reviews_file:
  test_reviews_text = reviews_file.read()
test_reviews = test_reviews_text.splitlines()

In [248]:
# {review ID: list of lower-cased tokens} for the unlabeled test set.
test_reviews_dict = dict()

for review in test_reviews:
  list_of_words = review.split()
  if not list_of_words:
    # Blank line: there is no ID token, and list_of_words[0] would raise IndexError.
    continue
  lower_case = [word.lower() for word in list_of_words[1:]]
  test_reviews_dict[list_of_words[0]] = lower_case


# 7 columns (ID + six features, no label); the bias column is then added as
# column 7 so that columns 1-7 line up with the trained weight vector.
test_review_array = review_array_init(len(test_reviews_dict),7)
test_review_array = construct_review_dataset(test_reviews_dict,test_review_array,None)
test_review_array = compute_bias_column(test_review_array, len(test_reviews_dict))

In [249]:
# Classify the unlabeled test reviews with the trained weights.
test_predictions = compute_y_predicted(test_review_array, weights)
y_predicted_test, y_proba_test = test_predictions

In [250]:
# Pair every review ID with its predicted label and write one
# tab-separated "ID<TAB>label" line per review.
predicted_labels = ["POS" if y_label == 1 else "NEG" for y_label in y_predicted_test]
review_ids = [review_row[0] for review_row in test_review_array]
result = [[review_id, label] for review_id, label in zip(review_ids, predicted_labels)]
formatter_types = ['%s','%s']
np.savetxt(
  "./VenkataramanRavisankar-Ragul-assgn2-out.txt",
  result,
  fmt=formatter_types,
  delimiter="\t",
)