In [None]:
'''
Implementation of the Bag Of Words trained classifier from the CMU Advanced NLP class (Fall 2024)
Actual code: https://github.com/neubig/anlp-code/blob/main/01-simpleclassifier/trained_bow_classifier.ipynb
'''

'\nimplementation of the Bag Of Words trained classifer from CMU Advanced NLP class from Fall 2024\nActual code: https://github.com/neubig/anlp-code/blob/main/01-simpleclassifier/trained_bow_classifier.ipynb\n'

In [None]:
'''
3-step process to build an NLP system:
  1. feature extraction function f
  2. score calculation by multiplying each feature value by its weight
  3. prediction function that turns the calculated score into a prediction (differs between binary and multi-class prediction)
'''

In [None]:
'''
The given dataset (Stanford three-class sentiment data) is arranged one example per line as "{groundtruth_score} ||| {sentence}"

1 -> Positive
0 -> Neutral
-1 -> Negative

example: "1 ||| I love this movie"
'''

In [None]:
'''
design:
  1. read the data
  2. extract features
  2*. score-calculating function and prediction function
  3. train by generating predictions over a number of epochs
  4. run the dev set to compute accuracy
  5. error analysis by printing randomly chosen failed cases

'''

In [None]:
#some used libraries
import random
import tqdm

In [None]:
#read the training data file, extract the strings to be trained along with corresponding score
def read_xy_data(filename:str) -> tuple[list[str], list[int]]:  #tuple because we are returning 2 things
  """Read a labelled text file into parallel lists of sentences and labels.

  Every non-blank line is expected to look like "{label} ||| {sentence}",
  for example "1 ||| I love this movie".

  Args:
    filename: path of the file to read (decoded as UTF-8).

  Returns:
    (x_data, y_data): the sentences and their integer labels, in file order.

  Raises:
    ValueError: if a non-blank line has no " ||| " separator, or if its
      label cannot be parsed as an integer.
  """
  x_data=[]   #sentences to train/evaluate on
  y_data=[]   #ground-truth labels
  with open(filename, 'r', encoding='utf-8') as f:
    for line in f: #go through every line
      line= line.strip() #take away surrounding white space (including the newline)
      if not line:  #tolerate blank lines, e.g. an empty line at the end of the file
        continue
      #split on the first separator only, so a sentence that itself contains " ||| " stays intact
      label,text= line.split(" ||| ", 1)
      x_data.append(text)
      y_data.append(int(label))
  return x_data, y_data

In [None]:
#load both splits with the same reader: train.txt is used for training, dev.txt for evaluation
(x_data, y_data), (x_test, y_test)= (
  read_xy_data(path) for path in ("/content/train.txt", "/content/dev.txt")
)


In [None]:
#function to extract the feature of each word
def extract_feature(str)-> dict[str,float]:   #take in a string and output a feature vector(implemented as dictionary) with the feature word and its corresponding weight
  """Turn a sentence into a bag-of-words feature vector.

  Args:
    str: the sentence to featurize. (The name shadows the builtin `str`
      inside this function; it is kept so existing callers keep working.)

  Returns:
    A dictionary mapping each whitespace-separated token to the number of
    times it occurs in the sentence. An empty or all-whitespace sentence
    gives an empty dictionary.
  """
  features= {}
  #split() without an argument splits on any run of whitespace, so repeated,
  #leading or trailing spaces never produce an empty-string "word" feature
  for word in str.split():
    features[word]= features.get(word,0)+1   #count one more occurrence of this word
  return features

In [None]:
#weight vector: maps a feature (word) to its weight; a word that has no entry is treated as weight 0 by the classifier
feature_weight= dict()

In [None]:
#classifier to calculate scores and make predictions
def run_classifier(features: dict[str, float], weights=None) -> int:  #input feature vector, calculate score and output prediction (1,0,or -1)
  """Score a feature vector against a weight vector and predict its class.

  Args:
    features: feature name -> feature value for one example.
    weights: feature name -> weight. When omitted (None), the notebook-level
      `feature_weight` dictionary is used, which is what the existing
      one-argument calls rely on.

  Returns:
    1 if the score is positive, -1 if it is negative, and 0 if it is exactly
    zero. The score is the sum over features of value * weight, where a
    feature without a weight contributes 0.
  """
  if weights is None:
    weights= feature_weight  #fall back to the shared weight vector
  #.items() walks the (feature, value) pairs; unseen features are scored with weight 0
  score= sum(feature_value* weights.get(feature_str,0) for feature_str, feature_value in features.items())
  if score >0:
    return 1
  elif score<0:
    return -1
  else:
    return 0

In [None]:
#training with mistake-driven weight adjustments: weights only change when a prediction is wrong
EPOCHS= 5
SEED= 0  #fixed seed so that Restart & Run All reproduces the same shuffles and therefore the same weights
shuffle_rng= random.Random(SEED)  #private generator: the global `random` state is left untouched
feature_weight.clear()  #start from all-zero weights, so re-running this cell does not continue from a previous run
for epoch in range(EPOCHS):
  #visit the training data in a fresh random order every epoch
  data_ids= list(range(len(x_data)))   #one index per training example
  shuffle_rng.shuffle(data_ids)
  for data_id in tqdm.tqdm(data_ids, desc= f"Epoch: {epoch}"):
    x= x_data[data_id]  #training sentence
    y= y_data[data_id]  #ground truth

    #neutral samples are skipped: with y == 0 the update below would add nothing to the weights
    if y==0:
      continue

    features= extract_feature(x)
    predict_y= run_classifier(features) #make prediction
    if predict_y!= y: #wrong prediction: move every word's weight toward the true label
      for word in features:
        feature_weight[word]= feature_weight.get(word,0)+ y*features[word]


Epoch: 0: 100%|██████████| 8544/8544 [00:00<00:00, 47930.48it/s]
Epoch: 1: 100%|██████████| 8544/8544 [00:00<00:00, 46648.21it/s]
Epoch: 2: 100%|██████████| 8544/8544 [00:00<00:00, 52053.73it/s]
Epoch: 3: 100%|██████████| 8544/8544 [00:00<00:00, 51218.91it/s]
Epoch: 4: 100%|██████████| 8544/8544 [00:00<00:00, 53075.24it/s]


In [None]:
#evaluate on the held-out set and remember which examples were misclassified (used by the error analysis)
y_predict= [run_classifier(extract_feature(sentence)) for sentence, _ in zip(x_test, y_test)]  #one prediction per example
error_id= [idx for idx, (guess, truth) in enumerate(zip(y_predict, y_test)) if guess!= truth]   #indices of the errors
total_pred= len(y_predict)
correct_pred= total_pred- len(error_id)   #everything that is not an error is a correct prediction
print(f"Test Accuracy: {correct_pred/ float(total_pred)}")

Test Accuracy: 0.6039963669391463


In [None]:
#print up to 5 distinct, randomly chosen failed cases
#random.sample never repeats an index, and capping the sample size at len(error_id)
#means that fewer than 5 errors (or none at all) no longer raises an exception
for err_idx in random.sample(error_id, min(5, len(error_id))):
  print(f"{x_test[err_idx]}\nprediction: {y_predict[err_idx]}\ntrue_label: {y_test[err_idx]}")

His last movie was poetically romantic and full of indelible images , but his latest has nothing going for it .
prediction: 1
true_label: -1
It takes talent to make a lifeless movie about the most heinous man who ever lived .
prediction: 1
true_label: -1
Not for the prurient or squeamish , it 's a daring if overlong examination of an idolized culture , self-loathing and sexual politics .
prediction: 1
true_label: 0
Returning aggressively to his formula of dimwitted comedy and even dimmer characters , Sandler , who also executive produces , has made a film that makes previous vehicles look smart and sassy .
prediction: 1
true_label: 0
If Steven Soderbergh 's ` Solaris ' is a failure it is a glorious failure .
prediction: -1
true_label: 1
