# Section 0: Mount Google Drive

### Note: the notebook will not run unless you have our files in the Google Drive. If you need to run the file, email yilinwan@andrew.cmu.edu

In [1]:
from google.colab import drive
#drive.flush_and_unmount()
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import os
os.chdir("drive/MyDrive/nlp-group")

# Section 2: Error Analysis

In [3]:
import numpy as np 
import pandas as pd
import json as js
import copy as cp

## Load predictions

In [122]:
true = pd.read_csv("goemotions/data/test.tsv", sep='\t', header=None)
true.columns = ["text", "label", "label_string"]
pred_base = pd.read_csv("experiment/Baseline/Baseline/11-23_05-46_test_prediction.tsv",
                        sep = "\t")
pred_new = pd.read_csv("out/seed711/12-05_05-51_test_prediction.tsv",
                        sep = "\t")
emotions = open("goemotions/data/emotions.txt").read().split("\n")

## Helper functions

In [34]:
## compares whether 2 string refers to the same thing
def compare_emotions(emo1, emo2):
  emo1_split = set(emo1.split(","))
  emo2_split = set(emo2.split(","))
  if emo1_split == emo2_split: return True
  return False

def compare_emotions_mult(emo1, emo2):
  emo1_split = emo1.split(",")
  emo2_split = emo2.split(",")
  if len(emo1_split) == 1:
    if emo1_split[0] == emo2_split[0]: return True
    return False
  for emo in emo1_split:
    if emo not in emo2_split: return False
  return True

## find max N element in an set 
def maxN(S, N):
  max_S = max(S)
  if N == 1: 
      return set([max_S])
  S.remove(max_S)
  result = set([max_S])
  result = set.union(result, maxN(S, N-1))
  return result

## takes in a mistake dict and outputs the top N most frequent mistakes
def top_mistakes(D, N):
  counts_dict = dict()
  counts = set()
  for key in D:
    counts_dict[key] = len(D[key])
    counts.add(len(D[key]))
  result_counts_dict = dict()
  result_mistakes_dict = dict()
  topNCounts = maxN(counts, N)
  for key in D:
    if counts_dict[key] in topNCounts:
      result_mistakes_dict[key] = D[key]
      result_counts_dict[key] = counts_dict[key]
  return result_counts_dict, result_mistakes_dict

## given an dataframe, retain the top k values of a row and output a new df
def pred_topK(df, k):
  result = []
  for i in range(len(df)):
    probs = df.iloc[i, :].values
    top_ids = np.argpartition(probs, -k)[-k:]
    probs_top = probs[top_ids]
    top_ids_ordered = top_ids[np.argsort(-probs_top)]
    top_ids_ordered_str = top_ids_ordered.astype(str)
    result.append(','.join(top_ids_ordered_str))
  return result

a = pred_topK(pred_base, 3)


## error analysis

In [123]:
def improved_text(true, pred_base, pred_new, emotions):
  ## process the predicted values to retain top 3 emotions:
  pred_base_emo = pred_topK(pred_base, 3)
  pred_new_emo = pred_topK(pred_new, 3)

  ## retain the error columns 
  pred_base_error_ids = []
  pred_new_error_ids = []
  for i in range(len(true)):
    if not compare_emotions_mult(true.iloc[i, 1], pred_base_emo[i]):
      pred_base_error_ids.append(i)
    if not compare_emotions_mult(true.iloc[i, 1], pred_new_emo[i]):
      pred_new_error_ids.append(i)
  #pred_base_error = true.iloc[pred_base_error_ids, :]
  #pred_new_error = true.iloc[pred_new_error_ids, :]
  unique_base = np.setdiff1d(pred_base_error_ids, pred_new_error_ids)
  return unique_base, true.iloc[unique_base, :]
      
improved_id, improved = improved_text(true, pred_base, pred_new, emotions)

In [124]:
def stat_improved(df, ids, pred_raw):
  result = dict()
  pred = []
  for i in ids:
    pred.append(pred_raw[i])
  for i in range(len(df)):
    mistake_txt = df["text"].values[i]
    true_tmp = frozenset(df["label"].values[i].split(","))
    pred_keys = pred[i].split(",")[0]
    pred_tmp = frozenset(pred_keys[0])
    key_tmp = (true_tmp, pred_tmp)
    #print(pred_tmp)
    if key_tmp not in result:
      result[key_tmp] = []
    result[key_tmp].append(mistake_txt)
  return result
improved_stat = stat_improved(improved, improved_id.astype(int), pred_topK(pred_base, 3))


## Table 2 in report

Most common mistakes by the baseline model that are correctly classified by our model

In [125]:
def print_topk_count(improved_stat, k, emotions):
  count_dict, text_dict = top_mistakes(improved_stat, k)
  for key in count_dict:
    true_label = key[0]
    pred_label = key[1]
    print("True Label: ", end=" ")
    for k in true_label:
      print(emotions[int(k)], end=" ")
    print("  ||  ", end=" ")
    print("Pred label: ", end=" ")
    for k in pred_label:
      print(emotions[int(k)], end=" ")
    print("  ||  ", end=" ")
    print("Count: ", end=" ")
    print(count_dict[key])

print_topk_count(improved_stat, 7, emotions)

True Label:  neutral   ||   Pred label:  approval   ||   Count:  32
True Label:  neutral   ||   Pred label:  amusement   ||   Count:  50
True Label:  neutral   ||   Pred label:  anger   ||   Count:  36
True Label:  neutral   ||   Pred label:  annoyance   ||   Count:  23
True Label:  neutral   ||   Pred label:  admiration   ||   Count:  11
True Label:  neutral   ||   Pred label:  curiosity   ||   Count:  16
True Label:  curiosity   ||   Pred label:  anger   ||   Count:  13
True Label:  approval   ||   Pred label:  anger   ||   Count:  11


## Table 3 of report

Samples that are misclassified by the baseline model and correctly classified by our model

In [126]:
def print_topk_text(improved_stat, k, emotions):
  count_dict, text_dict = top_mistakes(improved_stat, k)
  for key in text_dict:
    true_label = key[0]
    pred_label = key[1]
    print("True Label: ", end=" ")
    for k in true_label:
      print(emotions[int(k)], end=" ")
    print("  ||  ", end=" ")
    print("Pred label: ", end=" ")
    for k in pred_label:
      print(emotions[int(k)], end=" ")
    print("  ||  ", end=" ")
    print("Count: ", end=" ")
    print(count_dict[key])
    for txt in text_dict[key]:
      print(txt)
    print("----------------------------------------------------------------------")

print_topk_text(improved_stat, 7, emotions)

True Label:  neutral   ||   Pred label:  approval   ||   Count:  32
Hey that's a thought! Maybe we need [NAME] to be the celebrity vaccine endorsement!
Well someone posted the ingredients in the comments below. I’m still new to the whole vegan thing.
As a Jeep driver, I constantly have to remind other Jeep drivers that 4WD/AWD only makes a difference when driving. Everyone has 4-wheel stop.
Ok google it then. 🤦🏼‍♀️
The complete series is on Hulu just finished watching it. Definitely recommend
It really do be like that
> Sure it might make the population 4:1 [RELIGION] but it's still an awefully large mibority The population already is 4:1 [RELIGION].
Joining the server when everyone's at 1/4hp and out of ammo, with no damage to your...uh...living room. Yeah, ggwp.
We’re a second half team. We got this. COYG!!
[NAME] follows me on Instagram so it's ok.
Definitely not just on this sub. I keep seeing them on Instagram too.
Now this exactly what I came to this sub for
Was going to say, tha

## Table 1 in report

Result of the LAA model with CB loss, averaged over 10 experiments with different seeds. 

In [120]:
import json as js
with open("out/seed1/output.json") as o1:
  out1 = js.load(o1)
with open("out/seed10/output.json") as o2:
  out2 = js.load(o2)
with open("out/seed20/output.json") as o3:
  out3 = js.load(o3)
with open("out/seed42/output.json") as o4:
  out4 = js.load(o4)
with open("out/seed72/output.json") as o5:
  out5 = js.load(o5)
with open("out/seed711/output.json") as o6:
  out6 = js.load(o6)
with open("out/seed142/output.json") as o7:
  out7 = js.load(o7)
with open("out/seed355/output.json") as o8:
  out8 = js.load(o8)
with open("out/seed151/output.json") as o9:
  out9 = js.load(o9)
with open("out/seed400/output.json") as o10:
  out10 = js.load(o10)
def avg_output(out1, out2, out3, out4, out5, out6,out7, out8, out9, out10, emotions):
  result = dict()
  for key in out1:
    result[key] = (out1[key] + out2[key] + out3[key] + out4[key] + out5[key] + 
                  out6[key] + out7[key] + out8[key] + out9[key] + out10[key]) / 10
    if key == "macro_f1": 
      # print(.1 * (out1[key]**2 + out2[key]**2 + out3[key]**2 + out4[key]**2 + out5[key]**2 + 
      #             out6[key]**2 + out7[key]**2 + out8[key]**2 + out9[key]**2 + out10[key]**2)- result[key]**2)
      a = [out1[key] , out2[key] , out3[key] , out4[key] , out5[key] ,out6[key] , out7[key] , out8[key] , out9[key] , out10[key]]
      print(np.std(a))
  print("macro_f1: ", end=" ")
  print(result["macro_f1"])
  for emo in emotions:
    key = emo + "_f1"
    print(emo, end =": ")
    print("{:.2f}".format(result[key]))

avg_output(out1, out2, out3, out4, out5, out6, out7, out8, out9, out10, emotions)

0.005141221898328572
macro_f1:  0.5197420240793031
admiration: 0.67
amusement: 0.81
anger: 0.49
annoyance: 0.37
approval: 0.41
caring: 0.43
confusion: 0.42
curiosity: 0.56
desire: 0.49
disappointment: 0.33
disapproval: 0.41
disgust: 0.48
embarrassment: 0.45
excitement: 0.41
fear: 0.67
gratitude: 0.90
grief: 0.42
joy: 0.60
love: 0.79
nervousness: 0.34
optimism: 0.56
pride: 0.49
realization: 0.26
relief: 0.36
remorse: 0.68
sadness: 0.53
surprise: 0.54
neutral: 0.67
