# Installs and Imports

In [1]:
!pip install transformers
!pip install torch torchvision torchaudio
!pip install stanza
!pip install negate==1.1.3

Collecting stanza
  Downloading stanza-1.9.2-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.9.2-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.0 stanza-1.9.2
Collecting negate==1.1.3
  Downloading negate-1.1.3-py3-none-any.whl.metadata (22 kB)
Collecting lemminflect<0.3.0,>=0.2.3 (from negate==1.1.3)
  Downloading lemminflect-0.2.3-py3-none-any.whl.metadata (7.0 kB)
Downloading negate-1.1.3-py3-none-any.whl (23 kB)
Downloading lemminflect-0.2.3-py3-none-any.whl (769 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m769

In [2]:
import numpy as np
import pandas as pd
import stanza
import torch

from negate import Negator
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

# Load Data and Models

## Tweet Data

In [None]:
# Relative path, so the CSV is resolved against the notebook's working directory.
# The cells below read the "tweet" and "sarcastic" columns of this frame.
file_name = "exp1.csv"
df = pd.read_csv(file_name)

In [None]:
df

Unnamed: 0,ID,original_ID,tweet,sarcastic,rephrase
0,6440,sign_6941,i looove getting 3 hours of sleep because two ...,1,i hate getting 3 hours of sleep because two jobs
1,12874,sign_12479,i hate people who use big words just to make t...,1,i hate people who use big words just to make t...
2,4295,sign_12125,i love that girl who never liked even my dp,1,i love that girl who never liked even my dp
3,4147,train_2222,I hate that I wasted my whole weekend 😠,0,
4,11437,sign_5017,i love being ignored,1,i hate being ignored
...,...,...,...,...,...
126,5211,train_682,i love 6 hour panic attacks,1,I don't like having 6-hour panic attacks.
127,19299,train_2654,I LOVE LORDE GOODNIGHT,0,
128,7159,train_2825,i LOVE grocery shopping,0,
129,13743,train_2657,i love video games,0,


## Sentiment Classifier

In [3]:
# Hugging Face Hub id of the sentiment classifier used throughout the notebook.
# (Plain string: the original f-string had no placeholders.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Tokenizer, config and weights are all fetched (or read from the local cache)
# for the same checkpoint so they stay consistent with each other.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

def classify_sentiment(text) :
    """Score ``text`` with the module-level ``tokenizer`` / ``model`` pair.

    Args:
        text: the string to classify.

    Returns:
        A 1-D NumPy array holding the model's first output for the single
        input, unnormalised (no softmax is applied here). The original author
        documents the order as [ negative, neutral, positive ].
    """
    encoded_input = tokenizer(text, return_tensors='pt')
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        output = model(**encoded_input)
    # output[0] is the score tensor for the batch; [0] selects the only item.
    scores = output[0][0].detach().numpy()
    # [ negative, neutral, positive ]
    return scores

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
classify_sentiment("love crowded buses")

array([ 0.18265271, -0.2767488 , -0.16453004], dtype=float32)

In [18]:
classify_sentiment("love crowded buses") - classify_sentiment("love")

array([ 1.4201158, -0.4067169, -0.8671002], dtype=float32)

In [19]:
classify_sentiment("crowded buses")

array([ 0.20244329,  1.0098774 , -1.4467493 ], dtype=float32)

In [12]:
classify_sentiment("love")

array([-1.2374631 ,  0.1299681 ,  0.70257014], dtype=float32)

In [None]:
classify_sentiment("I love")

array([-1.7222316 , -0.05902784,  1.4061106 ], dtype=float32)

In [None]:
classify_sentiment("Love")

array([-1.5054053 , -0.03644516,  1.0730698 ], dtype=float32)

In [None]:
classify_sentiment("love")

array([-1.2374631 ,  0.1299681 ,  0.70257014], dtype=float32)

In [None]:
classify_sentiment("I hate")

array([ 0.8649555,  0.05784  , -1.175291 ], dtype=float32)

In [None]:
classify_sentiment("Hate")

array([ 0.21932901,  0.19171844, -0.72959   ], dtype=float32)

In [None]:
classify_sentiment("hate")

array([ 0.71133417,  0.15467213, -1.1714683 ], dtype=float32)

In [None]:
classify_sentiment("Love staying in the office until 10pm")

array([-2.2678285 , -0.38696536,  2.4084647 ], dtype=float32)

In [None]:
classify_sentiment("I love staying in the office until 10pm")

array([-2.326509  , -0.45663574,  2.728802  ], dtype=float32)

In [None]:
classify_sentiment("staying in the office until 10pm")

array([-1.9052361 ,  2.6276646 , -0.68733424], dtype=float32)

In [None]:
classify_sentiment("I love")

array([-1.7222316 , -0.05902784,  1.4061106 ], dtype=float32)

In [None]:
s1 = classify_sentiment("Hate people who moan about EVERY SINGLE THING possible 😴👊🏼")
s1

array([ 2.2053196, -0.4821814, -2.0169976], dtype=float32)

In [None]:
s2 = classify_sentiment("people who moan about EVERY SINGLE THING possible 😴👊🏼")
s2

array([ 2.034265  , -0.19827195, -2.0858595 ], dtype=float32)

In [None]:
s3 = classify_sentiment("Hate")
s3

array([ 0.21932901,  0.19171844, -0.72959   ], dtype=float32)

In [None]:
s1 - s2

array([ 0.1710546 , -0.28390944,  0.06886196], dtype=float32)

In [None]:
s1 - s3

array([ 1.9859906, -0.6738998, -1.2874076], dtype=float32)

In [None]:
np.linalg.norm(s1 - s2)

0.3385354

In [None]:
np.linalg.norm(s1 - s3)

2.460837

In [None]:
s1 = classify_sentiment("Hate people who moan about EVERY SINGLE THING possible 😴👊🏼")
s1

array([ 2.2053196, -0.4821814, -2.0169976], dtype=float32)

In [None]:
s2 = classify_sentiment("people who moan about EVERY SINGLE THING possible 😴👊🏼")
s2

array([ 2.034265  , -0.19827195, -2.0858595 ], dtype=float32)

In [None]:
s3 = classify_sentiment("Hate")
s3

array([ 0.21932901,  0.19171844, -0.72959   ], dtype=float32)

In [None]:
s1 - s2

array([ 0.1710546 , -0.28390944,  0.06886196], dtype=float32)

In [None]:
s1 - s3

array([ 1.9859906, -0.6738998, -1.2874076], dtype=float32)

In [None]:
np.linalg.norm(s1 - s2)

0.3385354

In [None]:
np.linalg.norm(s1 - s3)

2.460837

## Constituency Parser

In [None]:
# Download the default English Stanza models, then build the default English
# pipeline. parse_tree() below reads the constituency tree and the token/word
# objects that this pipeline produces.
stanza.download('en')
nlp = stanza.Pipeline('en')



Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.9.0/models/default.zip:   0%|          | 0…

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage,

In [None]:
def width(t) :
  """Return how many leaf tokens appear in the bracketed string form of ``t``.

  ``str(t)`` is stripped of closing brackets and split on single spaces; every
  piece that does not contain an opening bracket is counted as one leaf.
  """
  leaf_count = 0
  for piece in str(t).replace(")", "").split(" ") :
    if '(' not in piece :
      leaf_count += 1
  return leaf_count

def add_spans(t, tks) :
  """Annotate every node under ``t`` with a character ``span``.

  ``tks`` is the sequence of token objects covered by ``t``, in order.
  A preterminal takes its span from ``tks[0]`` (``start_char``/``end_char``)
  and keeps that token as ``tk_info``. Any other node hands each child the
  slice of ``tks`` sized by ``width(child)`` and then spans from the start of
  its first child to the end of its last child. Nodes are modified in place.
  """
  if not t.is_preterminal() :
    offset = 0
    for child in t.children :
      n_leaves = width(child)
      add_spans(child, tks[offset : offset + n_leaves])
      offset += n_leaves
    first_child, last_child = t.children[0], t.children[-1]
    t.span = ( first_child.span[0] , last_child.span[-1] )
    return

  tk = tks[0]
  t.span = ( tk.start_char , tk.end_char )
  t.tk_info = tk


def parse_tree(t):
  """Parse ``t`` with the module-level ``nlp`` pipeline and return the
  constituency tree of its first sentence, annotated by ``add_spans``.

  Only ``doc.sentences[0]`` is used; any further sentences are ignored.
  """
  first_sentence = nlp(t).sentences[0]
  tree = first_sentence.constituency
  # Flatten tokens into their words so the list lines up with the tree leaves.
  words = [word for token in first_sentence.tokens for word in token.words]
  add_spans(tree, words)
  return tree

In [None]:
def membership_check(tree, label):
  """Return True if ``tree`` or any node below it carries ``label``.

  The search stops at preterminals: a preterminal is compared by its own
  label and its child is never visited.
  """
  if tree.is_preterminal():
    return tree.label == label

  # Match on this node, otherwise on any subtree.
  return tree.label == label or any(
    membership_check(child, label) for child in tree.children
  )

def extract_VP(row):
  """Find the shallowest, left-most node labelled 'VP', searching level by level.

  Args:
    row: either a single tree node or a list of nodes forming one level.

  Returns:
    ``{"VP": node, "specifier": spc}`` for the first VP found, where ``spc``
    is the node immediately before the VP in the same level list when that
    node's label contains 'ADV', else None. Returns None when there is no VP.
    (Previously a list input without any VP recursed until RecursionError.)
  """
  if not isinstance(row, list):
    # Cheap pre-check so a VP-less tree is rejected without walking levels.
    if not membership_check(row, 'VP'):
      return None
    row = [row]

  level = row
  while level:
    for idx, el in enumerate(level):
      if el.label == 'VP':
        spc = None
        if idx > 0 and 'ADV' in level[idx - 1].label:
          spc = level[idx - 1]
        return {"VP" : el, "specifier": spc}
    # No VP at this depth: descend to the children of every node, in order.
    level = [child for el in level for child in el.children]

  # Ran out of nodes without meeting a VP.
  return None



In [None]:
def parse_VP(tree):
  """Split the subtree ``tree`` into verbal preterminals and other constituents.

  Returns a pair ``(verbs, constituents)``:
  - a preterminal whose label contains 'VB', 'MD' or 'RB' is a verb;
    any other preterminal is a constituent;
  - a non-preterminal whose label does not start with 'V' is kept whole as a
    constituent;
  - otherwise the children are processed recursively and their results are
    concatenated in order.
  """
  label = tree.label

  if tree.is_preterminal():
    is_verbal = any(tag in label for tag in ('VB', 'MD', 'RB'))
    return ([tree], []) if is_verbal else ([], [tree])

  if not label.startswith('V'):
    return [], [tree]

  verbs, constituents = [], []
  for child in tree.children:
    child_verbs, child_constituents = parse_VP(child)
    verbs.extend(child_verbs)
    constituents.extend(child_constituents)
  return verbs, constituents

In [None]:
def get_text(full, span) :
  """Return ``full[span[0]:span[1]]``, or None when ``span`` is falsy."""
  if span :
    start, end = span[0], span[1]
    return full[start : end]
  return None

def parse_tweet(tw, display=False) :
  """Locate the verb group, its specifier and its complements in a tweet.

  Args:
    tw: the tweet text.
    display: when True, print the tweet, its tree and the extracted pieces.

  Returns:
    A dict with keys "text", "specifier", "constituents" and "verb".
    "specifier" is a (start, end) span or None, "constituents" a list of
    spans, and "verb" the span from the first to the last verbal node.
    When no VP or no verbal node is found, every value except "text" is None.
  """
  no_parse = {"text" : tw, "specifier" : None, "constituents" : None, "verb" : None}

  tree = parse_tree(tw)

  if display :
    print(tw)
    print(tree)

  res = extract_VP(tree)
  if not res :
    # No VP anywhere in the tree (formerly reached via a swallowed NameError).
    return no_parse

  spc = res['specifier']
  vbs, consts = parse_VP(res["VP"])
  if not vbs :
    # A VP without verbal preterminals: nothing to build a verb span from.
    return no_parse

  # A lemma may be missing on some tokens; treat that as "not 'be'".
  lemmas = [(vb.tk_info.lemma or "").lower() for vb in vbs]
  if 'be' in lemmas :
    if display :
      print("Special case : be")
    # With a copula, adjectival complements are folded into the verb group.
    n_consts = []
    for c in consts :
      if 'ADJ' in c.label :
        vbs.append(c)
      else :
        n_consts.append(c)
    consts = n_consts

  try :
    verb_span = ( vbs[0].span[0], vbs[-1].span[-1] )
    result = {"text" : tw, "specifier" : spc.span if spc else None, "constituents" : [c.span for c in consts], "verb" : verb_span }
  except (AttributeError, IndexError, TypeError) :
    # Best effort, as before: a node without a usable span yields the empty result.
    return no_parse

  if display :
    spc_span = get_text(tw, spc.span) if spc else None
    print(f"specifier : { spc_span}")
    print(f"verbs : {get_text(tw,  verb_span )}")
    print("constituents : ")
    for c in consts :
      print( get_text(tw, c.span) )

  return result


## Sentence Negator

In [None]:
negator = Negator()

# Brute-Force Algorithm for Threshold

In [None]:
def sentiment_distance(tweet, verb, noun_phrase):
  """Return the Euclidean distance between the verb's and the noun phrase's
  sentiment contributions in ``tweet``.

  A fragment's contribution is the score of the full tweet minus the score of
  the tweet with every occurrence of that fragment removed via ``str.replace``.
  """
  full_scores = classify_sentiment(tweet)

  def contribution(fragment):
    # TODO (from the original author): change how the fragment is removed (ask Samba)
    return full_scores - classify_sentiment(tweet.replace(fragment, ""))

  v_sentiment = contribution(verb)
  np_sentiment = contribution(noun_phrase)

  return np.linalg.norm(v_sentiment - np_sentiment)

def is_sarc(sentiment_dist, threshold):
  """Tell whether ``sentiment_dist`` lies strictly above ``threshold``."""
  exceeds_threshold = sentiment_dist > threshold
  return exceeds_threshold

In [None]:
def define_threshold(tweets, gold_annotations, threshold_list):
  """
  Score every tweet and evaluate each candidate threshold against the gold labels.

  For each tweet the verb is taken to be the word right after the first "i"
  token (or, failing that, the first token starting with "i'"), and the noun
  phrase is everything after the verb. The V/NP sentiment distance is computed
  with ``sentiment_distance`` and then divided by the largest distance.

  Args:
    tweets: iterable of tweet strings.
    gold_annotations: iterable of sarcasm labels (truthy = sarcastic), aligned with ``tweets``.
    threshold_list: iterable of thresholds to evaluate on the scaled distances.

  Returns:
    (threshold_accuracy_dict, tweet_annotation_sentiment_distance): a dict
    mapping each threshold to its accuracy, and the list of
    (tweet, annotation, scaled distance) tuples in input order.

  Raises:
    ValueError: if no tweets are given, or a tweet has no "i"/"i'..." token
      followed by at least one more word.
  """

  # NOTE: a parse_tweet()-based verb/complement split was prototyped here and
  # disabled; the positional heuristic below is what actually runs.
  tweet_annotation_sentiment_distance = []
  for tweet, annotation in zip(tweets, gold_annotations):
    tweet_split = tweet.split()
    lowered = [x.lower() for x in tweet_split]
    if "i" in lowered:
      i_idx = lowered.index("i")
    else:
      # Fall back to contracted forms such as "i'm" / "i'd".
      i_idx = next((i for i, x in enumerate(lowered) if x.startswith("i'")), None)
      if i_idx is None:
        raise ValueError(f"no first-person subject found in tweet: {tweet!r}")
    if i_idx + 1 >= len(tweet_split):
      raise ValueError(f"no word follows the first-person subject in tweet: {tweet!r}")
    verb = tweet_split[i_idx + 1]
    noun_phrase = " ".join(tweet_split[i_idx + 2:])
    tweet_annotation_sentiment_distance.append((tweet, annotation, sentiment_distance(tweet, verb, noun_phrase)))

  if not tweet_annotation_sentiment_distance:
    raise ValueError("define_threshold needs at least one tweet")

  # scaling of the data
  sentiment_max = max([x[2] for x in tweet_annotation_sentiment_distance])
  print(sentiment_max)
  if sentiment_max > 0:
    tweet_annotation_sentiment_distance = [(t,a, sentiment_dist/sentiment_max) for t, a, sentiment_dist in tweet_annotation_sentiment_distance]
  # else: every distance is 0, so dividing would be 0/0; keep the zeros as they are.
  print(tweet_annotation_sentiment_distance)

  n_tweets = len(tweet_annotation_sentiment_distance)
  threshold_accuracy_dict = {}
  for threshold in threshold_list:
    correct = sum(
      bool(is_sarc(sentiment_dist, threshold) == bool(annotation))
      for _, annotation, sentiment_dist in tweet_annotation_sentiment_distance
    )
    threshold_accuracy_dict[threshold] = correct / n_tweets

  return threshold_accuracy_dict, tweet_annotation_sentiment_distance

In [None]:
# Candidate thresholds on the scaled distances: 0 to about 1 in steps of `gap`.
gap = 0.001
threshold_list = list(np.arange(0, 1 + gap, gap))

# Score all tweets and compute the accuracy of every candidate threshold.
threshold_accuracy_dict, tweet_annotation_sentiment_distance = define_threshold(df["tweet"], df["sarcastic"], threshold_list)
# max() returns the first key with the highest accuracy; the dict is filled in
# threshold_list order, so ties resolve to the smallest threshold.
best_threshold = max(threshold_accuracy_dict, key=threshold_accuracy_dict.get)
print("Best threshold:", best_threshold)
print("Accuracy:", threshold_accuracy_dict[best_threshold])

6.987896
[('i looove getting 3 hours of sleep because two jobs', 1, 0.7442865), ('i hate people who use big words just to make themselves look perspicacious', 1, 0.064811245), ('i love that girl who never liked even my dp', 1, 0.91172355), ('I hate that I wasted my whole weekend 😠', 0, 0.13130224), ('i love being ignored', 1, 0.80049145), ('I hate this guy so much https://t.co/lHPKXjNLL4', 0, 0.3619911), ('I hate living in such a cold place', 0, 0.053097803), ('I hate seeing my favourite game company go down the drain.', 0, 0.14841583), ('yes because i loved to be ignored', 1, 0.8455245), ("i'm loving the ceasefire in syria", 1, 0.5935277), ('I hate it when a restaurant ignores my dietary requirements', 0, 0.0770024), ('happy international lesbian day i love loving women 😍', 0, 0.018772366), ('I hate paying so much for gas.', 0, 0.03051175), ('i hate it here ❤️❤️❤️❤️❤️', 1, 0.942194), ('I would hate to be stuck in my room for another lockdown', 0, 0.7133869), ('i love scary movies by m

In [None]:
import json

# Unpack each (tweet, annotation, distance) tuple into a list and cast the
# distance to a built-in float before serialising.
data_as_lists = [
    [tweet_text, gold_label, float(scaled_distance)]
    for tweet_text, gold_label, scaled_distance in tweet_annotation_sentiment_distance
]

# Write the rows to a JSON file
with open('tweet_annotation_sentiment_distance_exp1.json', 'w') as json_file:
    json.dump(data_as_lists, json_file)

In [None]:
# Create both result columns up front, filled with None.
df['prediction'] = None
df['correct'] = None
# Rows are addressed by label i through df.at, and the i-th scored tuple is
# assumed to belong to the row labelled i.
# NOTE(review): this relies on df having labels 0..n-1 and exactly one row per
# scored tweet, so run it before the summary rows are appended further down.
for i in range(df.shape[0]):
  # 1 when the scaled distance exceeds the chosen threshold, else 0.
  df.at[i, 'prediction'] = int(is_sarc(tweet_annotation_sentiment_distance[i][2], best_threshold))
  # 1 when the prediction equals the gold 'sarcastic' label, else 0.
  df.at[i, 'correct'] = int(df.at[i, 'prediction'] == df.at[i, 'sarcastic'])

In [None]:
from scipy.stats import binomtest

# One-sided ('greater') binomial test: k = number of correct predictions,
# n = number of rows, null success probability = mean of the 'sarcastic' column.
# NOTE(review): presumably the null rate stands for an "always predict
# sarcastic" baseline - confirm that this is the intended comparison.
result = binomtest(sum(df['correct']), df.shape[0], np.mean(df['sarcastic']), alternative='greater')

print(f"p-value: {result.pvalue}")
print(f"Test statistic: {result.statistic}")
print(result)

p-value: 0.030909196366461937
Test statistic: 0.8549618320610687
BinomTestResult(k=112, n=131, alternative='greater', statistic=0.8549618320610687, pvalue=0.030909196366461937)


In [None]:
# Append two summary rows below the data. The labels are fixed up front
# instead of re-reading df.shape[0] after each insertion:
#   summary_row holds the mean of 'correct' and the mean of 'sarcastic',
#   pvalue_row holds the formatted p-value of the binomial test.
summary_row = df.shape[0] + 1
pvalue_row = summary_row + 1

df.at[summary_row, 'correct'] = np.mean(df['correct'])
df.at[summary_row, 'sarcastic'] = np.mean(df['sarcastic'])
df.at[pvalue_row, 'correct'] = f"P-value: {round(result.pvalue, 3)}"

In [None]:
df

Unnamed: 0,ID,original_ID,tweet,sarcastic,rephrase,prediction,correct
0,6440.0,sign_6941,i looove getting 3 hours of sleep because two ...,1.00000,i hate getting 3 hours of sleep because two jobs,1,1
1,12874.0,sign_12479,i hate people who use big words just to make t...,1.00000,i hate people who use big words just to make t...,0,0
2,4295.0,sign_12125,i love that girl who never liked even my dp,1.00000,i love that girl who never liked even my dp,1,1
3,4147.0,train_2222,I hate that I wasted my whole weekend 😠,0.00000,,0,1
4,11437.0,sign_5017,i love being ignored,1.00000,i hate being ignored,1,1
...,...,...,...,...,...,...,...
128,7159.0,train_2825,i LOVE grocery shopping,0.00000,,1,0
129,13743.0,train_2657,i love video games,0.00000,,0,1
130,6082.0,train_2408,@OfficialPLT I'd love to get some awesome acti...,0.00000,,0,1
132,,,,0.78626,,,0.854962


In [None]:
df.to_csv('exp1_results_27112024.csv', index=False)

# Test

In [None]:
def sentiment_distance(tweet, verb, noun_phrase):
  """Return the verb contribution, the noun-phrase contribution and the
  Euclidean distance between them, as a 3-tuple.

  A fragment's contribution is the score of the full tweet minus the score of
  the tweet with every occurrence of that fragment removed via ``str.replace``.
  """
  full_scores = classify_sentiment(tweet)

  def contribution(fragment):
    # TODO (from the original author): change how the fragment is removed (ask Samba)
    return full_scores - classify_sentiment(tweet.replace(fragment, ""))

  v_sentiment = contribution(verb)
  np_sentiment = contribution(noun_phrase)
  distance = np.linalg.norm(v_sentiment - np_sentiment)

  return v_sentiment, np_sentiment, distance

def is_sarc(sentiment_dist, threshold):
  """Tell whether ``sentiment_dist`` lies strictly above ``threshold``."""
  exceeds_threshold = sentiment_dist > threshold
  return exceeds_threshold

In [None]:
def define_threshold(tweets, gold_annotations, threshold_list):
  """
  Score every tweet and evaluate each candidate threshold against the gold labels,
  also keeping the per-tweet verb and noun-phrase sentiment contributions.

  For each tweet the verb is taken to be the word right after the first "i"
  token (or, failing that, the first token starting with "i'"), and the noun
  phrase is everything after the verb. ``sentiment_distance`` is expected to
  return (verb contribution, noun-phrase contribution, distance); the distances
  are then divided by the largest distance.

  Args:
    tweets: iterable of tweet strings.
    gold_annotations: iterable of sarcasm labels (truthy = sarcastic), aligned with ``tweets``.
    threshold_list: iterable of thresholds to evaluate on the scaled distances.

  Returns:
    (threshold_accuracy_dict, tweet_annotation_sentiment_distance, verb_sent_np_sent):
    a dict mapping each threshold to its accuracy, the list of
    (tweet, annotation, scaled distance) tuples, and the list of
    ((verb, verb contribution), (noun phrase, noun-phrase contribution)) pairs,
    all in input order.

  Raises:
    ValueError: if no tweets are given, or a tweet has no "i"/"i'..." token
      followed by at least one more word.
  """
  # NOTE: a parse_tweet()-based verb/complement split was prototyped here and
  # disabled; the positional heuristic below is what actually runs.
  verb_sent_np_sent = []
  tweet_annotation_sentiment_distance = []
  for tweet, annotation in zip(tweets, gold_annotations):
    tweet_split = tweet.split()
    lowered = [x.lower() for x in tweet_split]
    if "i" in lowered:
      i_idx = lowered.index("i")
    else:
      # Fall back to contracted forms such as "i'm" / "i'd".
      i_idx = next((i for i, x in enumerate(lowered) if x.startswith("i'")), None)
      if i_idx is None:
        raise ValueError(f"no first-person subject found in tweet: {tweet!r}")
    if i_idx + 1 >= len(tweet_split):
      raise ValueError(f"no word follows the first-person subject in tweet: {tweet!r}")
    verb = tweet_split[i_idx + 1]
    noun_phrase = " ".join(tweet_split[i_idx + 2:])
    verb_sent, np_sent, sent_dist = sentiment_distance(tweet, verb, noun_phrase)
    tweet_annotation_sentiment_distance.append((tweet, annotation, sent_dist))
    verb_sent_np_sent.append(((verb, verb_sent), (noun_phrase, np_sent)))

  if not tweet_annotation_sentiment_distance:
    raise ValueError("define_threshold needs at least one tweet")

  # scaling of the data
  sentiment_max = max([x[2] for x in tweet_annotation_sentiment_distance])
  print(sentiment_max)
  if sentiment_max > 0:
    tweet_annotation_sentiment_distance = [(t,a, sentiment_dist/sentiment_max) for t, a, sentiment_dist in tweet_annotation_sentiment_distance]
  # else: every distance is 0, so dividing would be 0/0; keep the zeros as they are.
  print(tweet_annotation_sentiment_distance)

  n_tweets = len(tweet_annotation_sentiment_distance)
  threshold_accuracy_dict = {}
  for threshold in threshold_list:
    correct = sum(
      bool(is_sarc(sentiment_dist, threshold) == bool(annotation))
      for _, annotation, sentiment_dist in tweet_annotation_sentiment_distance
    )
    threshold_accuracy_dict[threshold] = correct / n_tweets

  return threshold_accuracy_dict, tweet_annotation_sentiment_distance, verb_sent_np_sent

In [None]:
# Candidate thresholds on the scaled distances: 0 to about 1 in steps of `gap`.
gap = 0.001
threshold_list = list(np.arange(0, 1 + gap, gap))

# Same search as before, using the define_threshold variant that also returns
# the per-tweet verb / noun-phrase sentiment contributions.
threshold_accuracy_dict, tweet_annotation_sentiment_distance, verb_sent_np_sent = define_threshold(df["tweet"], df["sarcastic"], threshold_list)
# max() returns the first key with the highest accuracy; the dict is filled in
# threshold_list order, so ties resolve to the smallest threshold.
best_threshold = max(threshold_accuracy_dict, key=threshold_accuracy_dict.get)
print("Best threshold:", best_threshold)
print("Accuracy:", threshold_accuracy_dict[best_threshold])

6.987896
[('i looove getting 3 hours of sleep because two jobs', 1, 0.7442865), ('i hate people who use big words just to make themselves look perspicacious', 1, 0.064811245), ('i love that girl who never liked even my dp', 1, 0.91172355), ('I hate that I wasted my whole weekend 😠', 0, 0.13130224), ('i love being ignored', 1, 0.80049145), ('I hate this guy so much https://t.co/lHPKXjNLL4', 0, 0.3619911), ('I hate living in such a cold place', 0, 0.053097803), ('I hate seeing my favourite game company go down the drain.', 0, 0.14841583), ('yes because i loved to be ignored', 1, 0.8455245), ("i'm loving the ceasefire in syria", 1, 0.5935277), ('I hate it when a restaurant ignores my dietary requirements', 0, 0.0770024), ('happy international lesbian day i love loving women 😍', 0, 0.018772366), ('I hate paying so much for gas.', 0, 0.03051175), ('i hate it here ❤️❤️❤️❤️❤️', 1, 0.942194), ('I would hate to be stuck in my room for another lockdown', 0, 0.7133869), ('i love scary movies by m

In [None]:
verb_sent_np_sent

[(('looove', array([-2.6308231, -1.0677862,  3.918785 ], dtype=float32)),
  ('getting 3 hours of sleep because two jobs',
   array([ 0.22886801,  0.02839899, -0.28489327], dtype=float32))),
 (('hate', array([ 0.86576414, -0.45238686, -0.4018185 ], dtype=float32)),
  ('people who use big words just to make themselves look perspicacious',
   array([ 0.6025964 , -0.12202923, -0.5652803 ], dtype=float32))),
 (('love', array([-0.6063688 , -0.21395265,  0.8223051 ], dtype=float32)),
  ('that girl who never liked even my dp',
   array([ 3.8096228 , -0.07296796, -3.7678094 ], dtype=float32))),
 (('hate', array([ 0.1386416 , -0.22479899,  0.04654026], dtype=float32)),
  ('that I wasted my whole weekend 😠',
   array([ 0.8933282 , -0.58628815, -0.32979524], dtype=float32))),
 (('love', array([-0.32348263, -0.5317513 ,  0.7499713 ], dtype=float32)),
  ('being ignored',
   array([ 3.59578  , -0.3751608, -3.238125 ], dtype=float32))),
 (('hate', array([ 2.3714254 , -0.22858974, -2.2844605 ], dtype=f

# Rule-based Sarcasm Interpreter

In [None]:
# Final sarcasm classification function, for use once the best threshold has been established.
def interpret_sarcasm(tweet, verb, noun_phrase, threshold=0.528, sentiment_max=6.3885164):
    """Return a negated rewrite of ``tweet`` when it is judged sarcastic.

    The V/NP sentiment distance is computed as in ``sentiment_distance``,
    divided by ``sentiment_max`` and compared with ``threshold``.

    Args:
        tweet: full tweet text.
        verb: verb substring; every occurrence is removed to measure its contribution.
        noun_phrase: noun-phrase substring, handled the same way.
        threshold: cut-off on the scaled distance.
        sentiment_max: normaliser for the raw distance. The default is the
            value that used to be hard-coded here.
            NOTE(review): the threshold search in this notebook printed
            6.987896 as its maximum - confirm which value should apply.

    Returns:
        ``negator.negate_sentence(tweet)`` when the scaled distance exceeds
        ``threshold``, otherwise the string "The tweet is not sarcastic.".
    """
    tweet_sentiment = classify_sentiment(tweet)
    tweet_no_v_sentiment = classify_sentiment(tweet.replace(verb, ""))
    tweet_no_np_sentiment = classify_sentiment(tweet.replace(noun_phrase, ""))
    v_sentiment = tweet_sentiment - tweet_no_v_sentiment
    np_sentiment = tweet_sentiment - tweet_no_np_sentiment

    scaled_distance = np.linalg.norm(v_sentiment - np_sentiment) / sentiment_max
    if scaled_distance > threshold:
        return negator.negate_sentence(tweet)
    return "The tweet is not sarcastic."

In [None]:
# Experimental variant (redefines interpret_sarcasm): negates only the text that
# remains after removing the noun phrase, then appends the noun phrase again.
def interpret_sarcasm(tweet, verb, noun_phrase, threshold=0.528, sentiment_max=6.3885164):
    """Return a rewrite of ``tweet`` with only its non-NP part negated when it
    is judged sarcastic.

    The V/NP sentiment distance is computed as in ``sentiment_distance``,
    divided by ``sentiment_max`` and compared with ``threshold``.

    Args:
        tweet: full tweet text.
        verb: verb substring; every occurrence is removed to measure its contribution.
        noun_phrase: noun-phrase substring, handled the same way.
        threshold: cut-off on the scaled distance.
        sentiment_max: normaliser for the raw distance. The default is the
            value that used to be hard-coded here.
            NOTE(review): the threshold search in this notebook printed
            6.987896 as its maximum - confirm which value should apply.

    Returns:
        The negated remainder of the tweet, a space, and ``noun_phrase`` when
        the scaled distance exceeds ``threshold``; otherwise the string
        "The tweet is not sarcastic.".
    """
    # Computed once: used both for scoring and as the text to negate.
    tweet_no_np = tweet.replace(noun_phrase, "")

    tweet_sentiment = classify_sentiment(tweet)
    tweet_no_v_sentiment = classify_sentiment(tweet.replace(verb, ""))
    tweet_no_np_sentiment = classify_sentiment(tweet_no_np)
    v_sentiment = tweet_sentiment - tweet_no_v_sentiment
    np_sentiment = tweet_sentiment - tweet_no_np_sentiment

    scaled_distance = np.linalg.norm(v_sentiment - np_sentiment) / sentiment_max
    if scaled_distance > threshold:
        negated_part = negator.negate_sentence(tweet_no_np)
        return negated_part + " " + noun_phrase
    return "The tweet is not sarcastic."