# Installs and Imports

In [None]:
#!pip install negate==1.1.3

Collecting stanza
  Downloading stanza-1.9.2-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.9.2-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.0 stanza-1.9.2


In [4]:
import numpy as np
import pandas as pd
import json
#from negate import Negator

# Load Data and Models

## Tweet Data

In [5]:
with open('tweet_annotation_sentiment_distance_exp1.json', 'r') as json_file:
    exp1 = json.load(json_file)

with open('tweet_annotation_sentiment_distance_exp2.json', 'r') as json_file:
    exp2 = json.load(json_file)

with open('tweet_annotation_sentiment_distance_exp3.json', 'r') as json_file:
    exp3 = json.load(json_file)

tweet_annotation_sentiment_distance = exp1 + exp3

## Sentence Negator

In [None]:
negator = Negator()

# Brute-Force Algorithm for Threshold

In [6]:
def is_sarc(sentiment_dist, threshold):
  """Returns True if the sentiment distance is greater than a given threshold and False otherwise."""

  return sentiment_dist > threshold

In [7]:
import re

def define_threshold(tweet_annotation_sentiment_distance, threshold_list):
  """
  Creates a list of tuples containing 1) the tweet body, 2) the sarc/non-sarc gold annotation, 3) the sentiment distance between V and NP.
  Then, iterates over a list of thresholds and for each threshold calculates the accuracy between the gold annotations and the predicted values.
  Returns a dictionary of thresholds and accuracies for those thresholds.
  """

  # scaling of the data
  sentiment_max = max([x[2] for x in tweet_annotation_sentiment_distance])
  print(sentiment_max)
  tweet_annotation_sentiment_distance = [(t,a, sentiment_dist/sentiment_max) for t, a, sentiment_dist in tweet_annotation_sentiment_distance]
  print(tweet_annotation_sentiment_distance)

  threshold_accuracy_dict = {}
  for threshold in threshold_list:
    correct = 0
    for _, annotation, sentiment_dist in tweet_annotation_sentiment_distance:
      if is_sarc(sentiment_dist, threshold) == bool(annotation):
        correct += 1

    threshold_accuracy_dict[threshold] = correct / len(tweet_annotation_sentiment_distance)

  return threshold_accuracy_dict, tweet_annotation_sentiment_distance

In [8]:
gap = 0.001
threshold_list = list(np.arange(0, 1 + gap, gap))

threshold_accuracy_dict, tweet_annotation_sentiment_distance = define_threshold(tweet_annotation_sentiment_distance, threshold_list)
best_threshold = max(threshold_accuracy_dict, key=threshold_accuracy_dict.get)
print("Best threshold:", best_threshold)
print("Accuracy:", threshold_accuracy_dict[best_threshold])

1.0
[('i looove getting 3 hours of sleep because two jobs', 1, 0.7442864775657654), ('i hate people who use big words just to make themselves look perspicacious', 1, 0.06481124460697174), ('i love that girl who never liked even my dp', 1, 0.9117235541343689), ('I hate that I wasted my whole weekend 😠', 0, 0.13130223751068115), ('i love being ignored', 1, 0.800491452217102), ('I hate this guy so much https://t.co/lHPKXjNLL4', 0, 0.36199110746383667), ('I hate living in such a cold place', 0, 0.05309780314564705), ('I hate seeing my favourite game company go down the drain.', 0, 0.14841583371162415), ('yes because i loved to be ignored', 1, 0.8455244898796082), ("i'm loving the ceasefire in syria", 1, 0.5935276746749878), ('I hate it when a restaurant ignores my dietary requirements', 0, 0.0770023986697197), ('happy international lesbian day i love loving women 😍', 0, 0.018772365525364876), ('I hate paying so much for gas.', 0, 0.030511749908328056), ('i hate it here ❤️❤️❤️❤️❤️', 1, 0.94

In [9]:
df = pd.DataFrame(columns=['tweet', 'sarcastic'])
df['tweet'] = [x[0] for x in tweet_annotation_sentiment_distance]
df['sarcastic'] = [x[1] for x in tweet_annotation_sentiment_distance]
df

Unnamed: 0,tweet,sarcastic
0,i looove getting 3 hours of sleep because two ...,1
1,i hate people who use big words just to make t...,1
2,i love that girl who never liked even my dp,1
3,I hate that I wasted my whole weekend 😠,0
4,i love being ignored,1
...,...,...
164,I just absolutely LOVE how I've got to work ou...,1
165,I absolutely love it when its pouring down wit...,1
166,i just love everyone posting wholesome pics wi...,0
167,I just love the smell of one million 😍,0


In [10]:
df['prediction'] = None
df['correct'] = None
for i in range(df.shape[0]):
  df.at[i, 'prediction'] = int(is_sarc(tweet_annotation_sentiment_distance[i][2], best_threshold))
  df.at[i, 'correct'] = int(df.at[i, 'prediction'] == df.at[i, 'sarcastic'])

In [11]:
from scipy.stats import binomtest

result = binomtest(sum(df['correct']), df.shape[0], np.mean(df['sarcastic']), alternative='greater')

print(f"p-value: {result.pvalue}")
print(f"Test statistic: {result.statistic}")
print(result)

p-value: 0.009959797536517968
Test statistic: 0.8757396449704142
BinomTestResult(k=148, n=169, alternative='greater', statistic=0.8757396449704142, pvalue=0.009959797536517968)


In [12]:
df.at[df.shape[0]+1, 'correct'] = np.mean(df['correct'])
df.at[df.shape[0], 'sarcastic'] = np.mean(df['sarcastic'])
df.at[df.shape[0]+1, 'correct'] = f"P-value: {round(result.pvalue, 3)}"

In [13]:
df

Unnamed: 0,tweet,sarcastic,prediction,correct
0,i looove getting 3 hours of sleep because two ...,1.000000,1,1
1,i hate people who use big words just to make t...,1.000000,0,0
2,i love that girl who never liked even my dp,1.000000,1,1
3,I hate that I wasted my whole weekend 😠,0.000000,0,1
4,i love being ignored,1.000000,1,1
...,...,...,...,...
166,i just love everyone posting wholesome pics wi...,0.000000,0,1
167,I just love the smell of one million 😍,0.000000,0,1
168,I really love finishing my term with an period...,1.000000,0,0
170,,0.804734,,0.87574


In [None]:
df.to_csv('exp5_results_27112024.csv', index=False)

# Rule-based Sarcasm Interpreter

In [None]:
#Final sarcasm classification function for when we have established the best threshold
def interpret_sarcasm(tweet, verb, noun_phrase, threshold=0.528):
    tweet_sentiment = classify_sentiment(tweet)
    tweet_no_v_sentiment = classify_sentiment(tweet.replace(verb, ""))
    tweet_no_np_sentiment = classify_sentiment(tweet.replace(noun_phrase, ""))
    v_sentiment = tweet_sentiment - tweet_no_v_sentiment
    np_sentiment = tweet_sentiment - tweet_no_np_sentiment

    if np.linalg.norm(v_sentiment - np_sentiment)/6.3885164 > threshold:
      return negator.negate_sentence(tweet)
    else:
      return "The tweet is not sarcastic."

In [None]:
#TEST!!!!!!!!!!!!!!!!!!!!!!!!!!
#Final sarcasm classification function for when we have established the best threshold
def interpret_sarcasm(tweet, verb, noun_phrase, threshold=0.528):
    tweet_sentiment = classify_sentiment(tweet)
    tweet_no_v_sentiment = classify_sentiment(tweet.replace(verb, ""))
    tweet_no_np_sentiment = classify_sentiment(tweet.replace(noun_phrase, ""))
    v_sentiment = tweet_sentiment - tweet_no_v_sentiment
    np_sentiment = tweet_sentiment - tweet_no_np_sentiment

    if np.linalg.norm(v_sentiment - np_sentiment)/6.3885164 > threshold:
      tweet_no_np = tweet.replace(noun_phrase, "")
      negated_part = negator.negate_sentence(tweet_no_np)
      return negated_part + " " + noun_phrase
    else:
      return "The tweet is not sarcastic."