In [None]:
import regex as re
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from functools import reduce
import difflib
import random
random.seed(10) # Don't change this!

In [None]:
# Compile the tokenisation pattern stored in pattern.txt.
# NOTE(review): f.read() keeps any trailing newline as part of the pattern —
# confirm pattern.txt accounts for that (e.g. by using verbose mode).
with open("pattern.txt", "r", encoding="utf-8") as f:
    pat = re.compile(f.read())


text_file = "tweeteval/datasets/stance/abortion/train_text.txt"

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which differs between operating systems and can fail on or
# garble non-ASCII tweet text.
with open(text_file, "r", encoding="utf-8") as f:
    text = f.read()

# Every match of the pattern in the raw text, in order of appearance.
corpus = re.findall(pat, text)

In [None]:
def tokenize(pat, text):
    """Return every match of ``pat`` in ``text`` as a list of tokens.

    Args:
        pat: Regular expression, either a pattern string or a compiled pattern.
        text: The string to tokenise.

    Returns:
        The result of ``re.findall(pat, text)``: the matches in the order
        they occur in ``text``.
    """
    tokens = re.findall(pat, text)
    return tokens


def statistics(corpus):
    """Summarise a tokenised corpus.

    Args:
        corpus: Sequence of tokens (must support ``len``).

    Returns:
        A 4-tuple ``(counts, corpus_size, vocab_size, type_token_ratio)``:
        ``counts`` is a ``Counter`` mapping each token to its frequency,
        ``corpus_size`` is the number of tokens, ``vocab_size`` the number of
        distinct tokens, and ``type_token_ratio`` is
        ``vocab_size / corpus_size`` (``0.0`` for an empty corpus).
    """
    counts = Counter(corpus)
    corpus_size = len(corpus)
    vocab_size = len(counts)
    # An empty corpus has no tokens to divide by; report a ratio of 0.0
    # instead of raising ZeroDivisionError.
    type_token_ratio = vocab_size / corpus_size if corpus_size else 0.0
    return (counts, corpus_size, vocab_size, type_token_ratio)

In [None]:
def zipf(corpus, top_n=10000):
    """Plot the token frequency distribution of a corpus in three views.

    The three figures, shown one after another with ``plt.show()``, are:

    1. cumulative normalised frequency against rank for every token,
    2. the same curve restricted to the ``top_n`` most frequent tokens
       (drawn as a line),
    3. log frequency against log rank (the usual Zipf's-law plot).

    Args:
        corpus: Sequence of tokens (must support ``len``).
        top_n: How many of the most frequent tokens to include in the second
            figure. Defaults to 10000.

    Returns:
        None. As a side effect the seaborn theme is set to ``whitegrid``.
    """
    voc = Counter(corpus)
    frq = pd.DataFrame(voc.most_common(), columns=['token', 'frequency'])

    # 1-based position in the frequency-sorted list
    frq['idx'] = frq.index + 1

    # Frequency normalised by corpus size
    frq['norm_freq'] = frq.frequency / len(corpus)

    # Cumulative normalised frequency
    frq['cumul_frq'] = frq.norm_freq.cumsum()

    sns.set_theme(style='whitegrid')

    # Plot: cumulative frequency by index, all tokens
    grid = sns.relplot(x='idx', y='cumul_frq', data=frq)
    grid.set_axis_labels('Rank', 'Cumulative normalised frequency')
    grid.set(title='Cumulative frequency by rank')
    plt.show()

    # Plot: cumulative frequency by index, top_n most frequent tokens only
    grid = sns.relplot(x='idx', y='cumul_frq', data=frq.iloc[:top_n], kind='line')
    grid.set_axis_labels('Rank', 'Cumulative normalised frequency')
    grid.set(title=f'Cumulative frequency by rank (top {top_n} tokens)')
    plt.show()

    # Plot: log-log plot for Zipf's law.
    # rank() uses pandas' default tie handling (average rank), so tokens with
    # the same frequency share a single x position in this figure.
    frq['log_frq'] = np.log(frq.frequency)
    frq['log_rank'] = np.log(frq.frequency.rank(ascending=False))
    grid = sns.relplot(x='log_rank', y='log_frq', data=frq)
    grid.set_axis_labels('log(rank)', 'log(frequency)')
    grid.set(title="Zipf's law: log frequency by log rank")
    plt.show()

In [None]:
# 2 Characterising your data

#• Corpus size, vocabulary size, type/token ratio.

irony_list = []
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which can fail on or garble non-ASCII tweet text.
with open("tweeteval/datasets/irony/test_text.txt", "r", encoding="utf-8") as f:
    irony_text = f.read()

irony_corpus = tokenize(pat, irony_text)
# Only the last expression of a cell is displayed, so a bare statistics(...)
# call here would compute the numbers and silently discard them. Print them.
irony_stats = statistics(irony_corpus)
print(f"Corpus size: {irony_stats[1]}\tDictionary size: {irony_stats[2]}\tType/token ratio: {irony_stats[3]:.3}")
def datfr(corpus):
    """Build a frequency table for a tokenised corpus.

    Args:
        corpus: Sequence of tokens (must support ``len``).

    Returns:
        A DataFrame with one row per distinct token, most frequent first, and
        the columns ``token``, ``frequency``, ``idx`` (1-based position in the
        sorted list), ``norm_freq`` (frequency divided by the corpus size)
        and ``cumul_frq`` (running sum of ``norm_freq``).

    Side effects:
        Sets the seaborn theme to ``whitegrid``.
    """
    total_tokens = len(corpus)
    ranked_counts = Counter(corpus).most_common()

    table = pd.DataFrame(ranked_counts, columns=['token', 'frequency']).assign(
        idx=lambda t: t.index + 1,
        norm_freq=lambda t: t['frequency'] / total_tokens,
        cumul_frq=lambda t: t['norm_freq'].cumsum(),
    )

    sns.set_theme(style='whitegrid')
    return table
# Frequency table for the irony test corpus. This is the last expression of
# the cell, so the returned DataFrame is rendered as the cell's output.
datfr(irony_corpus)

In [None]:
%%script echo skipping

stances = ["abortion", "atheism", "climate", "feminist", "hillary"]
stance_texts = []
for stance in stances:
    with open(f"tweeteval/datasets/stance/{stance}/test_text.txt") as f:
        stance_texts.append(f.read())
for text, stance in zip(stance_texts, stances):
    print(stance.title())
    corpus = tokenize(pat, text)
    stats = statistics(corpus)
    print(f"Corpus size: {stats[1]}\tDictionary size: {stats[2]}\tType/token ratio: {stats[3]:.3}")
    print(datfr(corpus).head(10))
    print()
    zipf(corpus)

## 3 Manual Annotation and Inter-Annotator Agreement

In [None]:
# Re-seed so that re-running this cell always draws the same sample.
random.seed(10)

# Number of tweets drawn for manual annotation.
sample_size = 100

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which can fail on or garble non-ASCII tweet text.
with open("tweeteval/datasets/irony/train_text.txt", encoding="utf-8") as f:
    lines = [line.strip() for line in f]

# random.sample accepts any sequence, so the index range can be sampled
# directly; with the same seed this selects the same indices as sampling from
# a materialised list of the same length.
random_lines_index = sorted(random.sample(range(len(lines)), k=sample_size))

with open("tweeteval/datasets/irony/train_labels.txt", encoding="utf-8") as f:
    true_vals = [int(line.strip()) for line in f]

# Gold labels of the sampled tweets, in the same order as random_lines_index.
validation = [true_vals[i] for i in random_lines_index]

# One line per sampled tweet: "<line index>@@@ <tweet text>".
with open("manual_annotation.txt", "w", encoding="utf-8") as f:
    for idx in random_lines_index:
        f.write(f"{idx}@@@ {lines[idx]}")
        f.write("\n")

$$
    \underbrace{\stackrel{1}{\text{in}}\quad\stackrel{2}{\text{the}}}_{\stackrel{\text{bigram}}{\text{2-gram}}}
    \qquad p(\text{the} \;|\; \underbrace{\phantom{text}\text{in}\phantom{text}}_{\text{first-order MM}})
$$

In [None]:
from nltk.metrics.agreement import AnnotationTask

# Load the manual annotations, discard the "Unnamed: 0" column and append the
# gold labels of the sampled tweets as a final "Gold" column.
df = (
    pd.read_csv("manual_annotations/annotation_results.csv")
    .drop(columns=["Unnamed: 0"])
    .assign(Gold=validation)
)

In [None]:
# Build (coder, item, label) triples for the human annotators only.
# "Gold" is already a column of df when this cell runs, so slicing
# list(df)[1:] would treat the gold labels as an extra annotator and make
# this task identical to the gold-inclusive one built further down.
annotator_columns = [col for col in df.columns if col not in ("idx", "Gold")]
tuple_list = [
    (name, item, label)
    for name in annotator_columns
    for item, label in zip(df["idx"], df[name])
]
# Preview a few triples instead of dumping the whole list into the output.
print(f"{len(tuple_list)} triples, first 5: {tuple_list[:5]}")

manual = AnnotationTask(data=tuple_list)

In [None]:
# Print three agreement coefficients of the `manual` task, one per line:
# weighted kappa, multi-kappa and pi, in that order.
for agreement_metric in (manual.weighted_kappa, manual.multi_kappa, manual.pi):
    print(agreement_metric())

In [None]:
# Pairwise kappa between the coders "Frida" and "Viggo" on the `manual` task;
# as the last expression of the cell, the value is shown as the cell output.
manual.kappa_pairwise("Frida", "Viggo")

In [None]:
from statsmodels.stats import inter_rater as irr
import krippendorff as kd

# Ratings of the human annotators only, one row per tweet and one column per
# annotator. "Gold" has already been added to df, so it must be dropped along
# with "idx"; otherwise the gold labels are counted as another rater.
statsmodels_df = df.drop(columns=["idx", "Gold"])


# aggregate_raters returns (count table, categories); fleiss_kappa takes the
# table. The statistic is Fleiss' kappa, so it is labelled as such.
print("fleiss_kappa",irr.fleiss_kappa(irr.aggregate_raters(statsmodels_df)[0], method = "fleiss"))

# krippendorff.alpha is given the transposed matrix: one row per rater.
statsmodels_trans_df = statsmodels_df.transpose()

print("krippendorff",kd.alpha(statsmodels_trans_df, level_of_measurement='nominal'))

In [None]:
# Annotator ratings plus the gold labels as one additional rater.
# drop(..., errors="ignore") removes a "Gold" column if statsmodels_df still
# carries one, so the gold labels are counted exactly once (as "validation").
statsmodels_gold_df = statsmodels_df.drop(columns=["Gold"], errors="ignore")

statsmodels_gold_df["validation"] = validation

# The statistic is Fleiss' kappa, so it is labelled as such.
print("fleiss_kappa",irr.fleiss_kappa(irr.aggregate_raters(statsmodels_gold_df)[0], method = "fleiss"))

# krippendorff.alpha is given the transposed matrix: one row per rater.
statsmodels_gold_trans_df = statsmodels_gold_df.transpose()

print("krippendorff",kd.alpha(statsmodels_gold_trans_df, level_of_measurement='nominal'))

In [None]:
# Coder names used for the pairwise kappa computations further down.
# NOTE(review): presumably these match the annotator column names in
# annotation_results.csv — confirm.
names = ["Marie", "Frida", "Magnus", "Viggo", "Gustav"]

In [None]:
# (coder, item, label) triples for every column after the first one in df,
# grouped by column in column order.
tuple_list = [
    (name, item, label)
    for name in list(df)[1:]
    for item, label in zip(df["idx"], df[name])
]

manual_gold = AnnotationTask(data=tuple_list)

In [None]:
from itertools import combinations
from pprint import pprint
import numpy as np

# Pairwise kappa for every pair of annotators ...
annotator_pairs = [
    (first, second, manual_gold.kappa_pairwise(first, second))
    for first, second in combinations(names, 2)
]
# ... and for every annotator against the gold labels.
gold_pairs = [
    (name, "Gold", manual_gold.kappa_pairwise(name, "Gold"))
    for name in names
]
combs = annotator_pairs + gold_pairs
pprint(combs)

# Mean annotator-vs-gold kappa. Taken from gold_pairs directly rather than
# from a hard-coded combs[-5:] slice, so it stays correct if `names` changes.
print(np.mean([kappa for _, _, kappa in gold_pairs]))

In [None]:
# Report every sampled tweet whose label columns (everything after the first
# column of df) sum to exactly 3, together with its index in the source file.
for i in range(100):
    label_total = sum(df.iloc[i,1:])
    if label_total != 3:
        continue
    tweetnumber = random_lines_index[i]
    print("idx:",tweetnumber)
    print(f'Tweet: {lines[tweetnumber]}\nLabels: \n{df.iloc[i,1:]} \n')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e4cdc3a5-dd4a-4d72-a71a-972cea883107' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>