In [1]:
import regex as re
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from functools import reduce
import difflib
import random
random.seed(10) # Don't change this!

In [2]:
with open("pattern.txt", "r") as f:
    pat = re.compile(f.read())

text_file = "tweeteval/datasets/stance/abortion/train_text.txt"

def convert_links(string):
    pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|www\.\w+\.\w{3}"
    return re.sub(pattern, '@link', string)

with open(text_file, "r") as f:
    text = convert_links(f.read().lower())

corpus = re.findall(pat, text)

In [3]:
def tokenize(pat, text):
    return re.findall(pat, text)

def convert_links(string):
    pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|www\.\w+\.\w{3}"
    return re.sub(pattern, '@link', string)

def statistics(corpus):
    counts = Counter(corpus)
    corpus_size = len(corpus)
    vocab_size = len(counts)
    return (counts, corpus_size, vocab_size, vocab_size/corpus_size)

In [4]:
def zipf(corpus):
    voc = Counter(corpus)
    frq = pd.DataFrame(voc.most_common(), columns=['token', 'frequency'])

    # Index in the sorted list
    frq['idx'] = frq.index + 1

    # Frequency normalised by corpus size
    frq['norm_freq'] = frq.frequency / len(corpus)

    # Cumulative normalised frequency
    frq['cumul_frq'] = frq.norm_freq.cumsum()

    sns.set_theme(style='whitegrid')

    # Plot: Cumulative frequency by index
    sns.relplot(x='idx', y='cumul_frq', data=frq)
    plt.show()

    # Plot: Cumulative frequency by index, top 10000 tokens
    sns.relplot(x='idx', y='cumul_frq', data=frq[:10000], kind='line')
    plt.show()

    # Plot: Log-log plot for Zipf's law
    frq['log_frq'] = np.log(frq.frequency)
    frq['log_rank'] = np.log(frq.frequency.rank(ascending=False))
    sns.relplot(x='log_rank', y='log_frq', data=frq)
    plt.show()

# 2 Characterising your data

In [5]:
# 
 
#• Corpus size, vocabulary size, type/token ratio.

irony_list = []
with open("tweeteval/datasets/irony/test_text.txt", "r") as f:
    irony_text = convert_links(f.read().lower())

irony_corpus = tokenize(pat, irony_text)

statistics(irony_corpus)
def datfr(corpus):
    voc = Counter(corpus)
    frq = pd.DataFrame(voc.most_common(), columns=['token', 'frequency'])

    # Index in the sorted list
    frq['idx'] = frq.index + 1

    # Frequency normalised by corpus size
    frq['norm_freq'] = frq.frequency / len(corpus)

    # Cumulative normalised frequency
    frq['cumul_frq'] = frq.norm_freq.cumsum()
    sns.set_theme(style='whitegrid')
    return(frq)
datfr(irony_corpus)

Unnamed: 0,token,frequency,idx,norm_freq,cumul_frq
0,@user,465,1,0.039815,0.039815
1,the,334,2,0.028598,0.068413
2,to,250,3,0.021406,0.089819
3,a,230,4,0.019693,0.109513
4,#not,211,5,0.018067,0.127579
...,...,...,...,...,...
3787,📚,1,3788,0.000086,0.999658
3788,📖,1,3789,0.000086,0.999743
3789,#stop,1,3790,0.000086,0.999829
3790,labeling,1,3791,0.000086,0.999914


In [6]:
import contractions
expanded_words = []   
for word in irony_corpus:
  # using contractions.fix to expand the shortened words
  expanded_words.append(contractions.fix(word))  
irony_corpus = expanded_words

In [7]:
print(irony_text.find("@link"))

12023


In [8]:
%%script echo skipping

stances = ["abortion", "atheism", "climate", "feminist", "hillary"]
stance_texts = []
for stance in stances:
    with open(f"tweeteval/datasets/stance/{stance}/test_text.txt") as f:
        stance_texts.append(f.read())
for text, stance in zip(stance_texts, stances):
    print(stance.title())
    corpus = tokenize(pat, text)
    stats = statistics(corpus)
    print(f"Corpus size: {stats[1]}\tDictionary size: {stats[2]}\tType/token ratio: {stats[3]:.3}")
    print(datfr(corpus).head(10))
    print()
    zipf(corpus)

skipping


## 3 Manual Annotation and Inter-Annotator Agreement

In [9]:
random.seed(10)

lines = []
with open("tweeteval/datasets/irony/train_text.txt") as f:
    for line in f:
        lines.append(line.strip())

random_lines_index = sorted(random.sample([i for i in range(0, len(lines))], k = 100))

true_vals = []
with open("tweeteval/datasets/irony/train_labels.txt") as f:
    for line in f:
        true_vals.append(int(line.strip()))

validation = []
for i in random_lines_index:
    validation.append(true_vals[i])

with open("manual_annotation.txt", "w") as f:
    for i in range(100):
        f.write(f"{random_lines_index[i]}@@@ {lines[random_lines_index[i]]}")
        f.write("\n")

$$
    \underbrace{\stackrel{1}{\text{in}}\quad\stackrel{2}{\text{the}}}_{\stackrel{\text{bigram}}{\text{2-gram}}}
    \qquad p(\text{the} \;|\; \underbrace{\phantom{text}\text{in}\phantom{text}}_{\text{firstorder MM}})
$$

In [10]:
from nltk.metrics.agreement import AnnotationTask

df = pd.read_csv("manual_annotations/annotation_results.csv")
df = df.drop(["Unnamed: 0"], axis = 1)
df["Gold"] = validation

In [11]:
tuple_list = []
for name in list(df)[1:]:
    tuples = list(zip([name] * len(df["idx"]), df["idx"], df[name]))
    tuple_list.extend(tuples)
print(tuple_list)

manual = AnnotationTask(data=tuple_list)

[('Marie', 16, 1), ('Marie', 60, 0), ('Marie', 130, 1), ('Marie', 133, 0), ('Marie', 140, 0), ('Marie', 151, 1), ('Marie', 181, 1), ('Marie', 182, 1), ('Marie', 266, 1), ('Marie', 295, 1), ('Marie', 311, 1), ('Marie', 331, 1), ('Marie', 395, 0), ('Marie', 462, 0), ('Marie', 489, 1), ('Marie', 544, 1), ('Marie', 548, 1), ('Marie', 568, 0), ('Marie', 587, 0), ('Marie', 613, 0), ('Marie', 640, 0), ('Marie', 656, 0), ('Marie', 675, 1), ('Marie', 681, 1), ('Marie', 715, 0), ('Marie', 730, 1), ('Marie', 782, 1), ('Marie', 798, 1), ('Marie', 844, 0), ('Marie', 918, 0), ('Marie', 965, 1), ('Marie', 976, 1), ('Marie', 980, 1), ('Marie', 983, 1), ('Marie', 1005, 1), ('Marie', 1023, 0), ('Marie', 1074, 1), ('Marie', 1093, 1), ('Marie', 1125, 0), ('Marie', 1136, 0), ('Marie', 1161, 0), ('Marie', 1233, 1), ('Marie', 1239, 1), ('Marie', 1240, 1), ('Marie', 1241, 0), ('Marie', 1287, 1), ('Marie', 1329, 1), ('Marie', 1342, 0), ('Marie', 1428, 0), ('Marie', 1454, 0), ('Marie', 1479, 0), ('Marie', 1485,

In [12]:
print(manual.weighted_kappa())
print(manual.multi_kappa())
print(manual.pi())

0.466084504887111
0.4639075776663895
0.46127946127946096


In [13]:
manual.kappa_pairwise("Frida", "Viggo")

0.6061359867330018

In [14]:
from statsmodels.stats import inter_rater as irr
import krippendorff as kd

statsmodels_df = df.drop(["idx"], axis = 1)
print("fleiss_alpha",irr.fleiss_kappa(irr.aggregate_raters(statsmodels_df)[0], method = "fleiss"))

statsmodels_trans_df = statsmodels_df.transpose()
print("krippendorff",kd.alpha(statsmodels_trans_df, level_of_measurement='nominal'))

fleiss_alpha 0.4612794612794613
krippendorff 0.46217732884399554


In [15]:
statsmodels_gold_df = statsmodels_df.copy()
statsmodels_gold_df["validation"] = validation
print("fleiss_alpha",irr.fleiss_kappa(irr.aggregate_raters(statsmodels_gold_df)[0], method = "fleiss"))

statsmodels_gold_trans_df = statsmodels_gold_df.transpose()
print("krippendorff",kd.alpha(statsmodels_gold_trans_df, level_of_measurement='nominal'))

fleiss_alpha 0.48623571811977573
krippendorff 0.48696966709389067


In [16]:
names = ["Marie", "Frida", "Magnus", "Viggo", "Gustav"]

tuple_list = []
for name in list(df)[1:]:
    tuples = list(zip([name] * len(df["idx"]), df["idx"], df[name]))
    tuple_list.extend(tuples)

manual_gold = AnnotationTask(data=tuple_list)

In [17]:
from itertools import combinations
from IPython.display import Markdown as md
from tabulate import tabulate
import numpy as np

combs = list(map(lambda x: (x[0], x[1], manual_gold.kappa_pairwise(*x)), combinations(names, 2)))
combs.extend(map(lambda x: (x[0], x[1], manual_gold.kappa_pairwise(*x)), zip(names, ["Gold"] * len(names))))
table = tabulate(combs, headers=["Annotator 1", "Annotator 2", "Kappa Pairwise"], tablefmt="github", floatfmt=".3f")
table += f"\n### Average Agreement with Gold: {np.mean(list(map(lambda x: x[2], combs[-5:]))):.3f}"
md(table)

| Annotator 1   | Annotator 2   |   Kappa Pairwise |
|---------------|---------------|------------------|
| Marie         | Frida         |            0.376 |
| Marie         | Magnus        |            0.408 |
| Marie         | Viggo         |            0.362 |
| Marie         | Gustav        |            0.333 |
| Frida         | Magnus        |            0.601 |
| Frida         | Viggo         |            0.606 |
| Frida         | Gustav        |            0.470 |
| Magnus        | Viggo         |            0.615 |
| Magnus        | Gustav        |            0.440 |
| Viggo         | Gustav        |            0.486 |
| Marie         | Gold          |            0.517 |
| Frida         | Gold          |            0.364 |
| Magnus        | Gold          |            0.545 |
| Viggo         | Gold          |            0.465 |
| Gustav        | Gold          |            0.403 |
### Average Agreement with Gold: 0.459

In [18]:
print(tabulate(combs, headers=["Annotator 1", "Annotator 2", "Kappa Pairwise"], tablefmt="html", floatfmt=".3f"))

<table>
<thead>
<tr><th>Annotator 1  </th><th>Annotator 2  </th><th style="text-align: right;">  Kappa Pairwise</th></tr>
</thead>
<tbody>
<tr><td>Marie        </td><td>Frida        </td><td style="text-align: right;">           0.376</td></tr>
<tr><td>Marie        </td><td>Magnus       </td><td style="text-align: right;">           0.408</td></tr>
<tr><td>Marie        </td><td>Viggo        </td><td style="text-align: right;">           0.362</td></tr>
<tr><td>Marie        </td><td>Gustav       </td><td style="text-align: right;">           0.333</td></tr>
<tr><td>Frida        </td><td>Magnus       </td><td style="text-align: right;">           0.601</td></tr>
<tr><td>Frida        </td><td>Viggo        </td><td style="text-align: right;">           0.606</td></tr>
<tr><td>Frida        </td><td>Gustav       </td><td style="text-align: right;">           0.470</td></tr>
<tr><td>Magnus       </td><td>Viggo        </td><td style="text-align: right;">           0.615</td></tr>
<tr><td>Magnu

In [19]:
for i in range(100):
    if sum(df.iloc[i,1:]) == 3:
        tweetnumber = random_lines_index[i]
        print("idx:",tweetnumber)
        print(f'Tweet: {lines[tweetnumber]}\nLabels: \n{df.iloc[i,1:]} \n')

idx: 266
Tweet: @user ha yeah which is your opinion, which like mine, means nothing ;) they're in.
Labels: 
Marie     1
Frida     1
Magnus    0
Gustav    0
Viggo     1
Gold      0
Name: 8, dtype: int64 

idx: 782
Tweet: oh lord!  RT @user RT @user Before becoming an actor, Tom Cruise wanted to be a Catholic priest.
Labels: 
Marie     1
Frida     0
Magnus    0
Gustav    0
Viggo     1
Gold      1
Name: 26, dtype: int64 

idx: 844
Tweet: @user (281): If I had feelings, you would have hurt them.
Labels: 
Marie     0
Frida     1
Magnus    1
Gustav    0
Viggo     0
Gold      1
Name: 28, dtype: int64 

idx: 980
Tweet: Carbon everywhere :) #carbon #hood  #sticker #strips #down #red #automotive #dope #cars...
Labels: 
Marie     1
Frida     1
Magnus    0
Gustav    1
Viggo     0
Gold      0
Name: 32, dtype: int64 

idx: 1093
Tweet: #notcies #eu Juncker receives birthday surprise: One million signatures opposing TTIP and CETA
Labels: 
Marie     1
Frida     1
Magnus    0
Gustav    1
Viggo     0
Gol

In [20]:
df

Unnamed: 0,idx,Marie,Frida,Magnus,Gustav,Viggo,Gold
0,16,1,1,1,1,1,1
1,60,0,1,0,0,0,0
2,130,1,0,0,0,0,1
3,133,0,0,0,0,0,0
4,140,0,0,0,1,0,0
...,...,...,...,...,...,...,...
95,2710,1,1,1,1,1,1
96,2730,0,1,0,0,1,0
97,2764,0,0,0,0,0,0
98,2788,0,1,0,1,0,0


In [21]:
sum(true_vals)/len(true_vals)

0.5048916841369672

In [22]:
len(true_vals)

2862

# 4 Automatic Prediction

In [23]:
train_labels = "/work/fyp2022g03GroupB2/tweeteval/datasets/irony/train_labels.txt"
irony_tweets_path = "/work/fyp2022g03GroupB2/tweeteval/datasets/irony/train_text.txt"
Y = np.loadtxt(train_labels)


In [31]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [30]:
#clf = make_pipeline(StandardScaler(), SGDClassifier(loss="log"))

#clf.fit(corpus,Y)

#X = CountVectorizer.fit_transform(xxxcorpus)
#print(X.toarray())

In [34]:
def preprocess(text):
    tweets = np.array(text.split("\n"))
    for i, tweet in enumerate(tweets):
        tweet = " ".join([contractions.fix(word.lower()) for word in tweet.split(" ")])
        tweets[i] = tweet
    return tweets

In [45]:
with open("pattern.txt", "r") as f:
    pat = f.read().rstrip()
clf = make_pipeline(CountVectorizer(token_pattern=pat), TfidfTransformer(), SGDClassifier(loss="log"))

In [46]:
with open("tweeteval/datasets/irony/train_text.txt", "r") as f:
    irony_text = f.read().rstrip()
train_labels = np.loadtxt("tweeteval/datasets/irony/train_labels.txt")
clf.fit(preprocess(irony_text), train_labels)

error: bad escape \p at position 305

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e4cdc3a5-dd4a-4d72-a71a-972cea883107' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>