In [3]:
import re

line = 'A cat sat on the mat. His name was Måns.'

# Initialise lists
tokens = []
unmatchable = []

# Compile patterns for speedup
token_pat = re.compile(r'\w+')
skippable_pat = re.compile(r'\s+')  # typically spaces

# As long as there's any material left...
while line:
    # Try finding a skippable token delimiter first.
    skippable_match = re.search(skippable_pat, line)
    if skippable_match and skippable_match.start() == 0:
        # If there is one at the beginning of the line, just skip it.
        line = line[skippable_match.end():]
    else:
        # Else try finding a real token.
        token_match = re.search(token_pat, line)
        if token_match and token_match.start() == 0:
            # If there is one at the beginning of the line, tokenise it.
            tokens.append(line[:token_match.end()])
            line = line[token_match.end():]
        else:
            # Else there is unmatchable material here.
            # It ends where a skippable or token match starts, or at the end of the line.
            unmatchable_end = len(line)
            if skippable_match:
                unmatchable_end = skippable_match.start()
            if token_match:
                unmatchable_end = min(unmatchable_end, token_match.start())
            # Add it to unmatchable and discard from line.
            unmatchable.append(line[:unmatchable_end])
            line = line[unmatchable_end:]

print(tokens)
print(unmatchable)

['A', 'cat', 'sat', 'on', 'the', 'mat', 'His', 'name', 'was', 'Måns']
['.', '.']


In [5]:
#This is the ideal tokenization
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

sentence = 'Heroin is my passion.'

tknzr.tokenize(sentence)

['Heroin', 'is', 'my', 'passion', '.']

In [20]:
#Recreate
import re
import pathlib

file_path_train = pathlib.Path('datasets/offensive/train_text.txt')
file_path_validation = pathlib.Path('datasets/offensive/val_text.txt')

token_pattern = re.compile(r'\w+')

with open(file_path_train, 'r') as f:
    line = f.readline()
    tokens = []
    corpus_train =[]
    while line:
        line = f.readline()
#         print(line)
#         print("OUR beta tokenizer",re.findall(token_pattern,line))
#         print("DESIRED  tokenizer",tknzr.tokenize(line))
#         print('\n')
        corpus_train.append(tknzr.tokenize(line))
    


with open(file_path_validation, 'r') as f:
    line = f.readline()
    tokens = []
    corpus_val =[]
    while line:
        line = f.readline()
#         print(line)
#         print("OUR beta tokenizer",re.findall(token_pattern,line))
#         print("DESIRED  tokenizer",tknzr.tokenize(line))
#         print('\n')
        corpus_val.append(tknzr.tokenize(line))

In [23]:
from nltk.util import bigrams
#"Neibourghood of a word of size 2"
list(bigrams(corpus_train[0]))

[('@user', 'Eight'),
 ('Eight', 'years'),
 ('years', 'the'),
 ('the', 'republicans'),
 ('republicans', 'denied'),
 ('denied', 'obama'),
 ('obama', '’'),
 ('’', 's'),
 ('s', 'picks'),
 ('picks', '.'),
 ('.', 'Breitbarters'),
 ('Breitbarters', 'outrage'),
 ('outrage', 'is'),
 ('is', 'as'),
 ('as', 'phony'),
 ('phony', 'as'),
 ('as', 'their'),
 ('their', 'fake'),
 ('fake', 'president'),
 ('president', '.')]

In [30]:
from nltk.lm.preprocessing import pad_both_ends
#Now we are not skipping the first word and the last word and we know that they are at the beginning of a sentence and the end of a sentence
list(pad_both_ends(corpus_train[0], n=2))

list(bigrams(pad_both_ends(corpus_train[0], n=2)))

[('<s>', '@user'),
 ('@user', 'Eight'),
 ('Eight', 'years'),
 ('years', 'the'),
 ('the', 'republicans'),
 ('republicans', 'denied'),
 ('denied', 'obama'),
 ('obama', '’'),
 ('’', 's'),
 ('s', 'picks'),
 ('picks', '.'),
 ('.', 'Breitbarters'),
 ('Breitbarters', 'outrage'),
 ('outrage', 'is'),
 ('is', 'as'),
 ('as', 'phony'),
 ('phony', 'as'),
 ('as', 'their'),
 ('their', 'fake'),
 ('fake', 'president'),
 ('president', '.'),
 ('.', '</s>')]

In [162]:
from nltk.util import everygrams
#Know we can do whatever n-gram we want by changing max_len
padded_bigrams = list(pad_both_ends(corpus_train[0], n=2))
list(everygrams(padded_bigrams, max_len=2))

[('<s>',),
 ('<s>', '@user'),
 ('@user',),
 ('@user', 'Eight'),
 ('Eight',),
 ('Eight', 'years'),
 ('years',),
 ('years', 'the'),
 ('the',),
 ('the', 'republicans'),
 ('republicans',),
 ('republicans', 'denied'),
 ('denied',),
 ('denied', 'obama'),
 ('obama',),
 ('obama', '’'),
 ('’',),
 ('’', 's'),
 ('s',),
 ('s', 'picks'),
 ('picks',),
 ('picks', '.'),
 ('.',),
 ('.', 'Breitbarters'),
 ('Breitbarters',),
 ('Breitbarters', 'outrage'),
 ('outrage',),
 ('outrage', 'is'),
 ('is',),
 ('is', 'as'),
 ('as',),
 ('as', 'phony'),
 ('phony',),
 ('phony', 'as'),
 ('as',),
 ('as', 'their'),
 ('their',),
 ('their', 'fake'),
 ('fake',),
 ('fake', 'president'),
 ('president',),
 ('president', '.'),
 ('.',),
 ('.', '</s>'),
 ('</s>',)]

In [164]:
#Create a vocabulary
from nltk.lm.preprocessing import flatten
voc = list(flatten(pad_both_ends(sent, n=2) for sent in corpus_train))

len(voc)

325075

In [62]:
from nltk.lm.preprocessing import padded_everygram_pipeline

train, vocab = padded_everygram_pipeline(2, corpus_train)

#vocab is the same as voc

In [63]:
#Train our model
from nltk.lm import MLE
lm = MLE(2) #2 is the highest n-gram

lm.fit(train,vocab)


In [70]:
lm.vocab.lookup('unjj')
lm.vocab.lookup('the')

'the'

In [71]:
#Count up the ngrams from the training corpus

print(lm.counts)

<NgramCounter with 2 ngram orders and 638234 ngrams>


In [76]:
lm.counts['the']

7268

In [83]:
#How probable are words in certain contexts (relative frequency)

lm.score('the')

0.022357917403676073

In [84]:
#What is the chance that 'the' is preceded by 'have'

lm.score('the',['have'])

0.03650793650793651

In [85]:
#To avoid underflow we take logarithm

lm.logscore("the")

-5.483070379559561

In [93]:
#Generate random words from our vocabulary
lm.generate(5)

['Customs', 'Fees', '.', 'Guess', 'she']

## Task 3 try different models and count n-grams


In [180]:
#2 gram order
train, vocab = padded_everygram_pipeline(2, corpus_train)
lm2 = MLE(2) #2 is the highest n-gram

lm2.fit(train,vocab)
print(lm2.counts)

<NgramCounter with 2 ngram orders and 638234 ngrams>


In [134]:
#3 gram order
train, vocab = padded_everygram_pipeline(3, corpus_train)
lm3 = MLE(3)
lm3.fit(train,vocab)
print(lm3.counts)

<NgramCounter with 3 ngram orders and 1010973 ngrams>


In [137]:
#4 gram order
train, vocab = padded_everygram_pipeline(4, corpus_train)
lm4 = MLE(4)
lm4.fit(train,vocab)
print(lm4.counts)

<NgramCounter with 4 ngram orders and 1419460 ngrams>


## Task 4

In [152]:
lm2.generate(5)

['anybody', 'who', 'gave', 'it', 'was']

In [148]:
lm3.generate(5)

['GUILTY', 'of', 'CRIMINAL', 'BEHAVIOR', '&']

In [146]:
lm4.generate(5)

['that', "doesn't", 'mean', 'we', 'want']

# Task 5

In [179]:
lm2.score('the')

0.022357917403676073

In [154]:
lm3.score('the')

0.02083076579145732

In [155]:
lm4.score('the')

0.01949889869318746

In [183]:
lm2.perplexity([('a', 'b'), ('c', 'd')])

inf

## Annotation agreements

In [52]:
#lukas annotation
rater1=[]
with open('annotation/lukas.txt', 'r') as f:
    line = f.readline()
    while line:
        rater1.append(line.strip())
        line = f.readline()



#thomas annotation
rater2=[]
with open('annotation/thoma.txt', 'r') as f:
    line = f.readline()
    while line:
        rater2.append(line.strip())
        line = f.readline()

#martin annotation
rater3=[]
with open('annotation/martin.txt', 'r') as f:
    line = f.readline()
    while line:
        rater3.append(line.strip())
        line = f.readline()
#yasmine annotation
rater4=[]
with open('annotation/yasmine.txt', 'r') as f:
    line = f.readline()
    while line:
        rater4.append(line.strip().split()[1])
        line = f.readline()
    

        

In [41]:
#len(rater1)
#len(rater2)
#len(rater3)
#len(rater4)

113

In [67]:
from nltk import agreement
rater1 = rater1[0:100]
rater2 = rater2[0:100]
rater3 = rater3[0:100]
rater4 = rater4[0:100]

taskdata=[[0,str(i),str(rater1[i])] for i in range(0,len(rater1))]+[[1,str(i),str(rater2[i])] for i in range(0,len(rater2))]+[[2,str(i),str(rater3[i])] for i in range(0,len(rater3))]#+[[3,str(i),str(rater4[i])] for i in range(0,len(rater4))]


ratingtask = agreement.AnnotationTask(data=taskdata)

#Cohen 1960 Averages naively over kappas for each coder pair.
print("kappa " +str(ratingtask.kappa()))
print("fleiss " + str(ratingtask.multi_kappa()))
print("alpha " +str(ratingtask.alpha()))
print("scotts " + str(ratingtask.pi()))

#Average observed agreement across all coders and items
print('avg Ao', ratingtask.avg_Ao())


kappa 0.43337682467527944
fleiss 0.41626129256428096
alpha 0.39438657407407396
scotts 0.39236111111111127
avg Ao 0.7200000000000001
