In [None]:
'''
Evaluate retrieval over the tagged corpus on 100 topics and compute some token-length statistics.
'''

In [1]:
%%capture
# Setup cell: installs dependencies, starts PyTerrier and loads the tagged essays.
# %%capture (first line of the cell) suppresses the output of everything below,
# including the pip logs.
!pip install ijson
!pip install python-terrier
import pyterrier as pt
# Initialise PyTerrier only if it has not been started yet.
if not pt.started():
  pt.init()

import pandas as pd
# Tagged essays table; later cells read its 'claims', 'premises' and 'docno' columns.
df = pd.read_csv('/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-hollatz/tag-essays-fasttext.csv')


In [2]:
from xml.etree import ElementTree


def _read_topics(path):
    """Read one topics XML file and return the parallel lists ``(numbers, queries)``.

    Every child element of the XML root is treated as one topic. The text of
    its ``number`` element is taken verbatim; the text of its ``title`` element
    is lower-cased and its final character is dropped.
    """
    numbers, queries = [], []
    for topic in ElementTree.parse(path).getroot():
        # [:-1] removes the last character of every title -- presumably the
        # trailing '?'; TODO confirm that every title really ends with one.
        queries.append(topic.find('title').text.lower()[:-1])
        numbers.append(topic.find('number').text)
    return numbers, queries


titles = []
title_ids = []  # never filled in this cell; kept so the name stays defined
nums = []
# File order is preserved: topics2020.xml first, then topics.xml.
for topics_path in ('/workspace/src/evaluate_targers/topics2020.xml',
                    '/workspace/src/evaluate_targers/topics.xml'):
    file_nums, file_titles = _read_topics(topics_path)
    nums.extend(file_nums)
    titles.extend(file_titles)

# Query table with the 'qid' and 'query' columns handed to the retriever later on.
topics = pd.DataFrame({'qid': nums, 'query': titles})


In [6]:
import os

from nltk.tokenize import RegexpTokenizer

# Tokenizer matching runs of word characters; the token-length loop further
# down in the notebook uses it as well.
tokenizer = RegexpTokenizer(r'\w+')

qrels_path = "/workspace/src/evaluate_targers/qrels.qrels"
qrels = pt.io.read_qrels(qrels_path)

# Every docno that appears in the qrels, regardless of the topic it was judged for.
docnos = qrels.docno

# Replace missing values by '' so the concatenation yields text instead of NaN.
df = df.fillna('')
df['tagged'] = df['claims'] + " " + df['premises']

path_targer = '/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-hollatz/targer_indexes/'
index_dir = path_targer + "pd_index_essays-fasttext-100"
index_properties = os.path.join(index_dir, "data.properties")
if os.path.exists(index_properties):
    # An index from an earlier run is already on disk: re-use it instead of
    # indexing again, so the cell can be re-run. Delete the index directory to
    # force a rebuild after the tagged data has changed.
    indexref = pt.IndexRef.of(index_properties)
else:
    pd_indexer = pt.DFIndexer(index_dir)
    indexref = pd_indexer.index(df["tagged"], df["docno"])

index = pt.IndexFactory.of(indexref)
# TF-IDF retrieval for all topics loaded earlier in the notebook.
retr = pt.BatchRetrieve(index, controls = {"wmodel": "TF_IDF"})
res = retr.transform(topics)

res

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,88639,Sb0680508-Aa5189771,0,17.008380,should teachers get tenure
1,1,129716,S51530f3f-A4715d76f,1,16.965211,should teachers get tenure
2,1,309456,Sc065954f-A24a16870,2,16.883360,should teachers get tenure
3,1,309455,Sc065954f-A6deb09b6,3,16.851221,should teachers get tenure
4,1,309458,Sc065954f-Ae72bc9c6,4,16.778711,should teachers get tenure
...,...,...,...,...,...,...
98995,100,30720,Sfe8a2184-A8360f8e6,995,3.177694,do we need cash
98996,100,54528,Sf93b654a-Ab8b58361,996,3.177694,do we need cash
98997,100,196992,Sc198d90c-A678ac134,997,3.177694,do we need cash
98998,100,39923,Sb4d87f99-A9fc8f8f5,998,3.175548,do we need cash


In [7]:
# Score the complete run against the qrels. The result is bound to
# `eval_scores` rather than `eval` so the built-in eval() is not shadowed
# for the rest of the kernel session.
eval_scores = pt.Utils.evaluate(res, qrels, metrics=["ndcg_cut_5", "bpref", "ndcg_cut_25"])
eval_scores

{'ndcg_cut_5': 0.49772315645637155,
 'bpref': 0.6129421067550933,
 'ndcg_cut_25': 0.41942291837109175}

In [8]:
# Keep only retrieved rows whose docno occurs somewhere in the qrels, then
# score that reduced run with the same metrics as the full run.
# NOTE(review): the membership test ignores qid, so a document judged for one
# topic also survives in the rankings of topics it was never judged for --
# confirm this is the intended notion of "judged only".
res_copy = res[res['docno'].isin(docnos)].copy()

# Bound to `eval_judged` rather than `eval` to avoid shadowing the built-in.
eval_judged = pt.Utils.evaluate(res_copy, qrels, metrics=["ndcg_cut_5", "bpref", "ndcg_cut_25"])
eval_judged

{'ndcg_cut_5': 0.601382665412091,
 'bpref': 0.6129421067550933,
 'ndcg_cut_25': 0.6354244348568833}

In [10]:
import numpy as np

# Tagged essays with every NaN swapped for an empty string.
tagged_df = pd.read_csv(
    '/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-hollatz/tag-essays-fasttext.csv'
).replace(np.nan, '')

# From this cell on, `df` is the table read from corpus.csv; it replaces
# whatever `df` referred to before.
df = pd.read_csv(
    '/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-hollatz/corpus.csv'
)

In [11]:
all_claims = tagged_df['claims']
all_premises = tagged_df['premises']
# One string per row: the claims text, a single space, then the premises text.
text_claim_and_premise = [
    claim_text + " " + premise_text
    for claim_text, premise_text in zip(all_claims, all_premises)
]

In [12]:
# Two-column frame: the docno next to the concatenated claim/premise text.
df_claim_and_premise = pd.DataFrame({
    'docno': tagged_df['docno'],
    'text': text_claim_and_premise,
})
df_claim_and_premise

Unnamed: 0,docno,text
0,S1f6b58eb-A5c530110,"This week I have my hair in braids , much lik..."
1,S1a9db4fc-Acc4206f5,UN Women elected its first executive board Th...
2,S1a924cd4-A994e77ca,"Madam , October is Women 's History Month and..."
3,S1a3be6ac-A873cd7aa,"I rise today to recognize Frederick Couchie ,..."
4,S1beff81f-A1206f3ee,I find the continued existence of and the inc...
...,...,...
338615,Sca72da7d-A5cd811ac,the world would be a better place on self-este...
338616,Sca72da7d-Af008e63e,will be good enough to make up for the one to ...
338617,Sca72da7d-Ac77ea7a1,"a forfeit How disappointing , I did n't see t..."
338618,Sca72da7d-Ae83ab999,an It in some cases it is of course perfectly ...


In [14]:
# Token counts per document: len_orig for the corpus text, len_tag for the
# concatenated claim/premise text.
len_orig = []
len_tag = []

# NOTE(review): rows are paired purely by position, not by docno -- this
# assumes both frames list the documents in the same order; confirm.
first_empty_reported = False
for i, (tag_text, orig_text) in enumerate(zip(df_claim_and_premise['text'], df['text'])):
    n_orig_tokens = len(tokenizer.tokenize(orig_text))  # tokenise once, reuse below
    len_orig.append(n_orig_tokens)
    len_tag.append(len(tokenizer.tokenize(tag_text)))
    # Report the first original text without any token, but keep iterating:
    # stopping here would leave both lists truncated and silently skew the
    # statistics computed from them.
    if n_orig_tokens == 0 and not first_empty_reported:
        print(i)
        print(orig_text)
        print(tag_text)
        print('-------')
        first_empty_reported = True

In [15]:
lens = np.array([len_orig, len_tag])

# Each statistic is printed for the original texts first and for the tagged
# texts second, in the order listed here.
summary_functions = (
    ('Mean', np.mean),
    ('Variance', np.var),
    ('Standard deviation', np.std),
    ('Median', np.median),
    ('Max', np.max),
    ('Min', np.min),
)
for stat_name, stat_fn in summary_functions:
    print(f'{stat_name} of original lenghtes in tokens: {stat_fn(len_orig)}')
    print(f'{stat_name} of tagged lenghtes in tokens: {stat_fn(len_tag)}')


Mean of original lenghtes in tokens: 316.5804323430394
Mean of tagged lenghtes in tokens: 219.38701494300395
Variance of original lenghtes in tokens: 165157.84620620374
Variance of tagged lenghtes in tokens: 86206.95720957032
Standard deviation of original lenghtes in tokens: 406.3961690348517
Standard deviation of tagged lenghtes in tokens: 293.6102130539235
Median of original lenghtes in tokens: 140.0
Median of tagged lenghtes in tokens: 94.0
Max of original lenghtes in tokens: 16751
Max of tagged lenghtes in tokens: 15154
Min of original lenghtes in tokens: 1
Min of tagged lenghtes in tokens: 0
