In [None]:
# Mount Google Drive (Colab only) so the '/content/gdrive/...' paths used by
# the later cells resolve.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle

## Classification accuracy

In [None]:
# Load the classified comments. The context manager closes the file handle as
# soon as unpickling finishes instead of leaving it open for the kernel's lifetime.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('/content/gdrive/MyDrive/DL/Facebook/dataset/classified_comments.pkl', 'rb') as pkl_file:
    comments = pickle.load(pkl_file)

In [None]:
# Peek at one record to see its fields (later cells read 'score', 'text' and 'page').
comments[0]

In [None]:
# Order the comments by ascending classifier score. A score below 0.5 is
# treated as "ad hominem" when predictions are built later in this notebook,
# so the head of the list feeds the ad-hominem pool and the tail the "none" pool.
comments = sorted(comments, key=lambda rec: rec['score'])

# 1000 lowest-scoring comments, in ascending order.
top_ah_comments = [comments[k] for k in range(1000)]
# 1000 highest-scoring comments, highest first.
top_none_comments = [comments[-(k + 1)] for k in range(1000)]

In [None]:
# Shuffle both pools in place with a fixed seed so the selection is reproducible.
# NOTE: the shuffle mutates the lists, so re-running this cell without first
# re-running the cell that builds the pools shuffles them a second time and
# changes which comments end up in each group.
np.random.RandomState(seed=42).shuffle(top_ah_comments)
np.random.RandomState(seed=42).shuffle(top_none_comments)

# Keep only comments with 20 to 80 whitespace-separated tokens (inclusive).
# X holds the low-score (ad-hominem) pool, Y the high-score pool.
X = [x for x in top_ah_comments if 20 <= len(x['text'].strip().split()) <= 80]
Y = [x for x in top_none_comments if 20 <= len(x['text'].strip().split()) <= 80]

# Bucket the filtered comments by source page, preserving the shuffled order.
# setdefault creates the list on first sight of a page; unlike a bare
# `except:` it cannot hide an unrelated error raised while appending.
pageX = dict()
pageY = dict()

for x in X:
    pageX.setdefault(x['page'], []).append(x)

for y in Y:
    pageY.setdefault(y['page'], []).append(y)

In [None]:
pages = ['DonaldTrump', 'FoxNews', 'Breitbart', 'joebiden', 'barackobama']

# Next unused index into pageX[page] / pageY[page] for every page.
cnt = {page: 0 for page in pages}
groups = []

for i in range(5):
    group = []
    # Two passes over the pages: each pass contributes one pageX and one pageY
    # comment per page, so a group ends up with 5 pages * 2 pools * 2 passes = 20.
    for _ in range(2):
        for k in pages:
            pos = cnt[k]
            group.extend((pageX[k][pos], pageY[k][pos]))
            cnt[k] = pos + 1
    # Per-group seed keeps the within-group order reproducible.
    np.random.RandomState(seed=i+40).shuffle(group)
    groups.append(group)

# Finally shuffle the order of the groups themselves (fixed seed).
np.random.RandomState(seed=42).shuffle(groups)

In [None]:
# The 20 column names for the yes/no question: the bare question text followed
# by '<question>.1' ... '<question>.19'. Presumably these mirror the suffixes
# pandas adds to repeated CSV headers - verify against the annotation files.
question = 'Do you think this is an ad-hominem comment?'
cols = [question] + [question + '.' + str(n) for n in range(1, 20)]

In [None]:
# Classifier predictions, flattened group by group in the order stored in
# `groups`: a score below 0.5 maps to 1, anything else to 0.
y_pred = np.array([
    1 if comment['score'] < 0.5 else 0
    for group_idx in range(5)
    for comment in groups[group_idx]
])

In [None]:
# Path template for the five Facebook annotation batches.
addr = '/content/gdrive/MyDrive/DL/Facebook/annotations/batch{}.csv'
y = []

for batch_no in range(1, 6):
    df = pd.read_csv(addr.format(batch_no))
    for col in cols:
        # Most frequent answer among the annotators for this comment.
        majority = df[col].value_counts().idxmax()
        # Only the exact string 'Yes' counts as positive; everything else is 0.
        y.append(1 if majority == 'Yes' else 0)

y = np.array(y)

In [None]:
# y: majority annotator label per comment; y_pred: thresholded classifier score.
# NOTE(review): assumes batchN.csv lists the comments in the same order as
# groups[N-1] - confirm against how the annotation forms were generated.
accuracy_score(y, y_pred)

In [None]:
# Precision for label 1 (majority answer 'Yes' / score below 0.5).
precision_score(y, y_pred)

In [None]:
# Recall for label 1 (majority answer 'Yes' / score below 0.5).
recall_score(y, y_pred)

In [None]:
# F1 for label 1 (majority answer 'Yes' / score below 0.5).
f1_score(y, y_pred)

## Phrase precision on Create Debate

In [None]:
# Take the header of the 4th CSV column and derive its 19 suffixed siblings
# ('<header>.1' ... '<header>.19'), one column name per comment in a batch.
# NOTE(review): `addr` points at the *Facebook* annotation files, yet these
# column names are applied to the CreateDebate CSVs further down - confirm
# that both exports share this header.
df = pd.read_csv(addr.format(1))
s = df.columns[3]
cols = [s] + [s + '.' + str(n) for n in range(1, 20)]

In [None]:
# collecting the phrases from the models

# phrases_bert gets one list per '@#$$#@@#@@#'-separated entry of each batch
# file: the entry's phrases plus the literal 'None'. Because 'None' is always
# present, a participant phrase that is exactly 'None' passes the membership
# test used when the two sources are compared.
# NOTE(review): a trailing separator in a file would add an extra empty entry
# and shift the indices of all later batches - confirm the total is 100.
phrases_bert = []
for batch in range(1, 6):
    # The context manager closes each file as soon as it has been read.
    with open(f'/content/gdrive/MyDrive/DL/trigrams/createdebate/batch{batch}/trigrams.txt', 'r', encoding='utf-8') as trigram_file:
        content = trigram_file.read()
    content = content.split('@#$$#@@#@@#')
    for x in content:
        x = x.split('$#$#$#$#$#$#@@@@')
        x.append('None')
        phrases_bert.append(x)

In [None]:
# collecting the phrases from the participants

# phrases_part[c] accumulates one list of phrases per participant for comment
# c, where c = batch_index * 20 + position of the comment inside the batch.
phrases_part = [[] for _ in range(100)]

for i in range(5):
    df = pd.read_csv(f'/content/gdrive/MyDrive/DL/CreateDebate/Politics/annotations/batch{i + 1}.csv')
    first_id = i * 20
    for _, df_usr in df.iterrows():
        for j in range(20):
            # NOTE(review): split(',') keeps any whitespace that follows a
            # comma, and the later comparison is an exact match - check the
            # raw answers for ', '-separated phrases.
            phrases_part[first_id + j].append(df_usr[cols[j]].split(','))

In [None]:
# A "case" is one participant's answer for one comment. It is "selected" when
# at least one of the participant's phrases is an exact member of the model's
# phrase list for that comment. Misses are printed for manual inspection.
total_cases = 0
sel_cases = 0

for cid in range(100):
    bert_p, users_p = phrases_bert[cid], phrases_part[cid]
    for user_p in users_p:
        total_cases += 1
        if any(p in bert_p for p in user_p):
            sel_cases += 1
            continue
        print(user_p, bert_p)
        print()

# Percentage of selected cases, then the raw counts.
print(100 * sel_cases / total_cases)
print(sel_cases, total_cases)

## Phrase precision on Facebook

In [None]:
# Reload the first Facebook annotation batch; only its column header is read
# from it before `df` is reassigned per batch further down.
# NOTE: relies on `addr`, which is defined in the classification-accuracy section.
df = pd.read_csv(addr.format(1))

In [None]:
# Header of the 4th CSV column - presumably the phrase-selection question;
# verify against the annotation export.
s = df.columns[3]

In [None]:
# Sanity check: the concatenation that builds `cols` next requires `s` to be a str.
type(s)

In [None]:
# `s` plus its 19 suffixed variants ('<s>.1' ... '<s>.19'): one column name
# per comment in a batch.
cols = [s] + [s + '.' + str(n) for n in range(1, 20)]

In [None]:
# collecting the phrases from the models

# phrases_bert gets one list per '@#$$#@@#@@#'-separated entry of each batch
# file: the entry's phrases plus the literal 'None'. Because 'None' is always
# present, a participant phrase that is exactly 'None' passes the membership
# test used when the two sources are compared.
phrases_bert = []
for batch in range(1, 6):
    # The context manager closes each file as soon as it has been read.
    with open(f'/content/gdrive/MyDrive/DL/trigrams/facebook/batch{batch}/trigrams.txt', 'r', encoding='utf-8') as trigram_file:
        content = trigram_file.read()
    content = content.split('@#$$#@@#@@#')
    for x in content:
        x = x.split('$#$#$#$#$#$#@@@@')
        x.append('None')
        phrases_bert.append(x)

In [None]:
# Sanity check: the comparison below indexes phrases_bert[0..99]
# (5 batches * 20 comments), so this should show 100.
len(phrases_bert)

In [None]:
# collecting the phrases from the participants

# phrases_part[c] accumulates one list of phrases per participant for comment
# c, where c = batch_index * 20 + position of the comment inside the batch.
phrases_part = [[] for _ in range(100)]

for i in range(5):
    df = pd.read_csv(f'/content/gdrive/MyDrive/DL/Facebook/annotations/batch{i + 1}.csv')
    first_id = i * 20
    for _, df_usr in df.iterrows():
        for j in range(20):
            # NOTE(review): split(',') keeps any whitespace that follows a
            # comma, and the later comparison is an exact match - check the
            # raw answers for ', '-separated phrases.
            phrases_part[first_id + j].append(df_usr[cols[j]].split(','))

In [None]:
# Scratch check: `in` between two strings is a substring test (True here),
# whereas `p in bert_p` below tests exact membership in a list of phrases.
'a' in 'abde'

In [None]:
# A "case" is one participant's answer for one comment. It is "selected" when
# at least one of the participant's phrases is an exact member of the model's
# phrase list for that comment. Misses are printed for manual inspection.
total_cases = 0
sel_cases = 0

for cid in range(100):
    bert_p, users_p = phrases_bert[cid], phrases_part[cid]
    for user_p in users_p:
        total_cases += 1
        if any(p in bert_p for p in user_p):
            sel_cases += 1
            continue
        print(user_p, bert_p)
        print()

print(sel_cases, total_cases)

In [None]:
# Share of (participant, comment) cases with at least one matching phrase, in percent.
(sel_cases / total_cases) * 100