In [None]:
# Mount Google Drive into the Colab VM at /content/gdrive so the scraped data
# stored there is reachable; force_remount=True asks for a fresh mount even if
# one already exists.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Switch to the scraper output folder so the relative file names used below
# ("nytimes.pkl", "nytimes.csv", "bert_vs_ganbert.eps") resolve there.
# NOTE(review): this path is tied to one particular Drive layout — adjust it
# before running the notebook anywhere else.
%cd /content/gdrive/MyDrive/DL/Facebook/fbscraper/nytimes

In [None]:
import pickle as pkl
import pandas as pd 
import numpy as np
import nltk
import re
import matplotlib.pyplot as plt
nltk.download('punkt') # For tokenizers
from nltk.tokenize import TweetTokenizer

In [None]:
# The dump holds several pickled objects written back-to-back, so keep
# unpickling until the stream runs out.
# NOTE: pickle can execute arbitrary code while loading; only open dumps
# produced by a trusted source.
posts = []

with open("nytimes.pkl", "rb") as dump:
    while True:
        try:
            record = pkl.load(dump)
        except EOFError:
            # End of file reached: every record has been read.
            break
        posts.append(record)

In [None]:
# Flatten the post -> comment -> reply tree into one list of flat records.
# Every record has the keys: text, authorName, authorURL, time_, type_.
# NOTE(review): comments and replies are stamped with the parent post's
# "time" value — confirm whether they should carry their own timestamp.
texts = []

for post in posts:
    # Walk the tree in display order: the post, then each comment followed
    # immediately by that comment's replies.
    entries = [(post, "post")]
    for comment in post["comments"]:
        entries.append((comment, "comment"))
        entries.extend((reply, "reply") for reply in comment["replies"])

    for item, kind in entries:
        texts.append({
            "text": item["text"],
            "authorName": item["author"]["name"],
            "authorURL": item["author"]["url"],
            "time_": post["time"],
            "type_": kind,
        })

In [None]:
# Total number of flattened records (posts + comments + replies).
len(texts)

In [None]:
# One row per record; columns come from the record keys
# (text, authorName, authorURL, time_, type_).
df = pd.DataFrame(texts)

In [None]:
# Display the frame to eyeball the flattened records.
df

In [None]:
# Pre-processing text

tknz = TweetTokenizer()

def cleanText(text):
    """Normalize a raw post/comment string into space-separated word tokens.

    Steps, in order:
      1. lowercase the text;
      2. delete every run starting with "http" or "www" up to the next whitespace;
      3. turn hyphens into spaces and collapse whitespace runs to one space;
      4. replace straight and curly apostrophes (', U+2018, U+2019) with the
         placeholder "X" so they survive the punctuation filter below
         (after lowercasing, an ASCII "X" cannot occur in the text itself);
      5. tokenize with the module-level TweetTokenizer ``tknz``;
      6. strip every character that is neither alphanumeric nor a space from
         each token, drop tokens that end up empty or are a lone placeholder,
         and turn the remaining placeholders back into "'".

    Args:
        text: the raw string to clean. Anything that is not a ``str``
            (e.g. None or NaN for a record without text) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; "" when nothing is left
        or the input was not a string.
    """
    if not isinstance(text, str):
        # Missing text would otherwise crash on .lower().
        return ""

    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www\S+", "", text)
    text = text.replace("-", " ")
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)
    # Protect apostrophes (straight and curly) with the placeholder.
    text = re.sub("[\u2018\u2019']", "X", text)

    wordTokens = []
    for token in tknz.tokenize(text):
        token = ''.join(ch for ch in token if ch.isalnum() or ch == ' ')
        # Skip tokens that were pure punctuation or a stand-alone apostrophe.
        if token and token != 'X':
            wordTokens.append(token.replace('X', '\''))
    return ' '.join(wordTokens)

In [None]:
# Add a cleaned copy of each record's text next to the raw "text" column.
df["processedText"] = df["text"].apply(cleanText)

In [None]:
# Persist the flattened, cleaned records to the working directory. With the
# default to_csv settings the DataFrame index is written as the first column.
df.to_csv("nytimes.csv")

In [None]:
# Number of comments on each post.
nComments = [len(post["comments"]) for post in posts]

# Number of replies under each comment, across all posts.
nReplies = [
    len(comment["replies"])
    for post in posts
    for comment in post["comments"]
]

In [None]:
# Convert the count lists to NumPy arrays for the summary statistics
# computed in the next cell.
nComments = np.array(nComments)
nReplies = np.array(nReplies)

In [None]:
# Mean and median of comments per post, then of replies per comment.
nCommentsAvg, nCommentsMed = np.average(nComments), np.median(nComments)
nRepliesAvg, nRepliesMed = np.average(nReplies), np.median(nReplies)
print(nCommentsAvg, nCommentsMed, nRepliesAvg, nRepliesMed)

In [None]:
# Author name of every record (posts, comments and replies), in row order.
authorName = df['authorName'].tolist()

In [None]:
# Count how many records (posts, comments, replies) each author name has.
# dict.get supplies the starting value 0 for a first occurrence, so no
# exception handling is needed and unrelated errors (or an interrupt) are
# never swallowed mid-count.
authorNameDist = dict()
for name in authorName:
    authorNameDist[name] = authorNameDist.get(name, 0) + 1

In [None]:
# Per-author record counts, in the dictionary's iteration order.
commits = list(authorNameDist.values())

# Rank authors by record count, highest first. Whole (count, name) tuples are
# sorted, so equal counts are ordered by name, also descending.
authorNameDistSorted = sorted(
    ((count, author) for author, count in authorNameDist.items()),
    reverse=True,
)

# Show the ten most active authors as (count, name) tuples.
for entry in authorNameDistSorted[:10]:
    print(entry)

In [None]:
# Results table (reference only; kept as comments so this cell stays runnable).
# NOTE(review): GAN-BERT (90%) lists Macro F1 = 0.805, which does not equal the
# mean of its per-class F1 scores (0.822 and 0.814 -> 0.818) — please confirm.
#
# Model Accuracy Precision (AH) Recall (AH) F1 (AH) Precision (None) Recall (None) F1 (None) Macro F1
# BERT (10%) 0.721 0.796 0.594 0.680 0.676 0.848 0.752 0.716
# BERT (20%) 0.785 0.786 0.782 0.784 0.783 0.787 0.785 0.784
# BERT (30%) 0.804 0.812 0.790 0.801 0.796 0.817 0.806 0.803
# BERT (40%) 0.811 0.794 0.840 0.816 0.830 0.782 0.805 0.811
# BERT (50%) 0.824 0.801 0.860 0.829 0.848 0.787 0.817 0.823
# BERT (60%) 0.831 0.818 0.850 0.834 0.844 0.811 0.827 0.830
# BERT (70%) 0.836 0.835 0.838 0.836 0.837 0.834 0.836 0.836
# BERT (80%) 0.832 0.838 0.822 0.830 0.825 0.841 0.833 0.831
# BERT (90%) 0.837 0.845 0.826 0.835 0.830 0.848 0.839 0.837
# BERT (100%) 0.839 0.837 0.840 0.839 0.840 0.837 0.838 0.838
# GAN-BERT (10%) 0.777 0.767 0.796 0.781 0.788 0.758 0.773 0.777
# GAN-BERT (20%) 0.792 0.802 0.775 0.788 0.782 0.809 0.795 0.792
# GAN-BERT (30%) 0.798 0.796 0.802 0.799 0.800 0.794 0.797 0.798
# GAN-BERT (40%) 0.798 0.788 0.815 0.801 0.808 0.781 0.794 0.798
# GAN-BERT (50%) 0.807 0.813 0.797 0.805 0.801 0.817 0.809 0.807
# GAN-BERT (60%) 0.813 0.818 0.804 0.811 0.807 0.821 0.814 0.812
# GAN-BERT (70%) 0.814 0.807 0.824 0.815 0.820 0.803 0.812 0.814
# GAN-BERT (80%) 0.820 0.825 0.811 0.818 0.814 0.828 0.821 0.820
# GAN-BERT (90%) 0.818 0.805 0.839 0.822 0.831 0.797 0.814 0.805
# GAN-BERT (100%) 0.839 0.837 0.840 0.839 0.840 0.837 0.838 0.838

In [None]:
# Fraction of the training set that is labeled: 2%-20% in 2-point steps,
# then 30%-100% in 10-point steps.
x = [0.02, 0.04, 0.06, 0.08, 0.1, 0.12, 0.14, 0.16, 0.18, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Macro-F1 of BERT at each labeled fraction in x.
y1 = [0.597, 0.514, 0.739, 0.728, 0.716, 0.751, 0.745, 0.756, 0.772, 0.784, 0.803, 0.811, 0.823, 0.830, 0.836, 0.831, 0.837, 0.838] #bert

# Macro-F1 of GAN-BERT at each labeled fraction in x.
# NOTE(review): two entries disagree with the results table in this notebook —
# x=0.1 is 0.766 here but 0.777 in the table, and x=0.9 is 0.805, which does
# not match the mean of the table's per-class F1 scores (0.822, 0.814 -> 0.818).
# Please confirm which values are correct.
y2 = [0.677, 0.745, 0.739, 0.752, 0.766, 0.782, 0.777, 0.792, 0.789, 0.792, 0.798, 0.798, 0.807, 0.812, 0.814, 0.820, 0.805, 0.838] #ganbert


In [None]:
# Macro-F1 against the labeled fraction for both models, written out as EPS.
fig, ax = plt.subplots()
ax.set_xlabel('fraction of labeled instances in training set')
ax.set_ylabel('macro-F1 score')

# One marker-and-line curve per model, sharing the same x values.
for scores, model_name in ((y1, 'BERT'), (y2, 'GAN-BERT')):
    ax.plot(x, scores, 'o-', label=model_name)

ax.grid()
ax.legend()
fig.savefig('bert_vs_ganbert.eps', format='eps')