# Analyzing The New York Times Facebook page activity during the 2016 US presidential debates



In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# force_remount=True asks Colab to remount even if a mount is already present.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Work from the Drive folder for the 2016 scrape so that the relative file
# names used in the following cells resolve against it.
%cd /content/gdrive/MyDrive/DL/Facebook/fbscraper/nytimes/2016

In [None]:
import pickle as pkl
import pandas as pd 
import numpy as np
import nltk
import re
import matplotlib.pyplot as plt
nltk.download('punkt') # For tokenizers
from nltk.tokenize import TweetTokenizer
import urllib.parse
from tqdm import tqdm

In [None]:
# Load every pickled post into memory.
#
# Each file is a stream of objects pickled back to back, so we keep calling
# pkl.load() on the same handle until it signals end-of-file.
# NOTE: unpickling can execute arbitrary code; only open files you produced
# yourself or otherwise trust.

posts = []

for pklName in ("2016pre.pkl", "2016post.pkl"):
    with open(pklName, "rb") as f:
        while True:
            try:
                posts.append(pkl.load(f))
            except EOFError:
                # No more pickled objects in this file.
                break

In [None]:
# Total number of records loaded from the two pickle files.
len(posts)

In [None]:
# Inspect the first record to see how a post is stored in the list.
posts[0]

In [None]:
# Transitioning from post-level data to entry-level data: every post, comment
# and reply becomes one flat record in `texts`, in nesting order
# (post, its first comment, that comment's replies, next comment, ...).


def _makeEntry(item, post, type_):
    """Build one flat record for a post, comment or reply.

    Args:
        item: the post/comment/reply dict; read for "text" and for
            "author" -> "name" / "url".
        post: the post that `item` belongs to (the post itself for posts);
            its "time" value is stored as the record's timestamp.
        type_: label stored in the record: "post", "comment" or "reply".

    Returns:
        dict with keys text, authorName, authorURL, time_, type_.

    Any exception raised by a missing or malformed field propagates to the
    caller, which counts the item as skipped.
    """
    return dict(
        text=item["text"],
        authorName=item["author"]["name"],
        authorURL=item["author"]["url"],
        # NOTE(review): comments and replies are stamped with the parent
        # post's time, not a time of their own -- confirm this is intended.
        time_=post["time"],
        type_=type_,
    )


texts = []  # records: {text, authorName, authorURL, time_, type_}

err = 0  # number of items skipped because their record could not be built

for post in posts:
    try:
        texts.append(_makeEntry(post, post, "post"))
    except Exception:
        # Best effort: skip the malformed item but keep count. `Exception`
        # (rather than a bare `except`) lets Ctrl-C / SystemExit through.
        err += 1
    for comment in post["comments"]:
        try:
            texts.append(_makeEntry(comment, post, "comment"))
        except Exception:
            err += 1
        for reply in comment["replies"]:
            try:
                texts.append(_makeEntry(reply, post, "reply"))
            except Exception:
                err += 1

In [None]:
# (number of flattened entries, number of items skipped because of errors)
len(texts), err

In [None]:
# One DataFrame row per entry in `texts`; the dict keys become the columns.
df = pd.DataFrame(texts)

In [None]:
# Display the freshly built DataFrame.
df

In [None]:
# Pre-processing text

tknz = TweetTokenizer()

# Maps the three apostrophe variants (', U+2018, U+2019) to the placeholder
# "X". The placeholder cannot collide with real text because the text is
# lower-cased before the mapping is applied.
_APOSTROPHE_TO_MARK = str.maketrans({"\u2018": "X", "\u2019": "X", "'": "X"})


def cleanText(text):
    """Normalise a raw post/comment string into space-separated tokens.

    Steps, in order:
      1. lower-case the text;
      2. delete every run of non-whitespace that starts with "http" or "www";
      3. turn hyphens into spaces and collapse whitespace runs to one space;
      4. replace apostrophes by the placeholder "X" so they survive step 6;
      5. tokenize with the module-level TweetTokenizer `tknz`;
      6. inside each token keep only alphanumeric characters and spaces;
      7. drop tokens that are empty or were a lone apostrophe, and turn the
         placeholder back into "'" in the remaining ones.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www\S+", "", text)
    text = text.replace("-", " ")
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)
    text = text.translate(_APOSTROPHE_TO_MARK)

    wordTokens = []
    for token in tknz.tokenize(text):
        token = "".join(ch for ch in token if ch.isalnum() or ch == " ")
        # A token equal to the bare placeholder was just an apostrophe.
        if token and token != "X":
            wordTokens.append(token.replace("X", "'"))
    return " ".join(wordTokens)

In [None]:
# Add a column holding the cleaned version of every entry's text.
df["processedText"] = df["text"].apply(cleanText)

In [None]:
# Display the DataFrame, now including the processedText column.
df

In [None]:
# extracting unique ID for the users

def extractUid(url):
    """Return the user identifier encoded in a Facebook profile URL.

    Two URL shapes are handled:

    * vanity URLs such as ``https://www.facebook.com/john.doe?fref=ufi``:
      the path with every ``/`` removed is the identifier;
    * numeric profiles such as
      ``https://www.facebook.com/profile.php?id=1234&fref=ufi``:
      the first value of the ``id`` query parameter is the identifier.

    The URL is parsed with ``urllib.parse.urlsplit`` instead of cutting off a
    fixed 25-character prefix, so the result no longer depends on the exact
    scheme and host (``http://``, ``m.facebook.com``, ...).

    Args:
        url: the author's profile URL.

    Returns:
        The identifier as a string.

    Raises:
        KeyError: if a ``profile.php`` URL carries no ``id`` parameter.
    """
    parts = urllib.parse.urlsplit(url)
    uid = parts.path.replace('/', '')
    if uid != 'profile.php':
        return uid
    # Numeric profile: the real identifier lives in the query string.
    return urllib.parse.parse_qs(parts.query)['id'][0]

In [None]:
# Add a column with the user identifier extracted from each author URL.
df["authorID"] = df["authorURL"].apply(extractUid)

In [None]:
# Display the DataFrame, now including the authorID column.
df

In [None]:
# Persist the entry-level table as 2016.csv in the current working directory.
# No `index` argument is given, so pandas' default applies and the row index
# is written as the first column.
df.to_csv("2016.csv")

In [None]:
# Number of comments under each post (one value per post).
nComments = [len(post["comments"]) for post in posts]

# Number of replies under each comment (one value per comment, all posts).
nReplies = [
    len(comment["replies"])
    for post in posts
    for comment in post["comments"]
]

In [None]:
# Convert the per-post / per-comment counts to NumPy arrays for the
# statistics computed next.
nComments = np.array(nComments)
nReplies = np.array(nReplies)

In [None]:
# Mean and median of comments per post and of replies per comment.
nCommentsAvg, nCommentsMed = np.average(nComments), np.median(nComments)
nRepliesAvg, nRepliesMed = np.average(nReplies), np.median(nReplies)
print(nCommentsAvg, nCommentsMed, nRepliesAvg, nRepliesMed)

In [None]:
# Author identifier of every entry, as a plain Python list (one per row).
authorID = df['authorID'].tolist()

In [None]:
# Number of entries (duplicates included: one ID per row of df).
len(authorID)

In [None]:
# Count how many entries each user authored; the keys of the resulting dict
# are the unique users.

authorIDDist = {}

for ID in authorID:
    authorIDDist[ID] = authorIDDist.get(ID, 0) + 1

In [None]:
# Number of distinct users.
len(authorIDDist)

In [None]:
# Rank users by number of entries, most active first. Sorting (count, ID)
# tuples in reverse means ties are ordered by ID, also descending.
authorIDDistSorted = [(count, uid) for uid, count in authorIDDist.items()]
authorIDDistSorted.sort(reverse=True)

# Show the ten most active users.
for entryCount, ID in authorIDDistSorted[:10]:
    print(f'{ID:30} - {entryCount}')