In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; force_remount=True is passed so an
# existing mount is replaced rather than reused.
drive.mount('/content/gdrive', force_remount=True)

In [None]:
import numpy as np
import pandas as pd
import pickle
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Work from the dataset folder on the mounted Drive: the JSON reads in
# get_comments() and every plt.savefig() call below use relative paths.
%cd /content/gdrive/MyDrive/DL/Facebook/dataset

In [None]:
# Module-level tallies; get_comments() updates all three in place.
comment_count = {}  # author -> number of level-1 comments written by the author
reply_count = {}    # author -> number of replies received on the author's comments
interaction = {}    # (comment author, reply author) -> number of such replies

In [None]:
def get_comments(country_name, page_name):
    """Fold one page's scraped comments into the global tallies.

    Reads ``{country_name}/{page_name}/Page Posts (Scraped User Comments).json``,
    drops rows that lack a comment text or a post link, and then updates the
    module-level dictionaries in place:

    * ``comment_count[author]`` grows by one per level-1 comment,
    * ``reply_count[author]`` grows by the number of replies to that comment,
    * ``interaction[(author, replier)]`` grows by one per reply.

    Parameters
    ----------
    country_name : str
        Directory (relative or absolute) that holds the per-page folders.
    page_name : str
        Name of the page folder inside ``country_name``.

    Returns
    -------
    None
        The results are only visible through the global dictionaries.

    Raises
    ------
    KeyError
        If the JSON lacks one of the selected columns, or a reply entry has
        no ``'Username'`` field.
    """
    df = pd.read_json(f'{country_name}/{page_name}/Page Posts (Scraped User Comments).json')
    df = df[['Comment Text', 'Post Link', 'Replies', 'Username']]
    df = df.dropna(subset=['Comment Text', 'Post Link'])

    authors = df['Username'].tolist()
    reply_lists = df['Replies'].tolist()

    for author, replies in zip(authors, reply_lists):
        # A missing/null "Replies" field is not a list; count it as no replies
        # instead of failing on len().
        if not isinstance(replies, list):
            replies = []

        # dict.get() keeps the "first time seen" case explicit, so unrelated
        # errors can no longer be mistaken for a missing key and reset a tally.
        comment_count[author] = comment_count.get(author, 0) + 1
        reply_count[author] = reply_count.get(author, 0) + len(replies)

        for reply in replies:
            pair = (author, reply['Username'])
            interaction[pair] = interaction.get(pair, 0) + 1

In [None]:
# Facebook pages whose scraped comment dumps are aggregated.
page_names = [
    'barackobama',
    'Breitbart',
    'DonaldTrump',
    'FoxNews',
    'joebiden',
]

# Every call adds one page's comments to the global tallies.
for page_name in page_names:
    get_comments(country_name='USA', page_name=page_name)

In [None]:
# (count, author) pairs ordered from the most to the least active commenter;
# equal counts fall back to comparing the author names.
s = [(total, author) for author, total in comment_count.items()]
s.sort(reverse=True)

In [None]:
# Table of the most active commenters (at most 20 rows).
print('Top users\t\tComment Frequency')
print('-' * 50)
# Slicing instead of range(20) avoids an IndexError when fewer than 20
# authors were collected.
for frequency, author in s[:20]:
    print(f'{author:30}\t{frequency}')

In [None]:
# cnt[c] = number of authors who wrote exactly c level-1 comments.
# At least 1000 slots are kept (later cells index cnt[1]..cnt[20] and sum the
# tail), but the list grows when an author exceeds that, instead of raising
# IndexError.
max_comments = max(comment_count.values(), default=0)
cnt = [0] * max(1000, max_comments + 1)
for n_comments in comment_count.values():
    cnt[n_comments] += 1

x = list(range(len(cnt)))

# Only counts 1..49 are drawn.
plt.figure(dpi=100)
plt.plot(x[1:50], cnt[1:50])
plt.xlabel('Comment count')
plt.ylabel('Number of authors')
plt.title('Distribution of Authors')
plt.show()

In [None]:
# Collapse the per-count histogram `cnt` into six buckets (keys double as
# the bar labels, in plotting order).
v = {
    '1': cnt[1],
    '2': cnt[2],
    '3-5': cnt[3] + cnt[4] + cnt[5],
    '6-10': sum(cnt[6:11]),
    '11-20': sum(cnt[11:21]),
    '>20': sum(cnt[21:]),
}

x = ['1', '2', '3-5', '6-10', '11-20', '>20']
y = [v[label] for label in x]

# Bar chart of authors per bucket, saved as EPS in the working directory.
plt.bar(x, y)
plt.xlabel('Number of level-1 arguments')
plt.ylabel('Number of authors')
plt.savefig('distribution_of_authors_wrt_level_1_comments_facebook.eps', format='eps')
print(v)

In [None]:
# (reply total, author) pairs, authors with the most direct replies first.
s = sorted(
    ((n_replies, author) for author, n_replies in reply_count.items()),
    reverse=True,
)
print('Tops users\t\tDirect replies')
print('-' * 30)
# Slicing instead of range(20) avoids an IndexError when fewer than 20
# authors were collected.
for n_replies, author in s[:20]:
    print(f'{author:30}{n_replies}')

In [None]:
# cnt[c] = number of authors whose comments received exactly c direct replies.
# This must be built from reply_count (not comment_count): the plot below is
# labelled 'Reply count'.
# At least 1000 slots are kept (later cells index cnt[1]..cnt[20] and sum the
# tail), but the list grows when an author exceeds that, instead of raising
# IndexError.
max_replies = max(reply_count.values(), default=0)
cnt = [0] * max(1000, max_replies + 1)
for n_replies in reply_count.values():
    cnt[n_replies] += 1

x = list(range(len(cnt)))

# Only counts 1..49 are drawn, so authors with zero replies (cnt[0]) are not
# shown.
plt.figure(dpi=100)
plt.plot(x[1:50], cnt[1:50])
plt.xlabel('Reply count')
plt.ylabel('Number of authors')
plt.title('Distribution of Authors')
plt.show()

In [None]:
# Group the per-count histogram `cnt` into six ranges; the keys are reused
# as bar labels.
x = ['1', '2', '3-5', '6-10', '11-20', '>20']
v = dict(zip(x, (
    cnt[1],
    cnt[2],
    cnt[3] + cnt[4] + cnt[5],
    sum(cnt[6:11]),
    sum(cnt[11:21]),
    sum(cnt[21:]),
)))
y = [v[label] for label in x]

# Bar chart of authors per range, saved as EPS in the working directory.
plt.bar(x, y)
plt.xlabel('Number of direct replies')
plt.ylabel('Number of authors')
plt.savefig('distribution_of_authors_wrt_direct_replies_facebook.eps', format='eps')
print(v)

In [None]:
def build_graph(n1=0, n2=0):
    """Create network of users for link farming detection.

    Authors are kept when they have at least ``n1`` level-1 comments in
    ``comment_count`` AND at least ``n2`` direct replies in ``reply_count``
    (both thresholds are inclusive). Kept authors are numbered 0..N-1 in the
    order they appear in ``comment_count``.

    Parameters
    ----------
    n1 : int, optional
        Minimum number of level-1 comments (inclusive). Default 0.
    n2 : int, optional
        Minimum number of direct replies (inclusive). Default 0.

    Returns
    -------
    matrix : list[list[int]]
        ``matrix[i][j]`` is ``interaction[(author_i, author_j)]``, i.e. how
        often author j replied to a comment by author i; 0 when absent.
    ntwrk : networkx.DiGraph
        One weighted edge ``i -> j`` per non-zero matrix entry. Authors
        without any edge are not added as nodes.
    author_map : dict
        Author name -> integer id.
    rev_map : list
        Integer id -> author name.
    author_count : int
        Number of kept authors (N).
    """
    # Authors passing both thresholds; an author missing from reply_count is
    # skipped, as before.
    A = [
        author
        for author, n_comments in comment_count.items()
        if n_comments >= n1
        and author in reply_count
        and reply_count[author] >= n2
    ]

    author_count = len(A)
    author_map = {author: idx for idx, author in enumerate(A)}  # name -> id
    rev_map = list(A)                                           # id -> name

    matrix = [[0] * author_count for _ in range(author_count)]

    # Walk the recorded interactions once instead of probing all N*N author
    # pairs; pairs involving a filtered-out author are ignored.
    edges = []
    for (commenter, replier), weight in interaction.items():
        i = author_map.get(commenter)
        j = author_map.get(replier)
        if i is None or j is None:
            continue
        matrix[i][j] = weight
        if weight != 0:
            edges.append((i, j, weight))

    # Row-major (i, j) order reproduces the node/edge insertion order of a
    # full matrix scan; (i, j) pairs are unique, so weights never tie-break.
    edges.sort()

    ntwrk = nx.DiGraph()
    ntwrk.add_weighted_edges_from(edges)

    return matrix, ntwrk, author_map, rev_map, author_count

In [None]:
# Threshold grid: lambdas are passed to build_graph() as n1 (minimum level-1
# comments), rhos as n2 (minimum direct replies).
lambdas = [1, 2, 5, 10, 20]
rhos = [1, 2, 5, 10, 20, 50, 100]

# String versions, used as tick labels by the heat-map cells.
lambdas_str = [str(l) for l in lambdas]
rhos_str = [str(r) for r in rhos]

L = len(lambdas)
R = len(rhos)

# One L x R table per statistic, indexed [lambda index][rho index].
count = [[0 for j in range(R)] for i in range(L)]        # authors kept
reciprocity = [[0 for j in range(R)] for i in range(L)]  # graph reciprocity
n_scc = [[0 for j in range(R)] for i in range(L)]        # strongly connected components

with tqdm(total=L*R) as pbar:
    for i, n1 in enumerate(lambdas):
        for j, n2 in enumerate(rhos):
            matrix, ntwrk, author_map, rev_map, author_count = build_graph(n1, n2)
            count[i][j] = author_count
            # networkx raises NetworkXError for reciprocity on a graph with no
            # edges, which strict thresholds can produce; record NaN
            # ("undefined") for that cell rather than aborting the sweep.
            if ntwrk.number_of_edges() > 0:
                reciprocity[i][j] = nx.algorithms.reciprocity(ntwrk)
            else:
                reciprocity[i][j] = float('nan')
            n_scc[i][j] = nx.number_strongly_connected_components(ntwrk)
            pbar.update(1)

In [None]:
# Heat map of `count` (authors kept) over the threshold grid:
# rows follow lambdas, columns follow rhos.
fig, ax = plt.subplots()
cax = ax.matshow(count, interpolation='nearest')
fig.colorbar(cax)
ax.set_xticks(np.arange(R))
ax.set_yticks(np.arange(L))
ax.set_xticklabels(rhos_str)
ax.set_yticklabels(lambdas_str)
# Raw strings keep the LaTeX backslashes literal; "\l" in a normal string is
# an invalid escape sequence and triggers a warning.
ax.set_ylabel(r"$\lambda$", rotation='horizontal')
ax.set_xlabel(r"$\rho$")
plt.setp(ax.get_xticklabels(), rotation=90)
fig.savefig('variation_in_number_of_authors_facebook.eps', format='eps')

In [None]:
# Heat map of `reciprocity` over the threshold grid:
# rows follow lambdas, columns follow rhos.
fig, ax = plt.subplots()
cax = ax.matshow(reciprocity, interpolation='nearest')
fig.colorbar(cax)
ax.set_xticks(np.arange(R))
ax.set_yticks(np.arange(L))
ax.set_xticklabels(rhos_str)
ax.set_yticklabels(lambdas_str)
# Raw strings keep the LaTeX backslashes literal; "\l" in a normal string is
# an invalid escape sequence and triggers a warning.
ax.set_ylabel(r"$\lambda$", rotation='horizontal')
ax.set_xlabel(r"$\rho$")
plt.setp(ax.get_xticklabels(), rotation=90)
fig.savefig('variation_in_reciprocity_facebook.eps', format='eps')

In [None]:
# Heat map of `n_scc` (number of strongly connected components) over the
# threshold grid: rows follow lambdas, columns follow rhos.
fig, ax = plt.subplots()
cax = ax.matshow(n_scc, interpolation='nearest')
fig.colorbar(cax)
ax.set_xticks(np.arange(R))
ax.set_yticks(np.arange(L))
ax.set_xticklabels(rhos_str)
ax.set_yticklabels(lambdas_str)
# Raw strings keep the LaTeX backslashes literal; "\l" in a normal string is
# an invalid escape sequence and triggers a warning.
ax.set_ylabel(r"$\lambda$", rotation='horizontal')
ax.set_xlabel(r"$\rho$")
plt.setp(ax.get_xticklabels(), rotation=90)
fig.savefig('variation_in_number_of_scc_facebook.eps', format='eps')