# Preparing Facebook dataset
* __Objective__: Create a Facebook dataset for ad hominem detection using BERT
* __File Management__: Using Google Drive
* __Runtime Type__: CPU

In [None]:
# Install langid, the language-identification package used by the English filter below.
!pip install langid

In [None]:
import pandas as pd
import numpy as np
#import langid
from tqdm import tqdm
import pickle
#from langid.langid import LanguageIdentifier, model

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset folders are read from there.
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Work from the dataset folder so the relative '<country>/<page>/...' paths below resolve.
%cd /content/gdrive/MyDrive/DL/Facebook/dataset

In [None]:
# Facebook pages to load, grouped by country. Each name is used as a folder
# name when building the JSON paths in get_posts / get_comments.
page_names_usa = [
    'barackobama',
    'Breitbart',
    'DonaldTrump',
    'FoxNews',
    'joebiden',
]
page_names_india = [
    'BJP4India',
    'IndianNationalCongress',
    'rahulgandhi',
    'RepublicBharatHindi',
    'narendramodi',
]

# Accumulator for every post / comment / reply record across all pages.
comments = list()

In [None]:
def get_posts(country_name, page_name):
    """Load a page's posts and return them as a list of record dicts.

    Reads ``{country_name}/{page_name}/Page Posts.json``, keeps the
    'Post Text', 'Post Link', 'Post Owner' and 'Date Posted' columns, and
    drops rows whose text or link is missing.

    Each returned dict has the keys 'text', 'link', 'page', 'type'
    (always 'post'), 'username' (the page name itself) and 'time'.
    """
    posts = pd.read_json(f'{country_name}/{page_name}/Page Posts.json')
    # Selecting the columns up front also fails fast if one is absent.
    posts = posts[['Post Text', 'Post Link', 'Post Owner', 'Date Posted']]
    posts = posts.dropna(subset=['Post Text', 'Post Link'])

    rows = zip(
        posts['Post Text'].tolist(),
        posts['Post Link'].tolist(),
        posts['Date Posted'].tolist(),
    )
    return [
        {
            'text': post_text,
            'link': post_link,
            'page': page_name,
            'type': 'post',
            'username': page_name,
            'time': posted,
        }
        for post_text, post_link, posted in rows
    ]

In [None]:
def get_comments(country_name, page_name):
    """Load a page's scraped user comments and flatten them with their replies.

    Reads ``{country_name}/{page_name}/Page Posts (Scraped User Comments).json``,
    keeps the 'Comment Text', 'Post Link', 'Replies', 'Username' and
    'Comment Time' columns, and drops rows whose text or link is missing.

    Returns a list of dicts with the keys 'text', 'link', 'page', 'type',
    'username' and 'time'. Each top-level comment ('type' == 'comment') is
    followed immediately by its replies ('type' == 'reply'), which inherit
    the parent comment's post link.

    A 'Replies' value that is not a list (e.g. null in the JSON) is treated
    as "no replies". Replies without text are skipped, mirroring the dropna
    applied to top-level comments. A reply missing 'Username' or
    'Comment Time' gets None for that field.
    """
    df = pd.read_json(
        f'{country_name}/{page_name}/Page Posts (Scraped User Comments).json'
    )
    df = df[['Comment Text', 'Post Link', 'Replies', 'Username', 'Comment Time']]
    df = df.dropna(subset=['Comment Text', 'Post Link'])

    rows = zip(
        df['Comment Text'].tolist(),
        df['Post Link'].tolist(),
        df['Replies'].tolist(),
        df['Username'].tolist(),
        df['Comment Time'].tolist(),
    )
    incomments = []
    for text, link, replies, username, comment_time in rows:
        incomments.append({
            'text': text,
            'link': link,
            'page': page_name,
            'type': 'comment',
            'username': username,
            'time': comment_time,
        })
        # A null 'Replies' entry arrives as None/NaN, which has no len().
        if not isinstance(replies, list):
            continue
        for reply in replies:
            reply_text = reply.get('Comment Text')
            if reply_text is None:
                # Downstream steps need text; same rule as top-level comments.
                continue
            incomments.append({
                'text': reply_text,
                'link': link,
                'page': page_name,
                'type': 'reply',
                'username': reply.get('Username'),
                'time': reply.get('Comment Time'),
            })
    return incomments

In [None]:
# Collect posts, then comments and replies, for every USA page.
for page_name in page_names_usa:
    usa_posts = get_posts('USA', page_name)
    usa_comments = get_comments('USA', page_name)
    comments += usa_posts
    comments += usa_comments

In [None]:
# Number of records accumulated in `comments` at this point.
len(comments)

In [None]:
# Collect posts, then comments and replies, for every India page.
for page_name in page_names_india:
    india_posts = get_posts('India', page_name)
    india_comments = get_comments('India', page_name)
    comments += india_posts
    comments += india_comments

In [None]:
# Number of records accumulated in `comments` at this point.
len(comments)

In [None]:
# Show the first record to eyeball its fields.
comments[0]

In [None]:
# The langid imports in the import cell are commented out, so bring the two
# names this cell needs into scope here; otherwise it fails with NameError.
from langid.langid import LanguageIdentifier, model

# norm_probs=True is requested so classify() scores can be compared against
# the 0.9 threshold used by the filter loop.
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
# Records that pass the English-language filter.
english_comments = []

In [None]:
# Keep only the records langid classifies as English with a score of at least 0.9.
for record in tqdm(comments):
    lang, score = identifier.classify(record['text'])
    is_confident_english = lang == 'en' and score >= 0.9
    if is_confident_english:
        english_comments.append(record)

In [None]:
# Number of records that passed the English filter.
len(english_comments)

In [None]:
# Persist the English-only records to comments.pkl in the current directory.
with open('comments.pkl', 'wb') as pkl_out:
    pickle.dump(english_comments, pkl_out)

In [None]:
# Reload the saved records. The `with` block closes the file handle, which
# the bare open() call previously left open.
# NOTE: pickle.load can execute arbitrary code; only load a comments.pkl
# that was written by this notebook.
with open('comments.pkl', 'rb') as f:
    w_comments = pickle.load(f)

In [None]:
# Show the first reloaded record as a sanity check.
w_comments[0]

In [None]:
# Count the records that belong to USA pages and collect their distinct usernames.
# NOTE(review): `times` is created here but nothing is ever added to it, so
# the min/max cell that follows only sees an empty set. The intent was
# presumably to collect each record's 'time'; confirm the time format first.
authors = set()
times = set()
cnt = 0
for x in w_comments:
    if x['page'] not in page_names_usa:
        continue
    authors.add(x['username'])
    cnt += 1

In [None]:
# Print the smallest and largest values found in `times`.
# 0 and 20221212 are the starting bounds, so they are also what gets printed
# when `times` is empty (20221212 is presumably a YYYYMMDD date; TODO confirm).
maxtime = max([0, *times])
mintime = min([20221212, *times])
print(mintime, maxtime)

In [None]:
# Number of distinct usernames collected in `authors`.
len(authors)

In [None]:
# Value of the `cnt` counter.
cnt