In [None]:
# import the necessary packages
import json
import re
import time

import numpy as np
import pandas as pd
import seaborn as sns
from facebook_scraper import get_posts

from AttributeRelevance import *
from Features import *

# Part - 1: Data Collection

In [None]:
# define a function to scrape facebook posts
def scrape_facebook(accounts):
    """Scrape the posts of the given Facebook pages into a list of records.

    Parameters
    ----------
    accounts : iterable of str
        Page names; each one is passed to ``get_posts``.

    Returns
    -------
    list of dict
        One dict per post with the keys ``Text``, ``Likes``, ``Comments``,
        ``Shares``, ``Account`` and ``Subscribers``. An empty ``accounts``
        yields an empty list.

    Raises
    ------
    KeyError
        If a post lacks one of ``text``, ``likes``, ``comments`` or ``shares``.
    """
    posts = []
    for account in accounts:
        for post in get_posts(account):
            posts.append({
                "Text": post['text'],
                "Likes": post['likes'],
                "Comments": post['comments'],
                "Shares": post['shares'],
                "Account": account,
                # TODO - get subscribers number for a given account.
                # Until that is implemented the field may be absent from the
                # scraped post, so .get() stores None instead of raising
                # KeyError and aborting the whole scrape.
                # NOTE(review): assumes each post is a dict - confirm.
                "Subscribers": post.get('subscribers'),
            })
    return posts

In [None]:
# Facebook pages to be scraped, which are related to SRH and GBV
fb_page_names = ['SRHMJournal', 'FSRH.UK', 'mmsa.scora', 'BMJ.SRH', 
                 'BLreproductivehealth', 'actioncanadaSHR', 'GBVPrevNetwork', 
                 'StopGBVatWork', '16DaysCampaign', 'SayNO.UNiTE']

In [None]:
# scrape facebook posts related to SRH and GBV
# perf_counter() is a monotonic clock meant for measuring intervals, so the
# reported duration cannot be skewed by system clock adjustments.
start = time.perf_counter()
posts = scrape_facebook(fb_page_names)
end = time.perf_counter()
print(f"data scraping takes: {(end - start)} seconds")

# one row per scraped post; preview the first rows and the overall shape
fb_posts_df = pd.DataFrame(posts)
print(fb_posts_df.head())
print(fb_posts_df.shape)

In [None]:
# TODO - join the scraped dataframes

In [None]:
# Persist the raw scraped posts; index=False keeps the row index out of the CSV.
fb_posts_df.to_csv('../data/fb_posts.csv', index=False)

# Part - 2: Data Cleaning and Transforming

## Text cleaning

In [None]:
# No earlier cell defines `df`, so start Part 2 from the raw posts that Part 1
# saved to disk. This also lets the cleaning run without re-scraping.
df = pd.read_csv('../data/fb_posts.csv')

# drop amharic posts: a post is dropped when its text contains at least one
# character from the Ethiopic Unicode block (U+1200-U+137F).
ethiopic_pattern = re.compile(r"[\u1200-\u137F]")
has_ethiopic = df['Text'].apply(lambda txt: bool(ethiopic_pattern.search(str(txt)))).astype(bool)
# copy() so the cells below can add columns without SettingWithCopyWarning
df = df[~has_ethiopic].copy()
df.head()

In [2]:
# TODO - content based cleaning 
# define keyword group count
def keyword_group_count(txt, keyword_groups=None):
    """Count how many keyword groups have at least one keyword in ``txt``.

    Matching is a case-insensitive substring test. Each group contributes at
    most 1 to the count, however many of its keywords occur.

    Parameters
    ----------
    txt : object
        Post text. It is converted with ``str()`` so non-string cells
        (e.g. NaN) do not raise.
    keyword_groups : mapping, optional
        Group name -> iterable of keywords. When omitted, the notebook-level
        ``keywords`` dict is used; it must be defined before the call.

    Returns
    -------
    int
        Number of groups with at least one matching keyword.
    """
    if keyword_groups is None:
        keyword_groups = keywords
    # lower-case the text once instead of once per keyword
    lowered_txt = str(txt).lower()
    return sum(
        any(str(keyword).lower() in lowered_txt for keyword in keyword_list)
        for keyword_list in keyword_groups.values()
    )

# add keyword group count columns
# assign() returns a new frame, which avoids pandas' SettingWithCopyWarning
# when `df` is itself the result of an earlier row filter.
df = df.assign(keyword_group_count=df["Text"].apply(keyword_group_count))

# keep only the posts that mention at least 5 keyword groups; copy() so the
# cells below can add columns to the filtered frame safely
df = df[df["keyword_group_count"] >= 5].copy()
df.head()

## Labeling

In [None]:
# label the posts based on the mean of comments, likes, and shares
engagement_cols = ["Comments", "Likes", "Shares"]

# Normalise every count by the subscriber number of its row. div(..., axis=0)
# aligns the Subscribers Series with the rows; the plain `/` operator would
# align it with the column labels and produce only NaN.
engagement_rates = df[engagement_cols].div(df["Subscribers"], axis=0)

# Flag a value with 1 when it reaches the mean of its column, else 0.
# The column means are computed once, not once per row; NaN rates compare
# False and therefore become 0.
df[engagement_cols] = engagement_rates.ge(engagement_rates.mean()).astype(int)

# axis=1 averages the three flags of each row; a post is labelled 1 when that
# average is at least 0.5, i.e. when at least two of the three flags are set.
df["label"] = df[engagement_cols].mean(axis=1).ge(0.5).astype(int)

df.head()

## Add sub-topics columns

In [None]:
# add content_size column
def classifiy_content_size(content_size, quantieles):
    """Bucket a text length as ``"short"``, ``"medium"`` or ``"long"``.

    Parameters
    ----------
    content_size : number
        Length of the text.
    quantieles : sequence of numbers
        Thresholds. Only index 0 (upper bound of "short", inclusive) and
        index 2 (lower bound of "long", inclusive) are read.

    Returns
    -------
    str
        ``"short"`` if ``content_size <= quantieles[0]``, otherwise ``"long"``
        if ``content_size >= quantieles[2]``, otherwise ``"medium"``. The
        "short" test is evaluated first, so it wins when both hold.
    """
    short_upper_bound = quantieles[0]
    long_lower_bound = quantieles[2]
    if content_size <= short_upper_bound:
        return "short"
    if content_size >= long_lower_bound:
        return "long"
    # was misspelled "meduim", which ended up as a category value in the data
    return "medium"

# extract text content size
# astype(str) mirrors the str(txt) handling used when the posts were filtered,
# so a non-string cell (e.g. NaN) does not raise the way len() would.
text_lengths = df["Text"].astype(str).str.len()

# 25%, 50%, 75% and 100% quantiles of the length distribution
(q1, q2, q3, q4) = text_lengths.quantile([0.25, 0.5, 0.75, 1.0])

# replace the numeric length with its short / medium / long bucket
df["content_size"] = text_lengths.apply(lambda content_size: classifiy_content_size(content_size, [q1, q2, q3, q4]))
df.head()

In [None]:
# add sub_topics columns
# TODO - find the relevant sub_topics
sub_topics = {
    "sub_topics1": [],
    "sub_topics2": [],
    "sub_topics3": [],
}

# For every sub-topic group, store the comma-joined candidates that occur
# (case-insensitively) in the post text; a post without any match gets ''.
for sub_topic_name, sub_topic_list in sub_topics.items():
    def _join_matching_sub_topics(txt, candidates=sub_topic_list):
        lowered_txt = str(txt).lower()
        hits = [str(candidate) for candidate in candidates if candidate.lower() in lowered_txt]
        return ','.join(hits)

    df[sub_topic_name] = df['Text'].apply(_join_matching_sub_topics)

df.head()

In [None]:
# Transform each element of a list-like to a row, replicating index values.
# Every sub-topic column holds a comma-joined string of matches: split it into
# a list and explode it so each matched sub-topic gets its own row. A post
# without matches holds '' and stays as one row, because ''.split(',') == [''].
for sub_topic_name in sub_topics.keys():
    df[sub_topic_name] = df[sub_topic_name].apply(lambda x: x.split(','))
    df = df.explode(sub_topic_name)

# explode() repeats the index labels; replace them with a fresh 0..n-1 index
df = df.reset_index(drop=True)
df.head()

In [None]:
# Persist the cleaned posts. index=False matches how the raw posts are saved
# and keeps the row index out of the CSV (it would come back as an extra
# "Unnamed: 0" column on read).
df.to_csv('../data/cleaned_fb_posts.csv', index=False)

# Part - 3: Data Analysis

## Convert sub-topics to bins

In [None]:
# Wrap every column except 'label' and 'Text' in a CategoricalFeature,
# keyed by the column name.
feats_dict = {}
for col in df.columns:
    if col in ('label', 'Text'):
        continue
    feats_dict[col] = CategoricalFeature(df, col)

feats = [feats_dict[name] for name in feats_dict]

# show df_lite of the feature named by `sub_topic_name`, i.e. the value that
# variable was left with by the last loop over the sub-topics
feats_dict[sub_topic_name].df_lite

## Calculate woe and iv

In [None]:
# Create the IV and AttributeRelevance objects used by the cells below.
# NOTE(review): both classes presumably come from the star imports at the top
# of the notebook - confirm.
iv = IV()
ar = AttributeRelevance()

In [None]:
# There is no 'keyword' feature in feats_dict; use the sub-topic feature named
# by `sub_topic_name` (the value left by the last loop over the sub-topics),
# the same feature that is displayed when feats_dict is built.
# NOTE: this rebinds `df` from the cleaned posts to the first value returned
# by calculate_iv; the final "save" cell selects its columns from this `df`.
df, df_sum = iv.calculate_iv(feats_dict[sub_topic_name])
print(df.head())
print("iv sum: ", df_sum)

In [None]:
# Run the attribute-relevance analysis over all feature objects, passing the
# IV object as the calculator.
# NOTE(review): interpretation=True presumably adds an interpretation of the
# IV values to the result - confirm against AttributeRelevance.analyze.
ar.analyze(feats, iv, interpretation=True)

## Visualize woe and iv

In [None]:
# One 'woe' visualisation per sub-topic feature.
for sub_topic_name in sub_topics:
    sub_topic_feature = feats_dict[sub_topic_name]
    iv.visualize(sub_topic_feature, 'woe')

In [None]:
# One 'iv' visualisation per sub-topic feature.
for sub_topic_name in sub_topics:
    sub_topic_feature = feats_dict[sub_topic_name]
    iv.visualize(sub_topic_feature, 'iv')

## Save the woe and iv

In [None]:
# `sub_topics_name` was never defined; the loop variable used throughout the
# notebook is `sub_topic_name`.
# NOTE(review): assumes `df` is the table returned by calculate_iv and that it
# has a column named after the feature plus 'woe' and 'iv' - confirm.
df = df[[sub_topic_name, 'woe', 'iv']]

# write next to the other outputs of this notebook, which all go to ../data/
df.to_csv('../data/woe_iv_data.csv')