In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from facebook_scraper import get_posts
import re
import math
import json
import time

# Data Scraping

In [2]:
# define a function to scrape facebook posts
def fb_scrape(page_names, fetch_posts=None):
    """Collect posts from several Facebook pages into a list of flat records.

    Parameters
    ----------
    page_names : iterable of str
        Page identifiers. Each one is passed to the fetcher and stored in the
        "Account" field of every record produced for that page.
    fetch_posts : callable, optional
        Called as ``fetch_posts(page_name)`` and expected to return an
        iterable of dict-like posts. Defaults to the ``get_posts`` function
        imported at the top of the notebook.

    Returns
    -------
    list of dict
        One dict per post with the keys "Text", "Likes", "Comments",
        "Shares", "Account" and "Subscribers" (in that order). A field is
        ``None`` when the post does not provide the corresponding key.
    """
    if fetch_posts is None:
        # Resolved lazily so the default always refers to the imported scraper.
        fetch_posts = get_posts

    posts = []
    for page_name in page_names:
        for post in fetch_posts(page_name):
            # .get() instead of post[...]: a post lacking one of these keys
            # must not abort the whole scrape with a KeyError.
            # TODO confirm which keys the scraper actually supplies
            # (in particular 'subscribers').
            posts.append({
                "Text": post.get('text'),
                "Likes": post.get('likes'),
                "Comments": post.get('comments'),
                "Shares": post.get('shares'),
                "Account": page_name,
                "Subscribers": post.get('subscribers'),
            })
    return posts

In [3]:
# srhr_df = pd.read_excel('data/SRHR social media handel .xlsx')
# srhr_df.head()

In [4]:
# fb_page_names = srhr_df['Facebook'].dropna()
# fb_page_names = list(fb_page_names)
# fb_page_names = [fb_page_name.replace('-', '').replace(' ', '').replace('@', '') for fb_page_name in fb_page_names]
# fb_page_names

In [5]:
# Facebook page identifiers (SRH and GBV related) that will be scraped
fb_page_names = [
    'SRHMJournal',
    'FSRH.UK',
    'mmsa.scora',
    'BMJ.SRH',
    'BLreproductivehealth',
    'actioncanadaSHR',
    'GBVPrevNetwork',
    'StopGBVatWork',
    '16DaysCampaign',
    'SayNO.UNiTE',
]

In [6]:
# scrape facebook posts related to SRH and GBV
# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments while the scrape is running
start = time.perf_counter()
posts = fb_scrape(fb_page_names)
end = time.perf_counter()
print(f"data scraping takes: {(end - start)} seconds")

# persist the scraped records as CSV (without the row index) and show the
# resulting (rows, columns) as the cell output
fb_posts_df = pd.DataFrame(posts)
fb_posts_df.to_csv('data/fb_posts.csv', index=False)
fb_posts_df.shape

In [7]:
# reload the scraped posts from the CSV file and preview the first rows
df = pd.read_csv("data/fb_posts.csv")
df.head()

Unnamed: 0,Comments,Likes,Shares,Text
0,0,0,0,"""Unmet need for #contraception was 34% for the..."
1,1,3,0,The handbook challenges readers to consider #m...
2,0,0,0,"""In #Tanzania, 21.7% of women desire to delay ..."
3,0,3,0,This article focuses on barriers #women face a...
4,0,0,0,New research article now online: Measuring unm...


In [8]:
# (number of rows, number of columns) of the loaded posts
df.shape

(386, 4)

In [None]:
# Label posts by engagement relative to the page's subscriber count.
# Requires a "Subscribers" column in df.
# NOTE(review): this cell overwrites the raw Comments/Likes/Shares counts with
# 0/1 flags; later cells that read df['Likes'] will see the flags, not counts.
engagement_cols = ["Comments", "Likes", "Shares"]

# A subscriber count of 0 would produce inf/NaN rates; treat it as missing.
subscribers = df["Subscribers"].replace(0, np.nan)

# axis=0 divides every row by that row's subscriber count. A plain
# `frame / series` would align the Series index with the column labels instead.
rates = df[engagement_cols].div(subscribers, axis=0)

# 1 where a post's rate is at least the column mean, else 0. The means are
# computed once; missing rates compare as False and therefore become 0.
df[engagement_cols] = rates.ge(rates.mean(), axis="columns").astype(int)

# axis=1 averages the three flags per row: label 1 when at least half of the
# engagement signals are above average.
df["label"] = df[engagement_cols].mean(axis=1).ge(0.5).astype(int)

df.head()

# Data Cleaning

## Drop Amharic posts

In [9]:
# Keep only the posts whose text contains no character in the range
# U+1200-U+137F (Ethiopic script, used to write Amharic)
ethiopic_chars = re.compile(r"[\u1200-\u137F]+")
has_no_ethiopic = df['Text'].apply(lambda txt: ethiopic_chars.search(str(txt)) is None)
df = df[has_no_ethiopic]
df.sample(5)

Unnamed: 0,Comments,Likes,Shares,Text
83,0,1,0,"WED, DEC 16, 2020\nSexual Health Campaign Work..."
6,0,3,0,Comprehensive sexuality education is fundament...
63,0,3,0,A very Happy Holidays from the FSRH staff! A h...
327,1,9,0,"At Global 16 Days Campaign, we believe that a ..."
134,0,2,0,"New in our SRH Clinical Consult series:\n\n""Se..."


## Add feature columns to the DataFrame

In [10]:
# add a 'keyword' feature column listing every known keyword found in the text

# `with` closes the file handle once the JSON is parsed; the encoding is given
# explicitly so non-ASCII keywords do not depend on the platform default
with open("data/keywords.json", encoding="utf-8") as f:
    keywords = json.load(f)

# flatten the per-category keyword lists into a single list
keyword_list = []
for list_ in keywords.values():
    keyword_list.extend(list_)

# lower-case every keyword once instead of once per row
lowered_keywords = [(str(keyword), keyword.lower()) for keyword in keyword_list]


def _matching_keywords(txt):
    """Return the comma-joined keywords that occur (case-insensitively) in txt."""
    text_lower = str(txt).lower()
    return ','.join(original for original, lowered in lowered_keywords
                    if lowered in text_lower)


# df is a row-filtered frame at this point; assign() returns a new frame, so
# the new column is not written into a slice of another DataFrame
df = df.assign(keyword=df['Text'].apply(_matching_keywords))
df.sample(5)

Unnamed: 0,Comments,Likes,Shares,Text,keyword
40,0,2,0,Our clinical statement provides guidance on co...,"Sexual health,sex"
365,0,38,0,"Gender-based violence, including rape and earl...","Gender-based violence,rape,forced marriage"
338,0,8,0,A 2019 undercover investigation by BBC News Ar...,content
2,0,0,0,"""In #Tanzania, 21.7% of women desire to delay ...",Contraception
202,0,2,0,We are ready for the review of Canada under th...,"Sexual health,sex,Gender-based violence"


In [11]:
# # add features columns to the dataframe
# f = open("data/keywords.json")
# keywords = json.load(f)

# for (key, value_list) in keywords.items():
#     df[key] = df['Text'].apply(lambda x: ','.join([str(value) for value in value_list if (value.lower() in str(x).lower())]))
# df.sample(5)

In [12]:
# keep only the posts that matched at least one keyword
matched_any_keyword = df['keyword'].ne('')
df = df.loc[matched_any_keyword]
df.shape

(224, 5)

In [13]:
# summary statistics of the Likes column (the next cell uses its mean as the label threshold)
df['Likes'].describe()

count    224.000000
mean      12.321429
std       25.755391
min        0.000000
25%        1.000000
50%        3.000000
75%       12.000000
max      214.000000
Name: Likes, dtype: float64

In [14]:
# label the rows based on the likes value: 1 when a post has at least the
# mean number of likes, otherwise 0
mean = df['Likes'].mean()
# df is a row-filtered frame here; assign() builds a new frame instead of
# setting a column on a slice. The vectorised comparison gives the same 0/1
# values as a row-by-row check (missing values compare as False -> 0).
df = df.assign(label=(df['Likes'] >= mean).astype(int))
df.sample(5)

Unnamed: 0,Comments,Likes,Shares,Text,keyword,label
123,0,1,0,Estimating the market size for a dual preventi...,"content,Contraception",0
143,0,0,0,Anxiety about separation from their partner du...,"Reproductive health,sex",0
276,4,18,0,NAMIBIA RATIFIED C190! Becoming the third coun...,"sex,Gender-based violence",1
218,2,16,0,We're excited to announce the winners of the 2...,"Sexual health,sex",1
4,0,0,0,New research article now online: Measuring unm...,Contraception,0


In [15]:
# drop the raw engagement columns (Comments, Likes, Shares); every other
# column, including Text, is kept.
# errors='ignore' keeps the cell a no-op for columns that are already absent.
df = df.drop(columns=['Comments', 'Likes', 'Shares'], errors='ignore')
df.sample(5)

Unnamed: 0,Text,keyword,label
228,Did you know that you can now text the Access ...,"Sexual health,sex",0
189,REMINDER: we have our HIV event tonight!! Chec...,HIV,0
205,We are wrapping up 2020 with an historic victo...,Abortion,1
176,"🎉 Meet the Committee 🎉 The final post!\n\n""Hi!...","Reproductive health,Abortion",0
7,Last chance!! Submit your paper before January...,"Reproductive health,sex",0


In [16]:
# write the cleaned, labelled posts to disk
# NOTE(review): unlike the raw export earlier in this notebook, index=False is
# not passed here, so the row index is written as an extra unnamed first
# column - confirm this is intended
df.to_csv('data/cleaned_fb_posts.csv')

# Generate sample data 

In [17]:
# # load the json keywords
# f = open("data/keywords.json")
# keywords = json.load(f)
# # keywords

In [18]:
# socialMedia = ['social media used', 'digital platforms used', 'content', 'discussion points',
#               'frequency of use', 'number of users', 'users’ profile', 'popularity']
# youth = ['Youth', 'Young people', 'Adolescent', 'Youth',
#         'Youth', 'Young people', 'Adolescent', 'Youth']
# GBV = ['Gender-based violence', 'rape', 'sexual harassment', 'Gender-based violence',
#       'forced marriage', 'Gender-based violence', 'rape', 'sexual harassment']
# label = [1, 1, 0, 0, 1, 0, 1, 1]

# df = pd.DataFrame({'socialMedia': socialMedia, 'youth': youth, 'GBV': GBV, 'label': label})
# df

# Exploratory Analysis

In [19]:
# def eda_categorical(data, variable, ax=None):
#     variable_df = data[variable].value_counts(normalize=True).reset_index()
#     n_colors = len(variable_df)
#     variable_df.set_index('index').T.plot(kind='barh',
#                                         stacked=True,
#                                         colormap=ListedColormap(sns.color_palette("Set2", n_colors)),
#                                         width=0.15, ax=ax)

# def multiple_eda_categorical(data, list_categorical):
#     n_rows = math.ceil(len(list_categorical)/2)
#     fig = plt.figure(figsize=(12,n_rows*3))

#     for i, variable in enumerate(list_categorical):
#         ax = fig.add_subplot(n_rows,2,i+1)
#         eda_categorical(data, variable, ax=ax)
    
#     plt.tight_layout()
#     plt.show()

# def multiple_eda_continuous(data, list_continuous):
#     n_rows = math.ceil(len(list_continuous)/3)
#     fig = plt.figure(figsize=(12,n_rows*5))
#     palette = sns.color_palette('Set2', 3)

#     for i, variable in enumerate(list_continuous):
#         ax = fig.add_subplot(n_rows,3,i+1)
#         sns.boxplot(x=variable, data=data, orient='v', palette=[palette[i]], ax=ax)
#         ax.set_ylabel('')
#         ax.set_title(variable)

#     plt.tight_layout()
#     plt.show()

In [20]:
# from pandas.api.types import is_numeric_dtype

# continuous, categorical = [], []

# for col in df.columns[df.columns != 'label']:
#     if is_numeric_dtype(df[col]):
#         continuous.append(col)
#     else:
#         categorical.append(col)

In [21]:
# class balance of the labelled dataset
n = len(df)
# Series.sum() instead of the builtin sum(); int() gives a plain Python integer
n_positive = int(df['label'].sum())
n_negative = n - n_positive
# guard the percentage against an empty frame (every row may have been filtered out)
pct_positive = n_positive * 100 / n if n else 0.0

print('Dataset contains %d records' % (n))
print('Number of post which gets a positive feadback: %d (%0.2f%%)' % (n_positive, pct_positive))

Dataset contains 224 records
Number of post which gets a positive feadback: 54 (24.11%)


In [22]:
# multiple_eda_categorical(df, categorical)