# Load Data

In [None]:
import sys
sys.path.append("..")
from src import tools
from ipywidgets import interact, interact_manual
import ipywidgets as widgets
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas_profiling
%matplotlib inline


def absoluteFilePaths(directory):
    """Yield the absolute path of every non-hidden file below *directory*.

    The tree is walked recursively with ``os.walk``; files whose name starts
    with a dot are skipped.
    """
    for parent, _subdirs, names in os.walk(directory):
        visible = (name for name in names if not str(name).startswith('.'))
        for name in visible:
            yield os.path.abspath(os.path.join(parent, name))


FOLDER = "../outputs/fb/summary/"


def load_data(folder):
    """Read every CSV under *folder* into a dict of DataFrames keyed by user.

    Args:
        folder: Directory scanned recursively via ``absoluteFilePaths``.

    Returns:
        dict mapping the ``user`` value found in the first row of each file
        to the DataFrame read from that file. When two files share the same
        user, the one read last wins. Files without any data row are skipped.
    """
    data = {}
    for path in absoluteFilePaths(folder):
        df = pd.read_csv(path)
        # A header-only file has no first row to take the user from; skip it
        # rather than fail with an IndexError on .iloc[0].
        if df.empty:
            continue
        data[df.user.iloc[0]] = df
    return data


# Load every summary CSV found under FOLDER and list the users they belong to.
data = load_data(FOLDER)
print(data.keys())

# Clean Data

## Drop columns, set datetimeindex, fill missing data and generate statistics.

In [None]:
# Date bounds used further down to slice each user's frame
# (``df[end_date:start_date]``).
start_date = '2019-10-15'
end_date = '2020-02-06'

def clean(df):
    """Return the rows of *df* that have a source, limited to the analysis columns.

    Rows whose ``source`` is null are dropped and only the columns listed
    below are kept, in that order. The input frame is not modified.
    """
    kept_columns = [
        'impressionOrder', 'id', 'nature',
        'source', 'fblinktype', 'texts',
        'textsize', 'timeline', 'user',
    ]
    has_source = df['source'].notna()
    return df.loc[has_source, kept_columns]

# Per-user summary table; one row is written for each user in the loop below.
stats = pd.DataFrame(columns=['start', 'end',
                              'average_impression_count',
                              'timelines',
                              'total_entries',
                              'missing values'
                              ])

# Maps the Facebook page URL stored in `sourceLink` to the display name used
# in `source`. Defined once, outside the loop, because it never changes.
# AUTOMATE THIS
link_source_dict = {
    'https://www.facebook.com/AgenziaANSA/': 'ANSA.it',
    'https://www.facebook.com/AgenziaAdnKronos/': 'Adnkronos',
    'https://www.facebook.com/corrieredellasera/': 'Corriere della Sera',
    'https://www.facebook.com/giorgiameloni.paginaufficiale/': 'Giorgia Meloni',
    'https://www.facebook.com/HuffPostItalia/': 'HuffPost Italia',
    'https://www.facebook.com/ilfoglio/': 'Il Foglio',
    'https://www.facebook.com/ilGiornale/': 'Il Giornale',
    'https://www.facebook.com/italiaviva/': 'Italia Viva',
    'https://www.facebook.com/la7fb/': 'La7',
    'https://www.facebook.com/lastampa.it/': 'La Stampa',
    'https://www.facebook.com/legasalvinipremier/': 'Lega - Salvini Premier',
    'https://www.facebook.com/liberonews/': 'Libero',
    'https://www.facebook.com/LuigiDiMaio/': 'Luigi Di Maio',
    'https://www.facebook.com/matteorenziufficiale/': 'Matteo Renzi',
    'https://www.facebook.com/Messaggero.it/': 'Il Messaggero.it',
    'https://www.facebook.com/partitodemocratico/': 'Partito Democratico',
    'https://www.facebook.com/rainews.it/': 'Rainews.it',
    'https://www.facebook.com/Repubblica/': 'la Repubblica',
    'https://www.facebook.com/salviniofficial/': 'Matteo Salvini'}

for user, df in data.items():
    # Remove the stray index column when present. errors='ignore' covers the
    # "column missing" case explicitly instead of hiding every exception
    # behind a bare except.
    df = df.drop(columns='Unnamed: 0', errors='ignore')

    df = df[['impressionTime', 'impressionOrder',  'id',
             'nature',  'postId', 'publicationTime', 'permaLink',
             'source', 'sourceLink', 'fblinktype',
             'texts', 'textsize', 'images',
             'semanticCount', 'semanticId', 'timeline', 'user']]

    df.index = df['impressionTime']
    df.index = pd.to_datetime(df.index)
    # The slice runs from end_date to start_date; presumably each CSV is
    # ordered newest-first — TODO confirm. The copy makes the column
    # assignments below act on an independent frame rather than on a slice.
    df = df[end_date:start_date].copy()

    df.publicationTime = pd.to_datetime(df.publicationTime)
    df.impressionOrder = pd.to_numeric(df.impressionOrder, downcast='integer')

    # Treat the literal string "nan" and empty strings as missing values.
    df = df.replace("nan", np.nan)
    df = df.replace("", np.nan)
    df['fblinktype'] = df['fblinktype'].fillna('other')

    # When `source` is missing fall back to the page URL, then translate the
    # known URLs into their display names.
    df['source'] = df['source'].fillna(df['sourceLink'])
    df = df.replace({"source": link_source_dict})

    stats.loc[user] = [
        df.impressionTime.min(),
        df.impressionTime.max(),
        # max(df.impressionOrder),
        #min(df.groupby(['timeline'], sort=False)['impressionOrder'].max()),
        df.impressionOrder.mean(),
        df['timeline'].nunique(),
        df['id'].count(),
        df.isna().sum().sum()
    ]
    data[user] = df.copy()

# Stack all users into one frame and keep only the analysis columns.
df = pd.concat(data.values())

df = clean(df)
stats

## Replace pseudo-names with "real" user names

In [None]:
# Map the user value stored in each summary CSV to the name listed in the
# users config, then rewrite df['user'] with those names.
user_names = pd.read_csv("../config/users.csv")

# Each config row unpacks as (name, number, token); the token names the
# user's CSV, whose first `user` value becomes the dictionary key.
names_dict = {
    pd.read_csv('../outputs/fb/summary/'+token+'.csv', nrows=2).user[0]: name
    for name, _number, token in user_names.values
}

df = df.replace({"user": names_dict})

In [None]:
# data['parsnip-lentils-fudge'].drop('impressionTime',1).profile_report(check_correlation_cramers=False,
#                  check_correlation_pearson=False,
#                  cardinality_threshold=len(df)*0.9,
#                  style={'full_width':True}
#                                          )

# Explore

## Profile Report (takes a while)

In [None]:
#df.profile_report(check_correlation_cramers=False,
#                          check_correlation_pearson=False,
#                          cardinality_threshold=len(df)*0.9,
#                          style={'full_width': True})

## Post Count per column

In [None]:
def postCountGraph(df, user, column):
    """Draw a bar chart of how many *column* entries were collected per day.

    Args:
        df: Frame passed through ``tools.setDatetimeIndex`` and then
            ``tools.setDatetimeIndexFloor(..., what="1D")``.
        user: Label appended to the chart title.
        column: Name of the column whose non-null entries are counted.

    Returns:
        The ``(fig, ax)`` pair of the created figure.
    """
    indexed = tools.setDatetimeIndex(df)
    per_day = tools.setDatetimeIndexFloor(indexed, what="1D")
    daily_counts = per_day.groupby(per_day.index)[column].count().to_frame()

    fig, ax = plt.subplots(figsize=(15, 7))

    # Tick positions are chosen automatically; labels look like "Nov 05".
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))

    chart_title = ('Number of entries in col ' + column
                   + ' collected per day for: ' + user)
    ax.set_title(chart_title)
    ax.bar(daily_counts.index, daily_counts[column])
    return fig, ax


#fig, ax = (dailyPostCountGraph(df, user))

@interact
def show_plot(user=df['user'].unique(), column=df.columns):
    """Interactive per-day entry count for one user and one column.

    Args:
        user: One of the values of ``df['user']``.
        column: One of the columns of ``df``.

    Returns:
        The ``(fig, ax)`` pair produced by ``postCountGraph``.
    """
    # `data` is keyed by the user value read from each CSV, whereas
    # df['user'] has been rewritten through names_dict. Translate the
    # selected name back to its original key so the lookup cannot raise a
    # KeyError; names that were never translated fall through unchanged.
    original_key = {name: key for key, name in names_dict.items()}
    return postCountGraph(data[original_key.get(user, user)], user, column)

## Post Count per Dates

In [None]:
def datesPostCountGraph(df, user):
    """Draw a bar chart of the number of posts collected per day.

    Args:
        df: Frame whose index is floored to "1D" with
            ``tools.setDatetimeIndexFloor``; its non-null ``id`` values are
            counted per index value.
        user: Label appended to the chart title.

    Returns:
        The ``(fig, ax)`` pair of the created figure.
    """
    per_day = tools.setDatetimeIndexFloor(df, what="1D")
    daily_counts = per_day.groupby(per_day.index).id.count().to_frame()

    fig, ax = plt.subplots(figsize=(15, 7))

    # Tick positions are chosen automatically; labels look like "Nov 05".
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))

    chart_title = 'Number of posts collected per day for user: ' + user
    ax.set_title(chart_title)
    ax.bar(daily_counts.index, daily_counts.id)
    return fig, ax


#fig, ax = (dailyPostCountGraph(df, user))

@interact
def show_plot(user=df['user'].unique(), start_date="2019-10-12", end_date="2020-01-03"):
    """Interactive per-day post count for one user within a date window."""
    user_rows = df[df['user'] == user]
    # The slice runs from end_date to start_date; presumably the index is in
    # descending time order — TODO confirm.
    window = user_rows.loc[end_date:start_date]
    return datesPostCountGraph(window, user)

## How many total entries would I have if I excluded the days that have fewer than n entries?

In [None]:
def dailyPostCount(df):
    """Count the entries collected per (day, user) pair.

    The index of *df* is floored to "1D" with ``tools.setDatetimeIndexFloor``
    and the non-null ``id`` values are counted for each (index value, user)
    group.

    Returns:
        A one-column DataFrame (``id``) indexed by (day, user).
    """
    per_day = tools.setDatetimeIndexFloor(df, what="1D")
    grouped = per_day.groupby([per_day.index, per_day.user])
    return grouped.id.count().to_frame()


count = dailyPostCount(df)

# For each threshold i, the total number of entries that remain when only the
# (day, user) groups with more than i entries are kept. The entries are
# summed; taking len() of the filtered frame would give the number of groups
# left, not the number of posts. The column is built in one pass instead of
# growing the frame one row at a time.
daily_counts = count['id']
where_to_cut = pd.DataFrame(
    {'total_posts': [int(daily_counts[daily_counts > i].sum())
                     for i in range(0, 400)]})
where_to_cut.plot(title='Total posts collected against number of entries per')

## Generate stats (missing values, timeframes, timelines..) per user

In [None]:
# Fresh, empty per-user summary table; getStats() writes one row per user
# into this module-level frame.
stats = pd.DataFrame(columns=['start', 'end',
                              'average_impression_count',
                              'timelines',
                              'total_entries',
                              'missing values'
                              ])


def getStats(df):
    """Fill the module-level ``stats`` table with one summary row per user.

    For every distinct value of ``df.user`` the row holds, in order: the
    earliest and latest index value, the mean ``impressionOrder``, the number
    of distinct timelines, the count of non-null ``id`` values and the total
    number of missing cells.

    Returns:
        The module-level ``stats`` frame (modified in place; rows written by
        earlier calls are kept unless overwritten).
    """
    for user in df.user.unique():
        user_rows = df[df['user'] == user]

        first_seen = user_rows.index.min()
        last_seen = user_rows.index.max()
        mean_order = user_rows.impressionOrder.mean()
        n_timelines = user_rows['timeline'].nunique()
        n_entries = user_rows['id'].count()
        n_missing = user_rows.isna().sum().sum()

        stats.loc[user] = [first_seen, last_seen, mean_order,
                           n_timelines, n_entries, n_missing]
    return stats


# Compute the per-user summary for the current `df` and display it.
stats = getStats(df)

stats

## Distribution of total entries collected per timeline

In [None]:
# Histogram of the highest impressionOrder reached in each timeline.
df.groupby(['timeline'])['impressionOrder'].max().plot.hist(bins=16)

In [None]:
# Build an hourly summary: which users and which timelines appear in each hour.
# NOTE(review): `result` is the same object as `df`, so flooring its index
# also floors df's index to the hour for every later cell — confirm intended.
result = df
result.index = result.index.floor('H')
names = result.groupby(df.index).user.unique()
timelines = result.groupby(df.index).timeline.unique()
result = pd.DataFrame(names)
result.columns = ['names']
# .str.len() returns the number of elements in each array of unique values.
result['users_count'] = result['names'].str.len()
result['timelines'] = timelines
result['timelines_count'] = result['timelines'].str.len()
display(result)

# result.users_count.plot.hist(bins=5)
result.timelines_count.plot.hist(bins=31)

## Distribution of number of timelines per hour

In [None]:
# Histogram of the per-hour number of distinct timelines.
result.timelines_count.plot.hist(
    bins=31, title='distribution of number of timelines  collected per hour')

## Distribution of number of active (collecting data) users per hour

In [None]:
# Histogram of the per-hour number of distinct users.
result.users_count.plot.hist(
    bins=20, title='distribution of number of users active per hour')

## How many hours of data do I get if I only select hours where at least n users were collecting data?

In [None]:
# Number of hourly rows that satisfy each (users, timelines) requirement.
users_per_hour = result['users_count']
timelines_per_hour = result['timelines_count']

hour_filters = [
    ('timelines seen by all users: ',
     (users_per_hour == 20) & (timelines_per_hour >= 20)),
    ('timelines seen by at least 16 users: ',
     (users_per_hour >= 16) & (timelines_per_hour >= 16)),
    ('timelines seen by at least 10 users: ',
     (users_per_hour >= 10) & (timelines_per_hour >= 10)),
]

for label, hour_mask in hour_filters:
    print(label + str(len(result[hour_mask])) + ' hours')

print('all data: ' + str(len(result)) + ' hours')

## How many hours per day have been collected by all the users?

### All data in timeframe

In [None]:
# Hours in which at least 16 users appear, between 2019-11-01 and 2019-12-31.
trimmed = result[result['users_count'] >= 16]
trimmed = trimmed.loc['2019-11-01':'2019-12-31']
#trimmed = trimmed.loc['2019-12-31':'2019-11-01']

display(trimmed)
# Number of such hours per calendar day.
trimmed.users_count.groupby(trimmed.index.floor(
    'D')).count().plot.bar(title='All data in timeframe')

### Pre-polarisation

In [None]:
# Hours with at least 20 users between 2019-11-02 and 2019-11-19, counted per
# day. NOTE(review): the sibling period cells filter with `== 20` — the two
# agree only if 20 is the maximum number of users; confirm.
trimmed = result[(result['users_count'] >= 20)]
trimmed = trimmed.loc['2019-11-02':'2019-11-19']
trimmed.users_count.groupby(trimmed.index.floor(
    'D')).count().plot.bar(title='Pre-polarisation')

### During-polarisation

In [None]:
# Hours with exactly 20 users between 2019-11-20 and 2019-12-05, counted per day.
trimmed = result[(result['users_count'] == 20)]
trimmed = trimmed.loc['2019-11-20':'2019-12-05']
trimmed.users_count.groupby(trimmed.index.floor(
    'D')).count().plot.bar(title='During-polarisation')

### Polarized without external pages

In [None]:
# Hours with exactly 20 users between 2019-12-05 and 2019-12-10, counted per
# day. count() gives the number of hours; sum() would add up users_count
# (20 for every kept hour) and inflate each bar twenty-fold.
trimmed = result[(result['users_count'] == 20)]
trimmed = trimmed.loc['2019-12-05':'2019-12-10']
trimmed.users_count.groupby(trimmed.index.floor('D')).count().plot.bar(
    title='Polarized without external pages')

### Post-polarisation and post-external-likes

In [None]:
# Hours with exactly 20 users between 2019-12-10 and 2019-12-27, counted per day.
trimmed = result[(result['users_count'] == 20)]
trimmed = trimmed.loc['2019-12-10':'2019-12-27']
trimmed.users_count.groupby(trimmed.index.floor('D')).count().plot.bar(
    title='Post-polarisation and post-external-likes')

# Preparing the Data

## Remove all the timelines that have less than 10 entries

In [None]:
# Restrict the working frame to 2019-11-01 .. 2019-12-31.
df_wip = df.loc['2019-11-01':'2019-12-31']
# Highest impressionOrder reached in each timeline.
timelines = df_wip.groupby('timeline')['impressionOrder'].max()

print('Before: {}'.format(len(df_wip)))
# Keep only timelines whose highest impressionOrder exceeds 10.
# NOTE(review): this also drops timelines that reach exactly 10 — confirm
# whether `>= 10` was intended.
df_wip = df_wip[df_wip.timeline.isin(list(timelines[timelines > 10].index))]
print('After: {}'.format(len(df_wip)))

To find the combination of users that yields the longest possible dataset for a given minimum number of users, we must:
- generate the list of all possible combinations of 16 or more users (8 per group)
- filter the dataframe by each combination and check its length
- output the top 5 combinations with their lengths

## Which combination of 16 or more users should I use to keep as much data as possible?

In [None]:
# Sort the user names within each hour, then keep the hours with at least 16
# users between 2019-11-01 and 2019-12-31.
result.names = result.names.apply(sorted)
result = result[result['users_count'] >= 16]
result = result.loc['2019-11-01':'2019-12-31']


from IPython.display import clear_output, display
import itertools


def user_combinations(trimmed=trimmed, n_users=16):
    """Count, for each combination of users, the hours in which all of them appear.

    Args:
        trimmed: Frame whose first row's ``names`` entry supplies the full
            list of user names. Defaults to the module-level ``trimmed`` as
            it was bound when this function was defined.
        n_users: Number of users in each combination.

    Returns:
        dict mapping ``str(combination)`` to the number of rows of the
        module-level ``result`` frame whose ``names`` contain every user of
        the combination, ordered by decreasing count. Combinations that match
        no row are omitted.
    """
    list_of_all_users_name = sorted(trimmed.names.iloc[0])
    print(list_of_all_users_name)

    # combinations() of a sorted list comes out in lexicographic order;
    # dict.fromkeys drops repeats (possible only if a name is listed twice)
    # while keeping that order, so ties are reported deterministically.
    combinations = list(dict.fromkeys(
        itertools.combinations(list_of_all_users_name, n_users)))
    total = len(combinations)

    # Build the per-hour name sets once instead of walking the frame with
    # iterrows() again for every combination.
    names_per_hour = [set(names) for names in result['names']]

    d = {}
    for n, combo in enumerate(combinations, start=1):
        clear_output(wait=True)
        print(str(n)+'/'+str(total))
        wanted = set(combo)
        hits = sum(1 for present in names_per_hour if wanted <= present)
        if hits:
            d[str(combo)] = hits

    user_combinations_dict = dict(
        sorted(d.items(), key=lambda item: item[1], reverse=True))

    return user_combinations_dict


#user_combinations_dict = user_combinations()
# list(user_combinations_dict.items())[0]

Now we know that the best combination of users is "('Aarend', 'Bjoke', 'Cor Timmerman', 'Doortje', 'Erwijn', 'Hanneke', 'Juultje', 'Kris', 'Lonneke', 'Martijn', 'Nienke', 'Omar', 'Phoebe', 'Quintijn', 'Stephan', 'Terese')". We want the list of timelines collected in the hours when all of these users were active, so that we can filter the dataframe and keep only those.

In [None]:
filter_lst = ['Aarend', 'Bjoke', 'Cor Timmerman', 'Doortje', 'Erwijn', 'Hanneke', 'Juultje',
              'Kris', 'Lonneke', 'Martijn', 'Nienke', 'Omar', 'Phoebe', 'Quintijn', 'Stephan', 'Terese']

# Flat list of the timelines recorded in every hour where all users of
# filter_lst appear.
timelines_to_keep = [
    timeline
    for hour_names, hour_timelines in zip(result['names'], result['timelines'])
    if all(wanted in hour_names for wanted in filter_lst)
    for timeline in hour_timelines
]

## Keep only timelines with appropriate combination of users

In [None]:
# Keep only the rows whose timeline is in timelines_to_keep and report the
# row count before and after.
print('Before: {}'.format(len(df_wip)))
in_kept_timelines = df_wip.timeline.isin(timelines_to_keep)
df_wip = df_wip.loc[in_kept_timelines]
print('After: {}'.format(len(df_wip)))

In [None]:
#df_wip = df_wip.replace({"user": names_dict})

In [None]:
# Drop the rows published by these sources (Trump included) and report the
# row count before and after.
sources_to_remove = ['Jair Messias Bolsonaro',
                     'Donald J. Trump', 'Marine Le Pen']
print('Before: {}'.format(len(df_wip)))
from_removed_source = df_wip.source.isin(sources_to_remove)
df_wip = df_wip.loc[~from_removed_source]
print('After: {}'.format(len(df_wip)))

In [None]:
# Keep only the rows belonging to these users and report the row count
# before and after.
users_to_keep = ['Bjoke', 'Cor Timmerman', 'Doortje', 'Erwijn', 'Hanneke', 'Juultje',
                 'Kris', 'Lonneke', 'Martijn', 'Nienke', 'Omar', 'Phoebe', 'Quintijn', 'Stephan']
print('Before: {}'.format(len(df_wip)))
is_kept_user = df_wip['user'].isin(users_to_keep)
df_wip = df_wip.loc[is_kept_user]
print('After: {}'.format(len(df_wip)))

In [None]:
# Save the filtered working frame to dataset.csv in the current directory.
df_wip.to_csv('dataset.csv')

# Normalize and Plot

In [None]:
from collections import defaultdict
import os
from datetime import datetime
from scipy.stats import ttest_ind
from numpy import mean
# set constants

# Sources counted on each side of the dx/sx ratio computed below.
sources_dx = ["Matteo Salvini"]
sources_sx = ["Matteo Renzi"]

# Bounds of the baseline ("pre") window: impressions between start_collection
# and start_polarization.
start_collection = '2019-11-10'
start_polarization = '2019-11-20'


# (start, end) bounds of the experiment window.
timerange = ('2019-11-30',
             '2019-12-03')


# NOTE(review): `folder` and `file_users` are not referenced in the cells
# visible here — confirm whether they are still needed.
folder = '../outputs/fb/summary'
file_users = '../config/users.csv'

In [None]:
# Only the user and the source of each row are needed for the counts below.
impressions = df_wip[['user', 'source']]

# filter out for desired time range for experiment
# NOTE(review): the bounds are bare dates, so presumably the upper one
# compares as midnight and excludes the rest of that day — confirm.


impressions_exp = impressions[(impressions.index >= timerange[0])
                              & (impressions.index <= timerange[1])]

# Baseline window: from start_collection up to start_polarization.
impressions_pre = impressions[(impressions.index <= start_polarization) & (
    impressions.index >= start_collection)]

# count specific sources per user


def count_sources(impressions, sources):
    """Count, per user, how many impressions come from each given source.

    Args:
        impressions: Frame with at least the ``user`` and ``source`` columns.
        sources: Iterable of source names to count.

    Returns:
        DataFrame with the columns ``source``, ``user`` and ``count``: one
        row per (source, user) pair, users in groupby order within each
        source. Users without a matching impression get a count of 0.
    """
    records = [
        (source,
         user,
         user_rows.loc[user_rows["source"] == source, "source"].count())
        for source in sources
        for user, user_rows in impressions.groupby('user')
    ]
    columns = {
        "source": [record[0] for record in records],
        "user": [record[1] for record in records],
        "count": [record[2] for record in records],
    }
    return pd.DataFrame(data=columns)


def users_dx_and_sx(sources_dx, sources_sx, impressions):
    """Return one row per user with the impression counts of both source groups.

    Args:
        sources_dx: Source names whose impressions are summed into ``dx``.
        sources_sx: Source names whose impressions are summed into ``sx``.
        impressions: Frame with at least the ``user`` and ``source`` columns.

    Returns:
        DataFrame with the columns ``user``, ``dx`` and ``sx`` and a default
        integer index.
    """
    # count_sources yields one row per (source, user). Summing per user keeps
    # a single row per user even when a group lists several sources, and lets
    # `sx` be matched to `dx` by user name instead of by row position.
    dx_counts = count_sources(impressions, sources_dx).groupby(
        'user', sort=False)['count'].sum()
    sx_counts = count_sources(impressions, sources_sx).groupby(
        'user', sort=False)['count'].sum()

    sources_users = pd.DataFrame({'user': dx_counts.index,
                                  'dx': dx_counts.to_numpy()})
    sources_users['sx'] = sx_counts.reindex(dx_counts.index).to_numpy()
    return sources_users


# Per-user dx/sx counts for the experiment window and for the baseline window.
sources_users_exp = users_dx_and_sx(sources_dx, sources_sx, impressions_exp)
sources_users_pre = users_dx_and_sx(sources_dx, sources_sx, impressions_pre)

# NOTE(review): a user with sx == 0 produces an infinite or NaN ratio, which
# then feeds into the means and the t-test below — confirm how such users
# should be treated.
sources_users_exp['ratio_dx-sx'] = sources_users_exp['dx'] / \
    sources_users_exp['sx']

# Adjusted ratio: each experiment count is divided by the same user's baseline
# count. The baseline is looked up by user name, because matching the two
# frames by row position pairs a user with somebody else's baseline as soon
# as the two windows do not contain exactly the same users.
pre_totals = sources_users_pre.groupby('user')[['dx', 'sx']].sum()
pre_dx = sources_users_exp['user'].map(pre_totals['dx'])
pre_sx = sources_users_exp['user'].map(pre_totals['sx'])

#sources_users_exp['ratio_dx-sx_adj'] = (sources_users_exp['dx']/sources_users_pre['dx'])/(sources_users_exp['sx']/sources_users_pre['sx'])
sources_users_exp['ratio_dx-sx_adj'] = (sources_users_exp['dx'].divide(
    pre_dx))/(sources_users_exp['sx'].divide(pre_sx))

# add column with test group the user belongs to
polarized_users = ('Aarend', 'Bjoke', 'Cor Timmerman', 'Doortje',
                   'Erwijn', 'Friedie', 'Gert', 'Hanneke', 'Ivo', 'Juultje')

sources_users_exp["user_group"] = ["polarized" if user in polarized_users
                                   else "control"
                                   for user in sources_users_exp["user"]]


# compare means and run t-test
# `adjusted` selects which of the two ratio columns is tested and plotted.
adjusted = False
if adjusted:
    testing_var = 'ratio_dx-sx_adj'
else:
    testing_var = 'ratio_dx-sx'

polarized = sources_users_exp[sources_users_exp['user_group']
                              == 'polarized'][testing_var]  # or _adj
control = sources_users_exp[sources_users_exp['user_group']
                            == 'control'][testing_var]


print(mean(polarized))
print(mean(control))

# Two-sample t-test between the polarized and the control group.
statistic, pvalue = ttest_ind(polarized, control)

print('Statistic: {}'.format(statistic))
print('Pvalue: {}'.format(pvalue))

In [None]:
import seaborn as sns
%matplotlib inline

# Plot the tested ratio for each group; the title says whether the adjusted
# ratio is shown.
title = 'Right/Left count ratio' + (" - Adjusted" if adjusted else "")

sns.set(style="ticks", color_codes=True)
groups_data = sources_users_exp[[testing_var, "user_group"]]
ratio_grid = sns.catplot(x="user_group", y=testing_var, data=groups_data)
ratio_grid.fig.suptitle(title, fontsize=10)

In [None]:
# Show the per-user values behind the plot.
display(groups_data)