# Data Cleaning (for the Facebook summary)

## Load Data

In [None]:
from ipywidgets import interact, interact_manual
import ipywidgets as widgets
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas_profiling
import sys
sys.path.append("..")
from src import tools

%matplotlib inline


def absoluteFilePaths(directory):
    """Yield the absolute path of every non-hidden file below *directory*.

    The tree is walked recursively with ``os.walk``. Files whose name starts
    with a dot are skipped; directories themselves are never yielded.
    """
    for root, _dirs, names in os.walk(directory):
        visible = (name for name in names if not str(name).startswith('.'))
        for name in visible:
            yield os.path.abspath(os.path.join(root, name))


# Read every summary CSV found under `folder` and store the resulting frame
# in `data`, keyed by the value of the `user` column in the file's first row.
folder = "../outputs/fb/summary/"
files = absoluteFilePaths(folder)

df_list = []
users = []
data = {}
for f in files:
    df = pd.read_csv(f)
    user = df['user'].iloc[0]
    data[user] = df

print(data.keys())

## Clean Data

In [None]:
# Per-user summary table; the cleaning loop below writes one row per user.
stats = pd.DataFrame(columns=['start', 'end',
                              'average_impression_count',
                              'timelines',
                              'total_entries',
                              'missing values'
                              ])

# AUTOMATE THIS
# Hand-maintained mapping from a page URL to the display name used in the
# `source` column. It does not depend on the user, so it is built once here
# instead of once per loop iteration.
dictionary = {
    'https://www.facebook.com/AgenziaANSA/': 'ANSA.it',
    'https://www.facebook.com/AgenziaAdnKronos/': 'Adnkronos',
    'https://www.facebook.com/corrieredellasera/': 'Corriere della Sera',
    'https://www.facebook.com/giorgiameloni.paginaufficiale/': 'Giorgia Meloni',
    'https://www.facebook.com/HuffPostItalia/': 'HuffPost Italia',
    'https://www.facebook.com/ilfoglio/': 'Il Foglio',
    'https://www.facebook.com/ilGiornale/': 'Il Giornale',
    'https://www.facebook.com/italiaviva/': 'Italia Viva',
    'https://www.facebook.com/la7fb/': 'La7',
    'https://www.facebook.com/lastampa.it/': 'La Stampa',
    'https://www.facebook.com/legasalvinipremier/': 'Lega - Salvini Premier',
    'https://www.facebook.com/liberonews/': 'Libero',
    'https://www.facebook.com/LuigiDiMaio/': 'Luigi Di Maio',
    'https://www.facebook.com/matteorenziufficiale/': 'Matteo Renzi',
    'https://www.facebook.com/Messaggero.it/': 'Il Messaggero.it',
    'https://www.facebook.com/partitodemocratico/': 'Partito Democratico',
    'https://www.facebook.com/rainews.it/': 'Rainews.it',
    'https://www.facebook.com/Repubblica/': 'la Repubblica',
    'https://www.facebook.com/salviniofficial/': 'Matteo Salvini'}

for user, df in data.items():
    # Drop the stray 'Unnamed: 0' column when it is present (presumably an
    # index column written out with the CSV). `errors='ignore'` replaces the
    # former bare `except: pass`, which hid every other failure as well.
    df = df.drop(columns='Unnamed: 0', errors='ignore')

    df = df[['impressionTime', 'impressionOrder',  'id',
             'nature',  'postId', 'publicationTime', 'permaLink',
             'source', 'sourceLink', 'fblinktype',
             'texts', 'textsize', 'images',
             'semanticCount', 'semanticId', 'timeline', 'user']]

    # Index the rows by impression time so they can be sliced by date.
    df.index = df['impressionTime']
    df.index = pd.to_datetime(df.index)

    # end date, start date; format: YYYY-MM-DD
    # NOTE(review): the slice runs from the later to the earlier date, which
    # presumably relies on the rows being ordered newest-first - confirm.
    # `.copy()` makes the column assignments below act on an independent
    # frame rather than on a slice of the loaded data.
    df = df['2020-02-06':'2019-10-15'].copy()
    #df = df['2019-02-06':'2019-10-15']

    df['publicationTime'] = pd.to_datetime(df['publicationTime'])
    df['impressionOrder'] = pd.to_numeric(df['impressionOrder'],
                                          downcast='integer')

    # Treat the literal string "nan" and the empty string as missing values.
    df = df.replace("nan", np.nan)
    df = df.replace("", np.nan)
    df['fblinktype'] = df['fblinktype'].fillna('other')

    # Fall back to the link when no source name was recorded, then translate
    # known page URLs into their display names.
    df['source'] = df['source'].fillna(df['sourceLink'])
    df = df.replace({"source": dictionary})

    stats.loc[user] = [
        df.impressionTime.min(),
        df.impressionTime.max(),
        df.impressionOrder.mean(),
        df['timeline'].nunique(),
        df['id'].count(),
        df.isna().sum().sum()
    ]
    # Replace the raw frame with its cleaned version for the cells below.
    data[user] = df.copy()


stats

In [None]:
#data['parsnip-lentils-fudge'].drop('impressionTime',1).profile_report(check_correlation_cramers=False,
#                  check_correlation_pearson=False,
#                  cardinality_threshold=len(df)*0.9,
#                  style={'full_width':True}
#                                          )

### Check how many posts have been collected per day

In [None]:
# Stack the per-user frames held in `data` into one combined DataFrame.
df = pd.concat(data.values())


def dailyPostCountGraph(df, user):
    """Draw a bar chart of the number of `id` values per day for one user.

    `df` is passed through `tools.setDatetimeIndex` and then
    `tools.setDatetimeIndexFloor(..., what="1D")` (presumably giving it a
    datetime index floored to whole days - see `src/tools`), after which the
    non-null `id` values are counted per index value.

    `user` is only used in the plot title. Returns the `(fig, ax)` pair.
    """
    indexed = tools.setDatetimeIndex(df)
    per_day = tools.setDatetimeIndexFloor(indexed, what="1D")
    counts = per_day.groupby(per_day.index)['id'].count().to_frame()

    fig, ax = plt.subplots(figsize=(15, 7))

    # Let matplotlib pick the tick spacing; label ticks as e.g. "Nov 05".
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
    ax.set_title('Number of posts collected per day for user: ' + user)
    ax.bar(counts.index, counts['id'])
    return fig, ax


#fig, ax = (dailyPostCountGraph(df, user))

@interact
def show_plot(user=df['user'].unique()):
    """Interactive wrapper: plot the daily post count for the chosen user."""
    user_frame = data[user]
    return dailyPostCountGraph(user_frame, user)

In [None]:
def dailyPostCountGraph2(df, user, column):
    """Draw a bar chart of the number of non-null `column` values per day.

    `df` is passed through `tools.setDatetimeIndex` and then
    `tools.setDatetimeIndexFloor(..., what="1D")` (presumably giving it a
    datetime index floored to whole days - see `src/tools`) before the
    values of `column` are counted per index value.

    `user` is only used in the plot title. Returns the `(fig, ax)` pair.
    """
    indexed = tools.setDatetimeIndex(df)
    per_day = tools.setDatetimeIndexFloor(indexed, what="1D")
    counts = per_day.groupby(per_day.index)[column].count().to_frame()

    fig, ax = plt.subplots(figsize=(15, 7))

    # Let matplotlib pick the tick spacing; label ticks as e.g. "Nov 05".
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
    title = 'Number of entries in col ' + column + ' collected per day for: ' + user
    ax.set_title(title)
    ax.bar(counts.index, counts[column])
    return fig, ax


#fig, ax = (dailyPostCountGraph(df, user))

@interact
def show_plot(user=df['user'].unique(), column=df.columns):
    """Interactive wrapper: plot daily counts of `column` for the chosen user."""
    user_frame = data[user]
    return dailyPostCountGraph2(user_frame, user, column)

In [None]:
def clean(df):
    """Return the rows of *df* that have a `source`, reduced to nine columns.

    Rows whose `source` is null are dropped. The result keeps, in this order:
    impressionOrder, id, nature, source, fblinktype, texts, textsize,
    timeline and user. The input frame is not modified.
    """
    kept_columns = ['impressionOrder', 'id', 'nature',
                    'source', 'fblinktype',
                    'texts', 'textsize',
                    'timeline', 'user']
    has_source = df['source'].notna()
    with_source = df[has_source]
    return with_source[kept_columns]


# Apply `clean` to the combined frame and show the result in the notebook.
cleaned_df = clean(df)

display(cleaned_df)

In [None]:
#cleaned_df.profile_report(check_correlation_cramers=False,
#                          check_correlation_pearson=False,
#                          cardinality_threshold=len(df)*0.9,
#                          style={'full_width': True})

In [None]:
def datesPostCountGraph(df, user):
    """Draw a bar chart of the number of `id` values per day for one user.

    `df` is passed to `tools.setDatetimeIndexFloor(..., what="1D")` directly
    (presumably it must already carry a datetime index - see `src/tools`);
    the non-null `id` values are then counted per index value.

    `user` is only used in the plot title. Returns the `(fig, ax)` pair.
    """
    per_day = tools.setDatetimeIndexFloor(df, what="1D")
    counts = per_day.groupby(per_day.index)['id'].count().to_frame()

    fig, ax = plt.subplots(figsize=(15, 7))

    # Let matplotlib pick the tick spacing; label ticks as e.g. "Nov 05".
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
    ax.set_title('Number of posts collected per day for user: ' + user)
    ax.bar(counts.index, counts['id'])
    return fig, ax


#fig, ax = (dailyPostCountGraph(df, user))

@interact
def show_plot(user=cleaned_df['user'].unique(), start_date="2019-10-12", end_date="2020-01-03"):
    """Interactive wrapper: plot one user's daily post count within a date window."""
    user_rows = cleaned_df[cleaned_df['user'] == user]
    # The slice runs from end_date to start_date.
    # NOTE(review): presumably the index is ordered newest-first - confirm.
    window = user_rows.loc[end_date:start_date]
    return datesPostCountGraph(window, user)

In [None]:
# Import Altair and switch it to its 'notebook' renderer.
# NOTE(review): `alt` is not referenced by any other cell visible here.
import altair as alt
alt.renderers.enable('notebook')

# who has more missing days?


def dailyPostCount(df):
    """Count non-null `id` values per (index value, user) pair.

    `df` is first passed to `tools.setDatetimeIndexFloor(..., what="1D")`
    (presumably flooring its datetime index to whole days - see `src/tools`).
    Returns a one-column DataFrame named `id`, indexed by the pair.
    """
    per_day = tools.setDatetimeIndexFloor(df, what="1D")
    grouped = per_day.groupby([per_day.index, per_day.user])
    return grouped['id'].count().to_frame()


# Posts per (day, user) pair in the cleaned data.
count = dailyPostCount(cleaned_df)


# Number of (day, user) pairs with more than 60 posts. This used to be a
# bare expression in the middle of the cell, so its value was never shown.
print(len(count[(count['id'] > 60)]))

# For every threshold 0..399, count the (day, user) pairs whose post count
# exceeds it. The table is built in one step instead of growing a DataFrame
# row by row, which re-allocates the frame on every insertion.
posts_per_pair = count['id']
thresholds = range(0, 400)
where_to_cut = pd.DataFrame(
    {'total_posts': [int((posts_per_pair > threshold).sum())
                     for threshold in thresholds]},
    index=thresholds,
)
where_to_cut.plot()

In [None]:
# Reset `stats` to an empty table that has only the summary column names.
stats = pd.DataFrame(columns=['start', 'end',
                              'average_impression_count',
                              'timelines',
                              'total_entries',
                              'missing values'
                              ])


def getStats(df):
    """Summarise *df* per user and return the summary as a new DataFrame.

    The result has one row per distinct value of ``df['user']`` (in order of
    first appearance), indexed by that value, with the columns:

    - ``start`` / ``end``: smallest and largest index value of the user's rows
    - ``average_impression_count``: mean of ``impressionOrder``
    - ``timelines``: number of distinct ``timeline`` values
    - ``total_entries``: number of non-null ``id`` values
    - ``missing values``: number of NaN cells over all columns

    The table is created inside the function on every call. It no longer
    writes into a module-level ``stats`` frame, so the function works on its
    own and repeated calls do not accumulate rows from earlier inputs.
    """
    summary = pd.DataFrame(columns=['start', 'end',
                                    'average_impression_count',
                                    'timelines',
                                    'total_entries',
                                    'missing values'
                                    ])
    for user in df['user'].unique():
        user_rows = df[df['user'] == user]

        summary.loc[user] = [
            user_rows.index.min(),
            user_rows.index.max(),
            user_rows['impressionOrder'].mean(),
            user_rows['timeline'].nunique(),
            user_rows['id'].count(),
            user_rows.isna().sum().sum()
        ]
    return summary


# Summarise the cleaned data per user and display the table.
stats = getStats(cleaned_df)

stats

In [None]:
# Largest `start` and smallest `end` over all users - presumably the bounds
# of the period for which every user has data (confirm against `stats`).
stats.start.max(), stats.end.min()

In [None]:
# User ids listed as "polarized" (presumably the accounts in the polarised
# group of the experiment - confirm). A comma was missing after
# 'coleslaw-ceviche-broccoli', which made Python join it with the next
# literal into a single id that matches no user.
polarized = ['kale-sushi-eggs',
             'pickles-coleslaw-rhubarb',
             'parsnip-lentils-fudge',
             'yams-pomelo-guava',
             'eggs-macaroon-pretzel',
             'croissant-watercress-pudding',
             'yolk-carnitas-date',
             'asparagus-croissant-kiwi',
             'coleslaw-ceviche-broccoli',
             'coconut-fudge-mandarin',
             ]

In [None]:
# Histogram (16 bins) of the largest `impressionOrder` seen in each timeline.
df.groupby(['timeline'])['impressionOrder'].max().plot.hist(bins=16)

In [None]:
# Narrow `df` to the columns listed here; every other column is dropped.
df = df[['impressionOrder', 'id', 'nature', 'postId', 'publicationTime',
         'permaLink', 'source', 'sourceLink', 'fblinktype', 'texts',
         'textsize', 'semanticCount', 'semanticId', 'timeline', 'user']]

In [None]:
# Build a mapping from the user id stored in each summary file to the name
# listed in the users config, then apply it to the `user` column of `df`.
user_names = pd.read_csv("../config/users.csv")
names_dict = {}

for name, number, token in user_names.values:
    summary_path = '../outputs/fb/summary/'+token+'.csv'
    # Two rows are enough: only the first row's `user` value is read.
    temp_df = pd.read_csv(summary_path, nrows=2)
    user_id = temp_df['user'][0]
    names_dict[user_id] = name

df = df.replace({"user": names_dict})

In [None]:

# Bucket all rows by hour and record which users and timelines appear in
# each bucket.
# NOTE: no copy is taken, so flooring the index here also changes `df`.
df.index = df.index.floor('H')
hourly = df.groupby(df.index)
names = hourly.user.unique()
timelines = hourly.timeline.unique()

# One row per hour: the distinct users/timelines and how many there are.
result = pd.DataFrame({'names': names})
result['users_count'] = result['names'].str.len()
result['timelines'] = timelines
result['timelines_count'] = result['timelines'].str.len()
display(result)

# result.users_count.plot.hist(bins=5)
result.timelines_count.plot.hist(bins=31)

In [None]:
# Histogram (5 bins) of the number of distinct users seen per hour.
# NOTE(review): the title mentions "total impression" while the plotted
# column is `users_count` - confirm the title is the intended one.
result.users_count.plot.hist(
    bins=5, title='distribution of total impression')

In [None]:

# Report how many hourly buckets remain under increasingly loose thresholds
# on the number of distinct users and timelines per hour.
users_per_hour = result['users_count']
timelines_per_hour = result['timelines_count']

both_exactly_20 = result[(users_per_hour == 20) & (timelines_per_hour == 20)]
print('if timeline 20 and users 20: ' + str(len(both_exactly_20)) + ' hours')

users_20_timelines_20_plus = result[(users_per_hour == 20) & (timelines_per_hour >= 20)]
print('if only users are 20 and timelines >= 20: ' + str(len(users_20_timelines_20_plus)) + ' hours')

both_at_least_16 = result[(users_per_hour >= 16) & (timelines_per_hour >= 16)]
print('if users and timeline >=16: ' + str(len(both_at_least_16)) + ' hours')

both_at_least_10 = result[(users_per_hour >= 10) & (timelines_per_hour >= 10)]
print('if users and timeline >=10: ' + str(len(both_at_least_10)) + ' hours')

print('all data: ' + str(len(result)) + ' hours')

In [None]:
# Keep the hours with exactly 20 users and at least 20 timelines, sum the
# columns per calendar day and show the daily sums as a bar chart.
complete_hours = result[(result['users_count'] == 20) & (result['timelines_count'] >= 20)]
complete_hours.groupby(complete_hours.index.floor('D')).sum().plot(kind='bar')

In [None]:
# Hours with exactly 20 users, restricted to 2019-11-01 .. 2019-12-31
# (both ends included), counted per calendar day.
all_users_present = result['users_count'] == 20
trimmed = result[all_users_present].loc['2019-11-01':'2019-12-31']
hours_per_day = trimmed.users_count.groupby(trimmed.index.floor('D')).count()
hours_per_day.plot.bar(title='All data in timeframe')


In [None]:
# Hours with exactly 20 users, restricted to 2019-11-02 .. 2019-11-19
# (both ends included), counted per calendar day.
all_users_present = result['users_count'] == 20
trimmed = result[all_users_present].loc['2019-11-02':'2019-11-19']
hours_per_day = trimmed.users_count.groupby(trimmed.index.floor('D')).count()
hours_per_day.plot.bar(title='Pre-polarisation')


In [None]:
# Hours with exactly 20 users, restricted to 2019-11-20 .. 2019-12-05
# (both ends included), counted per calendar day.
all_users_present = result['users_count'] == 20
trimmed = result[all_users_present].loc['2019-11-20':'2019-12-05']
hours_per_day = trimmed.users_count.groupby(trimmed.index.floor('D')).count()
hours_per_day.plot.bar(title='During-polarisation')


In [None]:
# Hours with exactly 20 users, restricted to 2019-12-05 .. 2019-12-10
# (both ends included), counted per calendar day.
# NOTE(review): 2019-12-05 is also the last day of the
# 'During-polarisation' window - confirm the overlap is intended.
all_users_present = result['users_count'] == 20
trimmed = result[all_users_present].loc['2019-12-05':'2019-12-10']
hours_per_day = trimmed.users_count.groupby(trimmed.index.floor('D')).count()
hours_per_day.plot.bar(title='Polarized without external pages')


In [None]:
# Hours with exactly 20 users, restricted to 2019-12-10 .. 2019-12-27
# (both ends included), counted per calendar day.
# NOTE(review): 2019-12-10 is also the last day of the
# 'Polarized without external pages' window - confirm the overlap is intended.
all_users_present = result['users_count'] == 20
trimmed = result[all_users_present].loc['2019-12-10':'2019-12-27']
hours_per_day = trimmed.users_count.groupby(trimmed.index.floor('D')).count()
hours_per_day.plot.bar(title='Post-polarisation and post-external-likes')
