In [85]:
import calendar
import time
from math import ceil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from fitter import Fitter
from scipy.stats import exponnorm, erlang, gennorm


# Analysis window. Each tuple is a half-open (start, stop) pair that is
# unpacked straight into range(), so the stop value itself is excluded.
days_range = (22, 23)   # (1, 32) would cover days 1..31
months_range = (1, 2)   # (1, 3) would cover January and February
hours_range = (0, 24)   # hours 0..23, i.e. one file per hour of the day

In [86]:
def load_data(column):
    """Read one column from every hourly Summary_Sentiment CSV in the window.

    The window comes from the module-level ``months_range``, ``days_range``
    and ``hours_range`` tuples, each unpacked into ``range()``. All files
    are looked up under ``../COVID19_Tweets_Dataset_2020/Summary_Sentiment/``
    for the year 2020.

    Parameters
    ----------
    column : str
        Name of the CSV column to extract from each hourly file.

    Returns
    -------
    tuple
        ``(combined, per_file)``. ``per_file`` is a list holding one
        ``pandas.Series`` per file, in the order the files were read
        (month, then day, then hour). ``combined`` is the concatenation of
        those Series with a fresh 0..n-1 index. When the window selects no
        file, ``combined`` is an empty Series and ``per_file`` an empty list.

    Notes
    -----
    Errors from ``pd.read_csv`` (e.g. a missing file) or from the column
    lookup are not caught and propagate to the caller.
    """
    per_file = []
    for month in range(*months_range):
        month_dir = f'../COVID19_Tweets_Dataset_2020/Summary_Sentiment/2020_{month:02}/'
        # Clamp to the real month length (2020 is a leap year, so February
        # has 29 days) instead of special-casing February only.
        days_in_month = calendar.monthrange(2020, month)[1]

        for day in range(*days_range):
            if day > days_in_month:
                continue
            # January days before the 22nd are skipped -- presumably the
            # dataset starts on 2020-01-22 (TODO confirm).
            if month == 1 and day < 22:
                continue

            for hour in range(*hours_range):
                file_name = f'{month_dir}2020_{month:02}_{day:02}_{hour:02}_Summary_Sentiment.csv'
                per_file.append(pd.read_csv(file_name)[column])

    if not per_file:
        # pd.concat() rejects an empty list; return an empty Series so the
        # first tuple item has the same type as in the non-empty case.
        # float64 assumes a numeric column -- TODO confirm for other columns.
        return pd.Series(name=column, dtype='float64'), per_file
    return pd.concat(per_file, ignore_index=True), per_file

In [87]:
# Load the 'Logits_Negative' column for the configured window.
# `stream_ln` is the concatenation of all files; `tweets_per_file` is the
# list of per-file Series that get_dist_ks_values() iterates over.
stream_ln, tweets_per_file = load_data('Logits_Negative')

In [88]:
def get_fitted_summary(data):
    """Fit the default ``Fitter`` distribution set to ``data`` and return the fit table.

    Parameters
    ----------
    data : array-like
        Sample to fit (the notebook passes one hourly file's column).

    Returns
    -------
    pandas.DataFrame
        The table produced by ``Fitter.summary`` with
        ``method='ks_pvalue'``; no plot is drawn and the current figure is
        not cleared.
    """
    f = Fitter(data)
    # For a quick debugging run, restrict f.distributions before calling fit().
    f.fit()
    # Request as many rows as there are candidate distributions rather than a
    # hard-coded 110: the candidate count depends on the installed
    # SciPy/fitter versions, and a fixed cap would silently drop rows.
    return f.summary(method='ks_pvalue', plot=False, clf=False, Nbest=len(f.distributions))

def _configured_dates():
    """Return the ``(month, day)`` pairs of the configured window, in load order.

    Mirrors the iteration in ``load_data``: months outer, days inner,
    skipping days that do not exist in 2020 and January days before the 22nd.
    """
    dates = []
    for month in range(*months_range):
        days_in_month = calendar.monthrange(2020, month)[1]
        for day in range(*days_range):
            if day > days_in_month or (month == 1 and day < 22):
                continue
            dates.append((month, day))
    return dates


def get_dist_ks_values():
    """Fit every loaded file and pickle the ``ks_pvalue`` tables day by day.

    Reads the module-level ``tweets_per_file`` list (one sample per hourly
    file) and the ``months_range`` / ``days_range`` / ``hours_range`` window.
    Each file is passed to ``get_fitted_summary``; the ``ks_pvalue`` column
    of the result becomes one row of the current day's frame (columns are
    distribution names). After the last file of a day the frame is written to
    ``../results/top_distributions_2020_<MM>_<DD>.pkl``.

    If files remain that do not complete a day, they are written to
    ``../results/top_distributions_2020_<n>.pkl`` where ``<n>`` is the
    fractional number of days processed.

    Returns
    -------
    None
        Results are written to disk; progress totals are printed.

    Raises
    ------
    ValueError
        If ``tweets_per_file`` holds more files than the configured window
        describes, i.e. the configuration changed after the data was loaded.
    """
    # One file per hour of the window; 24 for the full (0, 24) range.
    files_per_day = len(range(*hours_range))
    dates = _configured_dates()
    if len(tweets_per_file) > files_per_day * len(dates):
        raise ValueError(
            'tweets_per_file holds more files than the configured '
            'months_range/days_range/hours_range window; re-run load_data()'
        )

    n_tweets = 0
    n_files = 0
    # One {distribution: p-value} record per file of the day being processed.
    # Building the frame from records aligns values by distribution name, so
    # a distribution missing from one hour's table becomes NaN instead of
    # shifting the rows of that column.
    day_records = []

    for tweets_hour in tqdm.tqdm(tweets_per_file):
        n_files += 1
        n_tweets += len(tweets_hour)
        table = get_fitted_summary(tweets_hour)
        day_records.append(table['ks_pvalue'].to_dict())

        if n_files % files_per_day == 0:
            # Look the date up from the load order rather than deriving it
            # arithmetically, which breaks once the window crosses a month.
            month, day = dates[n_files // files_per_day - 1]
            pd.DataFrame(day_records).to_pickle(
                f'../results/top_distributions_2020_{month:02}_{day:02}.pkl'
            )
            day_records = []

    if day_records:
        # Left-over files that do not complete a day.
        pd.DataFrame(day_records).to_pickle(
            '../results/top_distributions_2020_{}.pkl'.format(n_files / files_per_day)
        )

    n_days = n_files // files_per_day if files_per_day else 0
    print(f'Processed {n_files} files!')
    print(f'Processed {n_days} days!')
    print(f'Processed {n_tweets} tweets!')

In [89]:
# Fit every loaded hourly file and write the per-day p-value pickles under
# ../results/. This is the slow step of the notebook.
get_dist_ks_values()

  0%|          | 0/24 [00:00<?, ?it/s]SKIPPED _fit distribution (taking more than 30 seconds)

Fitting 110 distributions:  38%|███▊      | 42/110 [00:05<00:15,  4.32it/s]SKIPPED kstwo distribution (taking more than 30 seconds)
SKIPPED loguniform distribution (taking more than 30 seconds)
Fitting 110 distributions:  72%|███████▏  | 79/110 [00:15<00:13,  2.24it/s]SKIPPED reciprocal distribution (taking more than 30 seconds)
SKIPPED rv_histogram distribution (taking more than 30 seconds)
SKIPPED rv_continuous distribution (taking more than 30 seconds)SKIPPED levy_stable distribution (taking more than 30 seconds)
SKIPPED studentized_range distribution (taking more than 30 seconds)
  If increasing the limit yields no improvement it is advised to analyze 
  the integrand in order to determine the difficulties.  If the position of a 
  local difficulty can be determined (singularity, discontinuity) one will 
  probably gain from splitting up the interval and calling the integrator 
  on the s

Processed 24 files!
Processed 1 days!
Processed 36333 tweets!


In [90]:
# Read the per-day p-value frames for January 22-24 and stack them so that
# every row is one fitted file and every column one distribution.
pkls = [pd.read_pickle(f'../results/top_distributions_2020_01_{p}.pkl') for p in range(22, 25)]
df_ks_values = pd.concat(pkls, ignore_index=True)

# Per distribution: (mean p-value ignoring NaN, number of NaN entries,
# number of entries with p-value >= 0.01).
mean_pvalues = {
    name: (
        np.nanmean(pvalues),
        pvalues.isna().sum(),
        pvalues[pvalues >= 0.01].count(),
    )
    for name, pvalues in df_ks_values.items()
}

result = pd.DataFrame(
    mean_pvalues.values(),
    index=mean_pvalues.keys(),
    columns=['p_value', 'nan_count', 'greater_equal_.01'],
)

# Rank: most entries at or above 0.01 first, then fewest NaN entries, then
# highest mean p-value.
result = result.sort_values(
    by=['greater_equal_.01', 'nan_count', 'p_value'],
    ascending=[False, True, False],
)

result.head(20)

Unnamed: 0,p_value,nan_count,greater_equal_.01
gumbel_r,0.034702,1,9
laplace_asymmetric,0.14478,55,9
exponnorm,0.240352,56,9
dgamma,0.131415,57,9
dweibull,0.116537,58,9
erlang,0.208469,60,9
invweibull,0.249026,62,9
johnsonsu,0.418445,63,9
norminvgauss,0.417689,63,9
nct,0.370731,63,9


In [91]:
# Rebuild the unsorted per-distribution table from the statistics above.
result = pd.DataFrame(
    mean_pvalues.values(),
    index=mean_pvalues.keys(),
    columns=['p_value', 'nan_count', 'greater_equal_.01'],
)

# Combined score: the share of non-NaN entries, the share of entries with
# p-value >= 0.01 and the mean p-value, summed and divided by three.
n_rows = len(df_ks_values)
valid_plus_passing = (n_rows - result['nan_count']) + result['greater_equal_.01']
result['metrics_mean'] = (valid_plus_passing / n_rows + result['p_value']) / 3

result = result.sort_values(by=['metrics_mean'], ascending=[False])

result.head(20)

Unnamed: 0,p_value,nan_count,greater_equal_.01,metrics_mean
gumbel_r,0.0347023,1,9,0.381938
laplace,0.01532109,0,7,0.370848
logistic,0.02820824,2,8,0.370514
norm,0.01321836,0,7,0.370147
uniform,8.007622e-07,0,0,0.333334
expon,1.508877e-11,0,0,0.333333
gumbel_l,0.0003505133,2,0,0.324191
rayleigh,0.003273832,13,2,0.283499
johnsonsu,0.418445,63,9,0.222815
norminvgauss,0.4176886,63,9,0.222563


  If increasing the limit yields no improvement it is advised to analyze 
  the integrand in order to determine the difficulties.  If the position of a 
  local difficulty can be determined (singularity, discontinuity) one will 
  probably gain from splitting up the interval and calling the integrator 
  on the subranges.  Perhaps a special-purpose integrator should be used.
  quad_r = quad(f, low, high, args=args, full_output=self.full_output,
  quad_r = quad(f, low, high, args=args, full_output=self.full_output,
