In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import tqdm
from math import ceil

from fitter import Fitter
from scipy.stats import exponnorm, erlang, gennorm
import util

# One dict per experiment. Each dict is unpacked as keyword arguments to
# util.load_data, and its 'days_range' is also handed to get_dist_ks_values.
# NOTE(review): whether the (start, end) ranges are inclusive or half-open is
# decided inside util.load_data -- confirm there.
experiments = [
    {
        'file_column': 'Logits_Negative',
        'days_range': (25, 26),
        'months_range': (1, 2),
        'hours_range': (0, 24),
    },
]

In [2]:
def get_dist_ks_values(tweets_per_file, days_range, test=False):
    """Fit distributions to every batch and pickle the KS p-values one day at a time.

    Each element of ``tweets_per_file`` is passed to
    ``util.fit_data(batch, 'summary', test)``. The ``ks_pvalue`` column of the
    returned table is recorded per index label (presumably the distribution
    name -- confirm against ``util.fit_data``).

    After every 24 batches the collected values are written to
    ``../results/top_distributions_2020_MM_DD.pkl`` as a DataFrame with one row
    per batch and one column per label. Batches left over after the last full
    group of 24 are written to ``../results/top_distributions_2020_<n_files/24>.pkl``.

    Parameters
    ----------
    tweets_per_file : iterable of sized collections
        One entry per input file (presumably one file per hour, given the
        24-per-day grouping -- confirm against ``util.load_data``).
    days_range : tuple
        Only ``days_range[0]`` is used: it offsets the day number in the
        output file name.
    test : bool, default False
        Forwarded unchanged to ``util.fit_data``.

    Returns
    -------
    None
        Results are written to disk and the processed counts are printed.
    """
    n_tweets = 0
    n_files = 0
    # One {label: ks_pvalue} record per batch. Building the frame from records
    # keeps each row tied to its batch and fills NaN when a label is missing
    # from a batch; a dict of per-label lists raises ValueError as soon as the
    # list lengths differ.
    batch_records = []

    for tweets_hour in tqdm.tqdm(tweets_per_file):
        n_files += 1
        n_tweets += len(tweets_hour)
        table = util.fit_data(tweets_hour, 'summary', test)
        batch_records.append(table['ks_pvalue'].to_dict())

        if n_files % 24 == 0:
            c_day = n_files // 24
            # NOTE(review): month and day are derived from the number of days
            # processed so far, not from a calendar -- verify the file names
            # for runs that cross a month boundary.
            out_path = '../results/top_distributions_2020_{month:02}_{day:02}.pkl'.format(
                month=ceil(c_day / 31), day=c_day + days_range[0] - 1)
            pd.DataFrame(batch_records).to_pickle(out_path)
            batch_records = []

    # Flush an incomplete trailing day, if it produced any values at all.
    # n_files/24 is a float, so this name looks like '..._2020_1.5.pkl'.
    if any(batch_records):
        pd.DataFrame(batch_records).to_pickle('../results/top_distributions_2020_{}.pkl'.format(n_files/24))

    # Report from the running counter so the function also works when
    # tweets_per_file is a generator without len().
    print(f'Processed {n_files} files!')
    print(f'Processed {n_files//24} days!')
    print(f'Processed {n_tweets} tweets!')

In [3]:
# Run the KS p-value collection once for every configured experiment.
for e in experiments:
    # All keys of the experiment dict are forwarded to util.load_data as
    # keyword arguments; only the per-file batches are used afterwards.
    stream_ln, tweets_per_file = util.load_data(**e)
    get_dist_ks_values(tweets_per_file, days_range=e['days_range'], test=True)

Fitting 2 distributions: 100%|██████████| 2/2 [00:00<00:00,  4.43it/s]
Fitting 2 distributions: 100%|██████████| 2/2 [00:00<00:00,  4.48it/s]
Fitting 2 distributions: 100%|██████████| 2/2 [00:00<00:00,  4.78it/s]
Fitting 2 distributions: 100%|██████████| 2/2 [00:00<00:00,  4.49it/s]
Fitting 2 distributions: 100%|██████████| 2/2 [00:00<00:00,  8.15it/s]
Fitting 2 distributions: 100%|██████████| 2/2 [00:00<00:00,  4.28it/s]
Fitting 2 distributions: 100%|██████████| 2/2 [00:00<00:00,  3.98it/s]
Fitting 2 distributions: 100%|██████████| 2/2 [00:00<00:00,  4.33it/s]
Fitting 2 distributions: 100%|██████████| 2/2 [00:00<00:00,  4.08it/s]
Fitting 2 distributions: 100%|██████████| 2/2 [00:00<00:00,  4.09it/s]
Fitting 2 distributions: 100%|██████████| 2/2 [00:00<00:00,  4.21it/s]
Fitting 2 distributions: 100%|██████████| 2/2 [00:00<00:00,  3.98it/s]
Fitting 2 distributions: 100%|██████████| 2/2 [00:00<00:00,  3.50it/s]
Fitting 2 distributions: 100%|██████████| 2/2 [00:00<00:00,  3.39it/s]
Fittin

Processed 24 files!
Processed 1 days!
Processed 856089 tweets!





In [4]:
# Load the per-day KS p-value frames from ../results and stack them into one
# frame (one column per distribution label). The day is zero-padded so the
# name agrees with the '{day:02}' pattern get_dist_ks_values uses when writing.
# NOTE: pd.read_pickle executes pickled code -- only load files produced locally.
pkls = [
    pd.read_pickle(f'../results/top_distributions_2020_01_{day:02}.pkl')
    for day in range(22, 25)
]
df_ks_values = pd.concat(pkls, ignore_index=True)

# Per column: (mean p-value ignoring NaN,
#              number of NaN entries,
#              number of entries with p-value >= 0.01).
mean_pvalues = {}
for dist in df_ks_values:
    p_values = df_ks_values[dist]
    mean_pvalues[dist] = (
        np.nanmean(p_values),
        p_values.isna().sum(),
        p_values[p_values >= 0.01].count(),
    )

result = pd.DataFrame(
    mean_pvalues.values(),
    index=mean_pvalues.keys(),
    columns=['p_value', 'nan_count', 'greater_equal_.01'],
)

# Rank: most entries at or above the threshold first, then fewest NaN,
# then highest mean p-value.
result = result.sort_values(
    by=['greater_equal_.01', 'nan_count', 'p_value'],
    ascending=[False, True, False],
)

result.head(20)

Unnamed: 0,p_value,nan_count,greater_equal_.01
gumbel_r,0.034702,1,9
laplace_asymmetric,0.14478,55,9
exponnorm,0.240352,56,9
dgamma,0.131415,57,9
dweibull,0.116537,58,9
erlang,0.208469,60,9
invweibull,0.249026,62,9
johnsonsu,0.418445,63,9
norminvgauss,0.417689,63,9
nct,0.370731,63,9


In [5]:
# Composite score per distribution: the mean of three quantities --
# the fraction of non-NaN entries, the fraction of entries with
# p-value >= 0.01, and the mean p-value.
n_rows = len(df_ks_values)

result = (
    pd.DataFrame(
        mean_pvalues.values(),
        index=mean_pvalues.keys(),
        columns=['p_value', 'nan_count', 'greater_equal_.01'],
    )
    .assign(
        metrics_mean=lambda frame: (
            ((n_rows - frame['nan_count']) + frame['greater_equal_.01']) / n_rows
            + frame['p_value']
        ) / 3
    )
    # Highest composite score first.
    .sort_values(by=['metrics_mean'], ascending=[False])
)

result.head(20)

Unnamed: 0,p_value,nan_count,greater_equal_.01,metrics_mean
gumbel_r,0.0347023,1,9,0.381938
laplace,0.01532109,0,7,0.370848
logistic,0.02820824,2,8,0.370514
norm,0.01321836,0,7,0.370147
uniform,8.007622e-07,0,0,0.333334
expon,1.508877e-11,0,0,0.333333
gumbel_l,0.0003505133,2,0,0.324191
rayleigh,0.003273832,13,2,0.283499
johnsonsu,0.418445,63,9,0.222815
norminvgauss,0.4176886,63,9,0.222563
