In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import tqdm

from fitter import Fitter
from scipy.stats import exponnorm, erlang, gennorm


In [2]:
def load_data(column, months=range(1, 2), days=range(1, 27), hours=range(24),
              base_dir='../COVID19_Tweets_Dataset_2020/Summary_Sentiment'):
    """Read one column from the hourly ``*_Summary_Sentiment.csv`` files of 2020.

    Files are looked up as
    ``{base_dir}/2020_MM/2020_MM_DD_HH_Summary_Sentiment.csv`` for every
    requested month/day/hour combination.

    Parameters
    ----------
    column : str
        Name of the CSV column to extract from every file.
    months, days, hours : iterable of int, optional
        Calendar months, days of month and hours to load. The defaults
        reproduce the previously hard-coded selection.
    base_dir : str, optional
        Directory that contains the ``2020_MM`` sub-directories.

    Returns
    -------
    tuple
        ``(combined, per_file)`` where ``per_file`` is the list of per-file
        Series and ``combined`` is their concatenation with a fresh index.
        When no file matches the selection both elements are the same empty
        list.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when an expected hourly file is absent.
    KeyError
        If ``column`` is not present in one of the files.
    """
    import calendar  # local import keeps this cell self-contained

    stream = []
    for month in months:
        month_2_digits = f'{month:02}'
        # Number of days in this month of 2020 (a leap year, so February has 29).
        days_in_month = calendar.monthrange(2020, month)[1]

        for day in days:
            # Skip dates that do not exist (e.g. Feb 30, Apr 31) instead of
            # trying to open a file that cannot be there.
            if day > days_in_month:
                continue
            # No files are read before Jan 22 — presumably the first day
            # covered by the dataset; confirm against the data directory.
            if month == 1 and day < 22:
                continue

            for hour in hours:
                file_name = (f'{base_dir}/2020_{month_2_digits}/'
                             f'2020_{month_2_digits}_{day:02}_{hour:02}_Summary_Sentiment.csv')
                stream.append(pd.read_csv(file_name)[column])

    return (pd.concat(stream, ignore_index=True), stream) if len(stream) > 0 else (stream, stream)

In [3]:
# Load the 'Logits_Negative' column from every selected file.
# `stream_ln` receives the first element returned by load_data (the concatenation
# of all files when any were read); `tweets_per_file` receives the list of
# per-file Series that get_dist_ks_values() iterates over.
stream_ln, tweets_per_file = load_data('Logits_Negative')

In [4]:
def get_fitted_summary(data):
    """Fit ``Fitter``'s candidate distributions to ``data`` and return its summary.

    A ``Fitter`` is created on ``data`` with its default settings and fitted.
    The returned value is whatever ``Fitter.summary`` produces when asked for
    up to 110 entries using ``method='ks_pvalue'``, with plotting and figure
    clearing both disabled.
    """
    fitter = Fitter(data)
    # For a quick smoke run, the candidate list can be trimmed before fitting,
    # e.g. fitter.distributions = fitter.distributions[:3]
    fitter.fit()
    summary_options = dict(Nbest=110, method='ks_pvalue', plot=False, clf=False)
    return fitter.summary(**summary_options)

def get_dist_ks_values(files=None):
    """Fit every per-file series and pickle the KS p-values one day at a time.

    For each element of ``files`` the summary table from
    ``get_fitted_summary`` is computed and its ``ks_pvalue`` column (indexed by
    the table's row labels) is kept as one row. After every 24 files the
    collected rows are written to
    ``../results/top_distributions_day_<k>.pkl`` as a DataFrame with one column
    per row label, and the buffer is reset. A trailing group of fewer than 24
    files is written as the next day number.

    Parameters
    ----------
    files : iterable, optional
        Per-file data to fit. When omitted, the notebook-level
        ``tweets_per_file`` list is used, as before.

    Returns
    -------
    None
        Results are written to disk and a short report is printed.
    """
    files_per_day = 24  # one input file per hour of the day
    if files is None:
        files = tweets_per_file  # notebook global produced by load_data()

    def _save_day(records, day):
        # One record (dict) per file: building the frame from records keeps each
        # file's values on its own row and fills NaN where a label is missing
        # from that file's table, instead of silently shifting rows.
        pd.DataFrame(records).to_pickle('../results/top_distributions_day_{}.pkl'.format(day))

    n_tweets = 0
    n_files = 0
    day_records = []
    for tweets_hour in tqdm.tqdm(files):
        n_files += 1
        n_tweets += len(tweets_hour)
        table = get_fitted_summary(tweets_hour)
        day_records.append(table['ks_pvalue'].to_dict())

        if n_files % files_per_day == 0:
            _save_day(day_records, n_files // files_per_day)
            day_records = []

    if day_records:
        # Integer day number for the incomplete last day; true division here
        # used to produce file names such as "..._day_1.25.pkl".
        _save_day(day_records, n_files // files_per_day + 1)

    print(f'Processed {n_files} files!')
    print(f'Processed {n_files // files_per_day} days!')
    print(f'Processed {n_tweets} tweets!')

In [5]:
# Run the fitting pass over every entry of `tweets_per_file`; the function
# returns nothing and writes its results as per-day pickles under ../results/.
get_dist_ks_values()

  0%|          | 0/120 [00:00<?, ?it/s]SKIPPED _fit distribution (taking more than 30 seconds)

SKIPPED kstwo distribution (taking more than 30 seconds)SKIPPED loguniform distribution (taking more than 30 seconds)

Fitting 110 distributions:  73%|███████▎  | 80/110 [00:14<00:13,  2.27it/s]SKIPPED reciprocal distribution (taking more than 30 seconds)
SKIPPED rv_continuous distribution (taking more than 30 seconds)
Fitting 110 distributions:  74%|███████▎  | 81/110 [00:14<00:10,  2.64it/s]SKIPPED rv_histogram distribution (taking more than 30 seconds)SKIPPED levy_stable distribution (taking more than 30 seconds)
SKIPPED studentized_range distribution (taking more than 30 seconds)

  If increasing the limit yields no improvement it is advised to analyze 
  the integrand in order to determine the difficulties.  If the position of a 
  local difficulty can be determined (singularity, discontinuity) one will 
  probably gain from splitting up the interval and calling the integrator 
  on the

In [3]:
# Collect the per-day KS p-value tables written by get_dist_ks_values().
# NOTE(review): only days 1 and 2 are read here — widen the range if more
# day files should be included in the ranking.
# pd.read_pickle must only be used on trusted files (unpickling can run code).
pkls = [
    pd.read_pickle(f'../results/top_distributions_day_{day}.pkl')
    for day in range(1, 3)
]
df_ks_values = pd.concat(pkls, ignore_index=True)

# Per distribution: mean p-value with NaN entries ignored, and the NaN count.
# DataFrame.mean skips NaN and returns NaN for an all-NaN column without the
# "Mean of empty slice" RuntimeWarning that np.nanmean raises in that case.
mean_by_dist = df_ks_values.mean()
nan_by_dist = df_ks_values.isna().sum()

# Highest mean first. NaN means are placed last explicitly: sorting on a key
# that contains NaN with sorted() leaves the order undefined.
ranked = mean_by_dist.sort_values(ascending=False, na_position='last', kind='stable')

# Same shape as before: {distribution: (mean p-value, number of NaN entries)}.
mean_pvalues = {dist: (mean, nan_by_dist[dist]) for dist, mean in ranked.items()}
mean_pvalues

  mean_pvalues[dist] = (np.nanmean(df_ks_values[dist]), df_ks_values[dist].isna().sum())


{'johnsonsu': (0.4184450261403081, 39),
 'norminvgauss': (0.41768855600666793, 39),
 'genhyperbolic': (0.39888860450127184, 39),
 'fisk': (0.3971200230652554, 39),
 'genlogistic': (0.3930355153854472, 39),
 'burr12': (0.3904928400040852, 39),
 'burr': (0.386549213222448, 40),
 'nct': (0.370730738615346, 39),
 'alpha': (0.32159445986396185, 39),
 'invgamma': (0.3119981384673507, 39),
 'powerlognorm': (0.30619956533112613, 39),
 'betaprime': (0.30599643082796363, 39),
 'skewnorm': (0.3026532308209264, 39),
 'lognorm': (0.30115885217044575, 39),
 'exponnorm': (0.29581768513626894, 35),
 'johnsonsb': (0.29576867701474074, 39),
 'gengamma': (0.29417694873256994, 39),
 'fatiguelife': (0.29372081970181196, 39),
 'invgauss': (0.2846320009046507, 39),
 'genextreme': (0.2829066628853462, 39),
 'invweibull': (0.2766956490527307, 39),
 'pearson3': (0.27494719355345193, 39),
 'beta': (0.26787079604048336, 39),
 'mielke': (0.25949825324394166, 39),
 'geninvgauss': (0.2503522323445762, 39),
 'exponwe