In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from fitter import Fitter
from scipy.stats import exponnorm, erlang, gennorm


In [41]:
def load_data(column, months=range(1, 2), days=range(1, 23), hours=range(3),
              base_dir='./COVID19_Tweets_Dataset_2020/Summary_Sentiment'):
    """Read one column from the hourly 2020 Summary_Sentiment CSV files.

    Files are looked up as
    ``<base_dir>/2020_MM/2020_MM_DD_HH_Summary_Sentiment.csv``.

    Parameters
    ----------
    column : str
        Name of the CSV column to extract from every file.
    months : iterable of int, optional
        Month numbers to read. The default reads January only; pass
        ``range(1, 3)`` to include February as well.
    days : iterable of int, optional
        Candidate day numbers. Days that do not exist in a given month of 2020
        are skipped, so ``range(1, 32)`` is safe for every month.
    hours : iterable of int, optional
        Hours of the day to read. The default reads the first three hours;
        pass ``range(24)`` for whole days.
    base_dir : str, optional
        Directory that contains the ``2020_MM`` sub-folders.

    Returns
    -------
    tuple
        ``(combined, per_file)``: ``combined`` is a single Series with a fresh
        0..n-1 index holding every value, and ``per_file`` is the list of
        per-file Series in read order.

    Raises
    ------
    FileNotFoundError
        If an expected CSV file is missing.
    KeyError
        If ``column`` is not present in one of the files.
    ValueError
        If the selected ranges match no file (nothing to concatenate).
    """
    import calendar  # local import keeps this cell runnable on its own

    stream = []
    for month in months:
        month_2_digits = f'{month:02}'
        path = f'{base_dir}/2020_{month_2_digits}/'
        days_in_month = calendar.monthrange(2020, month)[1]

        for day in days:
            # Skip calendar days that do not exist (e.g. 30 February).
            if day > days_in_month:
                continue
            # January files before the 22nd are skipped, as in the original
            # code; presumably the dataset starts on 2020-01-22 - TODO confirm.
            if month == 1 and day < 22:
                continue

            for hour in hours:
                file_name = (
                    f'{path}2020_{month_2_digits}_{day:02}_{hour:02}'
                    '_Summary_Sentiment.csv'
                )
                stream.append(pd.read_csv(file_name)[column])

    return pd.concat(stream, ignore_index=True), stream

In [42]:
# Load the 'Logits_Negative' column: `stream_ln` is the concatenated Series,
# `tweets_per_file` is the list holding one Series per CSV file that was read.
stream_ln, tweets_per_file = load_data('Logits_Negative')

In [43]:
def get_fitted_summary(data):
    """Fit candidate distributions to ``data`` and return Fitter's summary table.

    ``Fitter`` is constructed with its default settings, fitted, and then asked
    for a summary using ``method='ks_pvalue'`` with plotting disabled.

    Parameters
    ----------
    data : array-like
        Sample values handed straight to ``Fitter``.

    Returns
    -------
    The object returned by ``Fitter.summary`` (callers in this notebook iterate
    it with ``iterrows()`` and read its ``'ks_pvalue'`` column).
    """
    fitter = Fitter(data)
    # For quick experiments the candidate set can be narrowed before fitting,
    # e.g. by truncating `fitter.distributions` to its first few entries.
    fitter.fit()
    # Nbest=110 requests up to 110 rows, so no fitted distribution is cut off
    # as long as the candidate set has at most 110 entries.
    summary_table = fitter.summary(
        method='ks_pvalue',
        plot=False,
        clf=False,
        Nbest=110,
    )
    return summary_table

def get_dist_mean_ks_value(files=None):
    """Average each distribution's KS p-value over a collection of samples.

    Every sample is fitted with ``get_fitted_summary``; the ``'ks_pvalue'``
    reported for each distribution is collected per distribution name and then
    averaged with ``np.mean``.

    Parameters
    ----------
    files : sequence of array-like, optional
        One sample per file. When omitted, the notebook-level
        ``tweets_per_file`` list is used, which keeps the original
        zero-argument call working.

    Returns
    -------
    dict
        Maps distribution name (the index label of the summary table) to the
        mean of its KS p-values across all samples.

    Side effects
    ------------
    Prints the number of samples and the total number of values processed.
    """
    if files is None:
        # Backward-compatible fallback to the global populated by load_data().
        files = tweets_per_file

    n_tweets = 0
    pvalues_by_dist = {}
    for tweets_hour in files:
        n_tweets += len(tweets_hour)
        table = get_fitted_summary(tweets_hour)
        # The summary table is indexed by distribution name.
        for dist_name, pvalue in table['ks_pvalue'].items():
            pvalues_by_dist.setdefault(dist_name, []).append(pvalue)

    mean_by_dist = {
        dist_name: np.mean(pvalues)
        for dist_name, pvalues in pvalues_by_dist.items()
    }

    print(f'Processed {len(files)} files!')
    print(f'Processed {n_tweets} tweets!')

    return mean_by_dist

In [44]:
# Mean KS p-value per distribution; this refits every candidate distribution
# on every loaded file, so the cell is slow to run.
mean_pvalues_by_dist = get_dist_mean_ks_value()
# Keep the dict under its own name so `mean_pvalues` is only ever the ranked
# list of (distribution, mean p-value) pairs, highest mean first.
mean_pvalues = sorted(mean_pvalues_by_dist.items(), key=lambda item: item[1], reverse=True)

Fitting 110 distributions:   0%|          | 0/110 [00:00<?, ?it/s]SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 110 distributions:  44%|████▎     | 48/110 [00:05<00:08,  7.28it/s]SKIPPED kstwo distribution (taking more than 30 seconds)
Fitting 110 distributions:  54%|█████▎    | 59/110 [00:07<00:12,  3.94it/s]SKIPPED loguniform distribution (taking more than 30 seconds)
Fitting 110 distributions:  73%|███████▎  | 80/110 [00:13<00:12,  2.41it/s]SKIPPED reciprocal distribution (taking more than 30 seconds)
Fitting 110 distributions:  74%|███████▎  | 81/110 [00:14<00:10,  2.78it/s]SKIPPED rv_continuous distribution (taking more than 30 seconds)
SKIPPED rv_histogram distribution (taking more than 30 seconds)
Fitting 110 distributions:  96%|█████████▋| 106/110 [00:30<00:08,  2.17s/it]SKIPPED levy_stable distribution (taking more than 30 seconds)
Fitting 110 distributions:  97%|█████████▋| 107/110 [00:36<00:09,  3.08s/it]SKIPPED studentized_range distribution (taking more t

Processed 3 files!
Processed 418 tweets!





In [45]:
# Show the ranked (distribution, mean KS p-value) pairs, highest mean first.
mean_pvalues

[('burr', 0.37015437502556514),
 ('gumbel_r', 0.35888865939808623),
 ('johnsonsu', 0.35577066268221075),
 ('invweibull', 0.35503833030587445),
 ('burr12', 0.3276003921931491),
 ('alpha', 0.32185737674289583),
 ('fisk', 0.31790786175722846),
 ('exponnorm', 0.31623074094524567),
 ('norminvgauss', 0.3114729052799758),
 ('genlogistic', 0.3080664507004077),
 ('invgauss', 0.29899906183688635),
 ('invgamma', 0.29506196433705945),
 ('powerlognorm', 0.2895207494024023),
 ('geninvgauss', 0.2849773936949238),
 ('recipinvgauss', 0.2797260175033261),
 ('lognorm', 0.2777650109510236),
 ('exponweib', 0.2775878367041207),
 ('foldcauchy', 0.27656498506571453),
 ('genextreme', 0.2763582893274428),
 ('fatiguelife', 0.27273338874936026),
 ('betaprime', 0.26976730275595634),
 ('skewnorm', 0.2662831708535041),
 ('johnsonsb', 0.26351761726187733),
 ('gamma', 0.26010595372009376),
 ('erlang', 0.26007412489804366),
 ('nct', 0.2600250290689049),
 ('pearson3', 0.25992923006432855),
 ('skewcauchy', 0.258793329074

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,
