In [92]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import tqdm

from fitter import Fitter
from scipy.stats import exponnorm, erlang, gennorm


In [93]:
def load_data(column):
    stream = []
    days = range(1, 32) # 1, 32
    months = range(1, 2) #1, 3
    for month in months:
        month_2_digits = '{month:02}'.format(month=month)
        path = '../COVID19_Tweets_Dataset_2020/Summary_Sentiment/2020_' + month_2_digits + '/'

        for day in days:
            if month == 2 and day > 29:
                continue
            if month == 1 and day < 22:
                continue

            for hour in range(24): # 24
                file_name = path + f'2020_{month_2_digits}_' + '{day:02}'.format(day=day) + '_{hour:02}'.format(hour=hour) + '_Summary_Sentiment.csv'
                stream.append(pd.read_csv(file_name)[column])

    return pd.concat(stream, ignore_index=True), stream

In [94]:
stream_ln, tweets_per_file = load_data('Logits_Negative')

In [95]:
def get_fitted_summary(data):
    f = Fitter(data)
    f.distributions = f.distributions[:3]
    f.fit()
    return f.summary(method='ks_pvalue', plot=False, clf=False, Nbest=110)

def get_dist_ks_values():
    n_tweets = 0
    dist_ks_values = {}
    n_files = 0
    for tweets_hour in tqdm.tqdm(tweets_per_file):
        n_files += 1
        n_tweets += len(tweets_hour)
        table = get_fitted_summary(tweets_hour)
        for i, row in table.iterrows():
            if i in dist_ks_values:
                dist_ks_values[i].append(row['ks_pvalue'])
            else:
                dist_ks_values[i] = [row['ks_pvalue']]

        if n_files % 24 == 0:
            pd.DataFrame(dist_ks_values).to_pickle('../results/top_distributions_day_{}.pkl'.format(n_files//24))
            dist_ks_values = {}

    if len(dist_ks_values) > 0:
        pd.DataFrame(dist_ks_values).to_pickle('../results/top_distributions_day_{}.pkl'.format(n_files/24))

    print(f'Processed {len(tweets_per_file)} files!')
    print(f'Processed {len(tweets_per_file)//24} days!')
    print(f'Processed {n_tweets} tweets!')

    # return pd.DataFrame(dist_ks_values)

In [96]:
get_dist_ks_values()

  0%|          | 0/240 [00:00<?, ?it/s]SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 39.93it/s]
SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 34.50it/s]
  1%|          | 2/240 [00:00<00:22, 10.51it/s]SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 34.91it/s]
SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 41.65it/s]
  2%|▏         | 4/240 [00:00<00:22, 10.63it/s]SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 48.13it/s]
SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 36.23it/s]
  2%|▎         | 6/240 [00:00<00:21, 10.95it/s]SKIPPED _fit distribution (taking more than 30 seconds)
Fi

Processed 240 files!
Processed 10 days!
Processed 8619013 tweets!





In [99]:
pkls = []
for p in range(1,11):
    read_file = f'../results/top_distributions_day_{p}.pkl'
    pkls.append(pd.read_pickle(read_file))
df_ks_values = pd.concat(pkls, ignore_index=True)

mean_pvalues = {}
for dist in df_ks_values:
    mean_pvalues[dist] = np.nanmean(df_ks_values[dist])
mean_pvalues = dict(sorted(mean_pvalues.items(), key=lambda x:x[1], reverse=True))
mean_pvalues

{'alpha': 0.0121660018143789, 'anglit': 0.0003121056207949744, '_fit': nan}