In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tqdm

from fitter import Fitter
from scipy.stats import exponnorm, erlang, gennorm


In [34]:
def load_data(column):
    stream = []
    days = range(1, 23) # 1, 32
    months = range(1, 2) #1, 3
    for month in months:
        month_2_digits = '{month:02}'.format(month=month)
        path = '../COVID19_Tweets_Dataset_2020/Summary_Sentiment/2020_' + month_2_digits + '/'

        for day in days:
            if month == 2 and day > 29:
                continue
            if month == 1 and day < 22:
                continue

            for hour in range(24): # 24
                file_name = path + f'2020_{month_2_digits}_' + '{day:02}'.format(day=day) + '_{hour:02}'.format(hour=hour) + '_Summary_Sentiment.csv'
                stream.append(pd.read_csv(file_name)[column])

    return pd.concat(stream, ignore_index=True), stream

In [35]:
stream_ln, tweets_per_file = load_data('Logits_Negative')

In [36]:
def get_fitted_summary(data):
    f = Fitter(data)
    f.distributions = f.distributions[:3]
    f.fit()
    return f.summary(method='ks_pvalue', plot=False, clf=False, Nbest=110)['ks_pvalue'].values


def process_dist_ks_values():
    dist_ks_values = []
    n_tweets = 0
    n_files = 0
    for tweets_hour in tqdm.tqdm(tweets_per_file):
        n_files += 1
        n_tweets += len(tweets_hour)
        dist_ks_values.append(get_fitted_summary(tweets_hour))
        # for i, row in table.iterrows():
        #     if i in dist_ks_values:
        #         dist_ks_values[i].append(row['ks_pvalue'])
        #     else:
        #         dist_ks_values[i] = [row['ks_pvalue']]

        if n_files % 24 == 0:
            pd.DataFrame(dist_ks_values).to_pickle('../results/compare_fit_files_day_{}.pkl'.format(n_files//24))
            dist_ks_values = []

    if len(dist_ks_values) > 0:
        pd.DataFrame(dist_ks_values).to_pickle('../results/compare_fit_files_day_{}.pkl'.format(n_files/24))

    print(f'Processed {len(tweets_per_file)} files!')
    print(f'Processed {len(tweets_per_file)//24} days!')
    print(f'Processed {n_tweets} tweets!')
    

In [37]:
process_dist_ks_values()

  0%|          | 0/24 [00:00<?, ?it/s]SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 44.86it/s]
SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 32.71it/s]
  8%|▊         | 2/24 [00:00<00:02, 10.64it/s]SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 31.07it/s]
SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 38.02it/s]
 17%|█▋        | 4/24 [00:00<00:01, 10.09it/s]SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 47.28it/s]
SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 39.01it/s]
 25%|██▌       | 6/24 [00:00<00:01, 10.84it/s]SKIPPED _fit distribution (taking more than 30 seconds)
Fittin

Processed 24 files!
Processed 1 days!
Processed 36333 tweets!





In [63]:
pkls = []
for p in range(1,2):
    read_file = f'../results/compare_fit_files_day_{p}.pkl'
    pkls.append(pd.read_pickle(read_file))
df_ks_values = pd.concat(pkls, ignore_index=True)


ks_list = []
for _, i in df_ks_values.iterrows():
    ks_list.append(i)

np.mean([i>=.01 for i in np.concatenate(ks_list)])

0.1388888888888889

In [64]:
dist_ks_values = []
for tweets_hour in tqdm.tqdm(tweets_per_file):
    dist_ks_values.append(get_fitted_summary(tweets_hour))

dist_ks_values
# np.concatenate(dist_ks_values)

  0%|          | 0/24 [00:00<?, ?it/s]SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 37.83it/s]
SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 33.38it/s]
  8%|▊         | 2/24 [00:00<00:02, 10.23it/s]SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 30.54it/s]
SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 23.99it/s]
 17%|█▋        | 4/24 [00:00<00:02,  8.78it/s]SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 45.19it/s]
SKIPPED _fit distribution (taking more than 30 seconds)
Fitting 3 distributions: 100%|██████████| 3/3 [00:00<00:00, 38.60it/s]
 25%|██▌       | 6/24 [00:00<00:01,  9.83it/s]SKIPPED _fit distribution (taking more than 30 seconds)
Fittin

[array([0.00089223, 0.46333077,        nan]),
 array([0.00042777, 0.17386592,        nan]),
 array([0.07333744, 0.32837543,        nan]),
 array([1.90203316e-06, 2.84240166e-03,            nan]),
 array([2.24442977e-12, 1.36628546e-01,            nan]),
 array([6.40937038e-11, 2.53480719e-01,            nan]),
 array([2.45996920e-04, 6.25114049e-01,            nan]),
 array([5.68643081e-09, 3.38635672e-01,            nan]),
 array([3.36756643e-09, 5.72076620e-01,            nan]),
 array([7.80507712e-53, 8.45233375e-19,            nan]),
 array([2.40427150e-34, 9.53413609e-07,            nan]),
 array([1.20210672e-29, 6.92162825e-06,            nan]),
 array([3.78561317e-63, 2.35017243e-02,            nan]),
 array([1.99894717e-83, 3.91800486e-51,            nan]),
 array([1.57536446e-74, 1.64967840e-09,            nan]),
 array([2.72779449e-35, 1.01966754e-09,            nan]),
 array([1.71738182e-39, 7.88013988e-07,            nan]),
 array([1.58240580e-47, 1.97947855e-03,           

In [None]:
# np.mean([i>=.01 for i in get_fitted_ks_values(stream_ln)])