In [1]:
import sys
import numpy as np
import pandas as pd
from scipy import stats

def get_id(date):
    """Return an ISO year/week label such as ``"2012-7"`` for ``date``.

    Parameters
    ----------
    date : datetime.date, datetime.datetime or pandas.Timestamp
        Any object whose ``isocalendar()`` yields ``(year, week, weekday)``.

    Returns
    -------
    str
        ``"<ISO year>-<ISO week>"``. The week number is not zero-padded, and
        the ISO year can differ from the calendar year around New Year
        (2012-01-01 falls in ISO week 52 of 2011).
    """
    # Unpack a single isocalendar() call instead of calling it twice.
    # Positional unpacking also works where isocalendar() returns a plain
    # tuple without .year/.week attributes (Python < 3.9).
    iso_year, iso_week, _ = date.isocalendar()
    return f"{iso_year}-{iso_week}"
    
    

# Layout of the final report: one labelled line per statistical test, each
# p-value rendered with three significant digits (".3g"). Lines are joined
# with newlines and there is no trailing newline.
OUTPUT_TEMPLATE = "\n".join([
    "Initial T-test p-value: {initial_ttest_p:.3g}",
    "Original data normality p-values: {initial_weekday_normality_p:.3g} {initial_weekend_normality_p:.3g}",
    "Original data equal-variance p-value: {initial_levene_p:.3g}",
    "Transformed data normality p-values: {transformed_weekday_normality_p:.3g} {transformed_weekend_normality_p:.3g}",
    "Transformed data equal-variance p-value: {transformed_levene_p:.3g}",
    "Weekly data normality p-values: {weekly_weekday_normality_p:.3g} {weekly_weekend_normality_p:.3g}",
    "Weekly data equal-variance p-value: {weekly_levene_p:.3g}",
    "Weekly T-test p-value: {weekly_ttest_p:.3g}",
    "Mann-Whitney U-test p-value: {utest_p:.3g}",
])






In [2]:


    # ...

# Dry run of the report layout: every field is filled with a placeholder 0,
# so this shows the shape of the output rather than real results.
print(OUTPUT_TEMPLATE.format(**dict.fromkeys(
    (
        "initial_ttest_p",
        "initial_weekday_normality_p",
        "initial_weekend_normality_p",
        "initial_levene_p",
        "transformed_weekday_normality_p",
        "transformed_weekend_normality_p",
        "transformed_levene_p",
        "weekly_weekday_normality_p",
        "weekly_weekend_normality_p",
        "weekly_levene_p",
        "weekly_ttest_p",
        "utest_p",
    ),
    0,
)))


Initial T-test p-value: 0
Original data normality p-values: 0 0
Original data equal-variance p-value: 0
Transformed data normality p-values: 0 0
Transformed data equal-variance p-value: 0
Weekly data normality p-values: 0 0
Weekly data equal-variance p-value: 0
Weekly T-test p-value: 0
Mann-Whitney U-test p-value: 0


In [3]:
# Read the line-delimited JSON file (one record per line) into a DataFrame
# and display it as the cell output.
reddit_counts = 'reddit-counts.json.gz'
counts = pd.read_json(path_or_buf=reddit_counts, lines=True)
counts

Unnamed: 0,date,subreddit,comment_count
0,2012-02-20,newfoundland,7
1,2015-01-26,Manitoba,1
2,2013-09-07,Yukon,2
3,2014-02-15,saskatchewan,5
4,2014-07-06,canada,1652
...,...,...,...
15465,2012-05-21,Quebec,365
15466,2012-05-21,britishcolumbia,4
15467,2013-09-07,britishcolumbia,5
15468,2011-09-10,Quebec,2


In [4]:
# Restrict the data to the 'canada' subreddit and to dates in 2012 or 2013.
# Note that this rebinds `counts` to the filtered frame.
counts = counts[
    (counts['subreddit'] == 'canada')
    & counts['date'].dt.year.isin([2012, 2013])
]
counts

Unnamed: 0,date,subreddit,comment_count
66,2013-03-14,canada,1657
69,2013-07-08,canada,1369
97,2012-07-04,canada,1343
115,2013-03-11,canada,1619
165,2013-09-11,canada,1909
...,...,...,...
15389,2013-01-01,canada,2113
15413,2013-07-27,canada,1070
15430,2012-10-19,canada,1486
15456,2012-01-15,canada,1256


In [5]:
# Weekday sample: rows whose day-of-week is neither 5 nor 6 (pandas numbers
# Monday as 0, so 5 and 6 are Saturday and Sunday).
weekday = counts[~counts['date'].dt.dayofweek.isin([5, 6])]
# reset_index() is only displayed here; `weekday` keeps its original index.
weekday.reset_index()

Unnamed: 0,index,date,subreddit,comment_count
0,66,2013-03-14,canada,1657
1,69,2013-07-08,canada,1369
2,97,2012-07-04,canada,1343
3,115,2013-03-11,canada,1619
4,165,2013-09-11,canada,1909
...,...,...,...,...
517,15357,2013-04-10,canada,2021
518,15363,2013-03-19,canada,1630
519,15389,2013-01-01,canada,2113
520,15430,2012-10-19,canada,1486


In [6]:
# Weekend sample: rows whose day-of-week is 5 or 6 (Saturday or Sunday,
# with Monday numbered 0).
weekend = counts[counts['date'].dt.dayofweek.isin([5, 6])]
# reset_index() is only displayed here; `weekend` keeps its original index.
weekend.reset_index()

Unnamed: 0,index,date,subreddit,comment_count
0,179,2012-02-04,canada,1196
1,251,2012-11-17,canada,1570
2,401,2013-07-14,canada,908
3,479,2013-06-22,canada,984
4,495,2012-07-29,canada,1199
...,...,...,...,...
204,15219,2012-11-04,canada,1772
205,15273,2012-12-09,canada,1350
206,15308,2012-12-02,canada,1725
207,15413,2013-07-27,canada,1070


In [7]:
# Two-sample t-test on the raw daily comment counts, weekday vs. weekend.
# The resulting p-value is small, so the null hypothesis is rejected.
p = stats.ttest_ind(
    weekday.comment_count,
    weekend.comment_count,
).pvalue
p

1.3005502847207912e-58

In [8]:
# Normality test on each raw sample. A p-value above 0.05 would be
# consistent with normally distributed data; neither sample reaches it.
weekday_normal, weekend_normal = (
    stats.normaltest(sample['comment_count']).pvalue
    for sample in (weekday, weekend)
)
print(weekday_normal, weekend_normal)

1.0091137251707994e-07 0.0015209196859635404


In [9]:
# Levene test for equal variances between the two raw samples. A p-value
# above 0.05 would indicate similar variances; the value here falls below it.
equal_var = stats.levene(
    weekday.comment_count,
    weekend.comment_count,
).pvalue
print(equal_var)

0.04378740989202803


In [10]:
# Transform the comment counts before re-testing the assumptions. The square
# root is the transform applied here; np.log, np.exp and squaring are the
# other candidates the author experimented with.
# .assign() returns new frames, so `weekday` and `weekend` stay untouched.
weekday2 = weekday.assign(comment_count=np.sqrt(weekday['comment_count']))
weekend2 = weekend.assign(comment_count=np.sqrt(weekend['comment_count']))

In [11]:
# Re-run the normality test on the square-root-transformed samples.
weekday2_normal, weekend2_normal = (
    stats.normaltest(sample['comment_count']).pvalue
    for sample in (weekday2, weekend2)
)
print(weekday2_normal, weekend2_normal)
# Author's observation: with np.sqrt (and also with np.log) the weekend
# sample passes the normality test, while the weekday sample comes close
# but still falls short.

# Equal-variance (Levene) check on the transformed samples.
equal_var2 = stats.levene(
    weekday2.comment_count,
    weekend2.comment_count,
).pvalue
print(equal_var2)

0.03687221613365365 0.10760562894666933
0.5560544297516696


In [12]:
# Aggregate to weekly means: label every row with its ISO "year-week" id
# (via get_id) and average the daily comment counts within each id,
# separately for the weekday and weekend samples.
#
# comment_count is selected explicitly before aggregating. The frames also
# carry non-numeric columns such as 'subreddit', and GroupBy.mean() on those
# raises TypeError on pandas >= 2.0 instead of silently dropping them as
# older releases did. The result is a one-column frame indexed by id.
weekday3 = (
    weekday
    .assign(id=weekday['date'].apply(get_id))
    .groupby('id')[['comment_count']]
    .mean()
)

weekend3 = (
    weekend
    .assign(id=weekend['date'].apply(get_id))
    .groupby('id')[['comment_count']]
    .mean()
)
weekend3

Unnamed: 0_level_0,comment_count
id,Unnamed: 1_level_1
2011-52,995.0
2012-1,1163.0
2012-10,1353.0
2012-11,1282.0
2012-12,1759.0
...,...
2013-52,1117.5
2013-6,1718.5
2013-7,786.5
2013-8,1863.5


In [13]:
# Normality and equal-variance checks on the weekly means.
weekday3_normal, weekend3_normal = (
    stats.normaltest(sample['comment_count']).pvalue
    for sample in (weekday3, weekend3)
)
print(weekday3_normal, weekend3_normal)
equal_var3 = stats.levene(
    weekday3.comment_count,
    weekend3.comment_count,
).pvalue
print(equal_var3)

# Both assumption checks give acceptable p-values, so the t-test is run on
# the weekly means.
p3 = stats.ttest_ind(
    weekday3.comment_count,
    weekend3.comment_count,
).pvalue
print(p3)

0.3082637390825463 0.15294924717078573
0.20383788083573426
1.3353656052303141e-34


In [14]:
# Mann-Whitney U test on the original (untransformed) daily counts.
# The alternative is stated explicitly: SciPy releases before 1.7 defaulted
# to a deprecated mode that reports half of the two-sided p-value, so
# relying on the default makes the result depend on the installed version.
utest = stats.mannwhitneyu(
    weekday['comment_count'],
    weekend['comment_count'],
    alternative='two-sided',
).pvalue
print(utest)

8.6244532347343e-53


In [15]:
# Final report: fill the template with the p-values computed in the cells
# above and print it.
print(OUTPUT_TEMPLATE.format(**{
    "initial_ttest_p": p,
    "initial_weekday_normality_p": weekday_normal,
    "initial_weekend_normality_p": weekend_normal,
    "initial_levene_p": equal_var,
    "transformed_weekday_normality_p": weekday2_normal,
    "transformed_weekend_normality_p": weekend2_normal,
    "transformed_levene_p": equal_var2,
    "weekly_weekday_normality_p": weekday3_normal,
    "weekly_weekend_normality_p": weekend3_normal,
    "weekly_levene_p": equal_var3,
    "weekly_ttest_p": p3,
    "utest_p": utest,
}))

Initial T-test p-value: 1.3e-58
Original data normality p-values: 1.01e-07 0.00152
Original data equal-variance p-value: 0.0438
Transformed data normality p-values: 0.0369 0.108
Transformed data equal-variance p-value: 0.556
Weekly data normality p-values: 0.308 0.153
Weekly data equal-variance p-value: 0.204
Weekly T-test p-value: 1.34e-34
Mann-Whitney U-test p-value: 8.62e-53
