In [None]:
# Optional one-time setup: uncomment the next line to install researchpy (imported below as `rp`).
#!pip install researchpy

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from scipy import stats
import researchpy as rp

from statsmodels.formula.api import ols

In [4]:
# Read the pre-processed bike-sharing CSV into `df` and preview the first rows.
df = pd.read_csv("bike_sharing_data_processed.csv")
df.head()

Unnamed: 0,season,mnth,holiday,workingday,weathersit,temp,cnt
0,1,1,0,0,2,0.344167,985
1,1,1,0,0,2,0.363478,801
2,1,1,0,1,1,0.196364,1349
3,1,1,0,1,1,0.2,1562
4,1,1,0,1,1,0.226957,1600


In [6]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(731, 7)

In [7]:
# Distinct weather-situation codes present in the data; these define the groups compared below.
df['weathersit'].unique()

array([2, 1, 3], dtype=int64)

In [9]:
# Summary statistics of `cnt` per weather category (group size, mean, spread, quartiles).
df.groupby('weathersit')['cnt'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
weathersit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,463.0,4876.786177,1879.483989,431.0,3636.5,4844.0,6458.5,8714.0
2,247.0,4035.862348,1809.109918,605.0,2674.5,4040.0,5257.5,8362.0
3,21.0,1803.285714,1240.284449,22.0,705.0,1817.0,2424.0,4639.0


**Categories**

*1 - Sunny day* <br>
*2 - Little more clouds*<br>
*3 - Cloudy with thunderstorms*

<h4> Hypothesis</h4>

-  Null hypothesis (H0): The mean rental count (`cnt`) is the same across all three weather categories
-  Alternative hypothesis (H1): At least one weather category has a different mean rental count

df.boxplot(column=['cnt'], by='weathersit', figsize=(12, 8))

More bikes are rented on sunny days than on cloudy or thunderstorm days.

<h4> Calculating one way ANOVA </h4>

In [23]:

# One-way ANOVA: compare the mean of `cnt` across the weather categories.
# Build one Series of `cnt` values per distinct `weathersit` code, taken from
# the data itself rather than from a hard-coded [1, 2, 3] list.
group = [cnt_values for _, cnt_values in df.groupby('weathersit')['cnt']]
f_statistic, p_value = stats.f_oneway(*group)

print("F-Statistic:", f_statistic)
print("P-Value:", p_value)

alpha = 0.05  # significance level
if p_value < alpha:
    print("reject the null hypothesis.")
else:
    # A non-significant result is not evidence that H0 is true, so the
    # correct conclusion is "fail to reject", not "accept".
    print("fail to reject the null hypothesis.")


F-Statistic: 40.06604492024684
P-Value: 3.106317270053755e-17
reject the null hypothesis.


The p-value returned is extremely small (about 3.1e-17), far below the 5% significance level, which indicates that the differences in mean rental count between these 3 categories/samples are statistically significant. Therefore, the weather situation has an impact on the number of bikes rented.

However, this test does not show which specific pairs of groups have different means, i.e., whether groups 1 and 2 differ, groups 2 and 3 differ, or groups 1 and 3 differ. That's why we follow up with another test, **Tukey's Honestly Significant Difference (HSD) test**.

<h3>Tukey's Honestly Significant Difference (HSD) Test</h3>

In [30]:
# Show the frame again before running the post-hoc test.
# NOTE(review): a bare `df` renders the whole frame (truncated by pandas); `df.head()` would be lighter.
df

Unnamed: 0,season,mnth,holiday,workingday,weathersit,temp,cnt
0,1,1,0,0,2,0.344167,985
1,1,1,0,0,2,0.363478,801
2,1,1,0,1,1,0.196364,1349
3,1,1,0,1,1,0.200000,1562
4,1,1,0,1,1,0.226957,1600
...,...,...,...,...,...,...,...
726,1,12,0,1,2,0.254167,2114
727,1,12,0,1,2,0.253333,3095
728,1,12,0,0,2,0.253333,1341
729,1,12,0,0,1,0.255833,1796


In [33]:

from statsmodels.stats.multicomp import MultiComparison
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Pair every `cnt` observation with its `weathersit` group label.
mul_com = MultiComparison(df['cnt'], df['weathersit'])

# Run Tukey's HSD through the MultiComparison object built above, instead of
# leaving it unused and having `pairwise_tukeyhsd` rebuild the same grouping.
# alpha=0.05 is the family-wise error rate (shown as FWER in the printed table).
mul_result = mul_com.tukeyhsd(alpha=0.05)

# Print the pairwise comparison table (mean difference, adjusted p-value,
# confidence bounds and the reject decision for each pair of groups).
print(mul_result)


    Multiple Comparison of Means - Tukey HSD, FWER=0.05    
group1 group2  meandiff  p-adj   lower      upper    reject
-----------------------------------------------------------
     1      2  -840.9238   0.0 -1181.6303  -500.2174   True
     1      3 -3073.5005   0.0 -4038.2458 -2108.7551   True
     2      3 -2232.5766   0.0 -3215.4542 -1249.6991   True
-----------------------------------------------------------


In [34]:
# Tukey's HSD on `cnt` grouped by `weathersit`, at a 5% family-wise error rate.
# NOTE(review): this cell repeats the preceding Tukey HSD cell; one of the two can be dropped.
from statsmodels.stats.multicomp import MultiComparison, pairwise_tukeyhsd

mul_com = MultiComparison(data=df['cnt'], groups=df['weathersit'])
mul_result = pairwise_tukeyhsd(
    endog=df['cnt'],
    groups=df['weathersit'],
    alpha=0.05,
)
print(mul_result)

    Multiple Comparison of Means - Tukey HSD, FWER=0.05    
group1 group2  meandiff  p-adj   lower      upper    reject
-----------------------------------------------------------
     1      2  -840.9238   0.0 -1181.6303  -500.2174   True
     1      3 -3073.5005   0.0 -4038.2458 -2108.7551   True
     2      3 -2232.5766   0.0 -3215.4542 -1249.6991   True
-----------------------------------------------------------


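The table above lists every pairwise comparison between the three groups. In the last column (`reject`), the value is `True` for all three pairs, which means that for each pair we reject the null hypothesis (H0) of equal means and accept the alternative hypothesis: every weather category differs significantly from the other two in mean rental count.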