In [271]:
import pandas as pd
import numpy as np
from scipy import stats
import pingouin as pg
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [272]:
df = pd.read_csv("/Users/beans/Desktop/Mod-4-Project/data/hour.csv")

df['season'] = df['season'].replace(1,'winter')
df['season'] = df['season'].replace(2,'spring')
df['season'] = df['season'].replace(3,'summer')
df['season'] = df['season'].replace(4,'fall')


In [306]:
# STATISTICAL ANALYSIS

# Task 1

# Do average hourly rides differ between working days and non-working days?

# H 0 : Average hourly working day rides do not differ from average hourly non-working day rides.

# H 1 : Average hourly working day rides differ from average hourly non-working day rides.

alpha = 0.05


#filtering to get averge rides for both non-working and workingdays
workingday = df[df['workingday'] == 1]
notworkingday = df[df['workingday'] == 0]


randomworking = workingday.sample(5000)
randomnotworking = notworkingday.sample(5000)

workingmean = randomworking.groupby('hr')['cnt'].mean().reset_index()
notworkingmean = randomnotworking.groupby('hr')['cnt'].mean().reset_index()


# stats ttest
test = stats.ttest_ind(workingmean['cnt'],notworkingmean['cnt'], equal_var=False,alternative='two-sided')

test

# pinguoin ttest

ping = pg.ttest(x=workingmean['cnt'],y=notworkingmean['cnt'],confidence=1-alpha)
ping

# BOTH TESTS SAY THERE IS NO SIGNIFICANT DIFFERENCE BETWEEN THE RIDER AVERAGES OF WORKING DAYS AND NOT WORKING DAYS

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.213218,46,two-sided,0.832099,"[-74.91, 92.66]",0.061551,0.293,0.055009


Our two-sided two sample ttest indicates there is no statistically significant difference between hourly working day rides and hourly non-working day rides, thus we fail to reject the Null Hypothesis.

In [274]:
# Task 2

# Do mean hourly rides differ across categories of multi-level categorical variables such as
# season or weather condition (choose one)? If you find a difference, describe the appropriate
# post-hoc (after the test what other tests would you do) approach and what it would tell
# stakeholders.

# H 0 : There is no difference between hourly ride means across seasons.

# H 1 : There is a difference between houry ride means across seasons.

alpha = 0.05

#filtering to separate seasons
winter = df[df['season'] == 'winter']
spring = df[df['season'] == 'spring']
summer = df[df['season'] == 'summer']
fall = df[df['season'] == 'fall']

wintermean = winter.groupby('hr')['cnt'].mean().reset_index()
springmean = spring.groupby('hr')['cnt'].mean().reset_index()
summermean = summer.groupby('hr')['cnt'].mean().reset_index()
fallmean = fall.groupby('hr')['cnt'].mean().reset_index()

wintermean = wintermean.rename(columns={'cnt':'wintercount'})
springmean = springmean.rename(columns={'cnt':'springcount'})
summermean = summermean.rename(columns={'cnt':'summercount'})
fallmean = fallmean.rename(columns={'cnt':'fallcount'})

#merging all seasons to avoid any issues when reshaping dataframe
swseasons = pd.merge(wintermean,springmean,on='hr',how='left')
sfseasons = pd.merge(summermean,fallmean,on='hr',how='left')

fullseasons = pd.merge(swseasons,sfseasons,on='hr',how='left')
fullseasons = fullseasons.drop('hr',axis=1)

# ANOVA TEST

f_statistic, p_value = f_oneway(fullseasons['wintercount'],fullseasons['springcount'],fullseasons['summercount'],fullseasons['fallcount'])

print('P-Value is:',p_value)
print('F-statistic is :',f_statistic)
# the p_value shows at least one group's mean is statistically different from the others

# RESHAPING FOR TUKEY'S TEST

reshaped_seasons = pd.melt(fullseasons, value_vars = ['wintercount', 'springcount','summercount','fallcount'], var_name='seasons', value_name='mean_count')

tukey_results = pairwise_tukeyhsd(endog=reshaped_seasons['mean_count'], groups = reshaped_seasons['seasons'], alpha=alpha)

print(tukey_results)

P-Value is: 0.011211790379078128
F-statistic is : 3.9091771336107466
       Multiple Comparison of Means - Tukey HSD, FWER=0.05        
   group1      group2    meandiff p-adj    lower    upper   reject
------------------------------------------------------------------
  fallcount springcount    9.4842  0.995  -92.8854 111.8537  False
  fallcount summercount   37.1261 0.7785  -65.2434 139.4956  False
  fallcount wintercount   -89.223 0.1101 -191.5925  13.1465  False
springcount summercount   27.6419 0.8943  -74.7276 130.0114  False
springcount wintercount  -98.7072  0.063 -201.0767   3.6623  False
summercount wintercount -126.3491 0.0092 -228.7186 -23.9796   True
------------------------------------------------------------------


After conducting an ANOVA test to determine whether one of our categories' means is statistically different from the others, our p-value was less than our alpha. This would mean we decide to reject the Null Hypothesis and figure out there is a difference between at least one of our seasons' hourly ride mean. The next steps would be to perform a post-hoc test, such as Tukey's, to determine which combination of seasons has the statistically significant difference. After conducting our Tukey's test, I was able to find that our summer mean bike rider count is statistically different than our winter mean bike rider count, which makes sense as bike riding is far less popular over the winter than it is over the summer.

In [275]:
#TASK 3


#primary metric is "Average hourly rides (cnt) during 17:00–19:00 on working days when weather is good."
eligibledf = df[(df['workingday'] == 1) & (df['hr'].isin({17,18,19})) & ((df['weathersit'].isin({1,2})) & (df['hum'] <= 0.7))]

eligibledf['dteday'] = pd.to_datetime(eligibledf['dteday'])

#windows
#Pre (Baseline): 2012-08-04 → 2012-08-31 (inclusive)

pre = eligibledf[(eligibledf['dteday'] <= '2012-08-31') & (eligibledf['dteday'] > '2012-08-04')]

#Post (Feature On): 2012-09-01 → 2012-09-28 (inclusive)

post = eligibledf[(eligibledf['dteday'] <= '2012-09-28') & (eligibledf['dteday'] > '2012-09-01')]

#print of both pre and post
print(pre.groupby('weekday')['hr'].count())
print(post.groupby('weekday')['hr'].count())




weekday
1    10
2    10
3     9
4    12
5    12
Name: hr, dtype: int64
weekday
1     7
2     8
3    11
4    11
5    11
Name: hr, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eligibledf['dteday'] = pd.to_datetime(eligibledf['dteday'])


In [276]:
# MAKE THEM FAIR
rng = np.random.default_rng(16)


#GROUP A
monday = rng.choice(pre[pre['weekday'] == 1],size = 7,replace=False)
tuesday = rng.choice(pre[pre['weekday'] == 2],size= 8,replace=False)
wednesday = rng.choice(pre[pre['weekday'] == 3],size=9,replace=False)
thursday = rng.choice(pre[pre['weekday'] == 4],size=11,replace=False)
friday = rng.choice(pre[pre['weekday'] == 5],size=11,replace=False)

mdf = pd.DataFrame(monday)
tdf = pd.DataFrame(tuesday)
wdf = pd.DataFrame(wednesday)
thdf = pd.DataFrame(thursday)
fdf = pd.DataFrame(friday)

mt = mdf.merge(tdf,how='outer')
wth = wdf.merge(thdf,how='outer')
mtwth = mt.merge(wth, how='outer')
groupa = mtwth.merge(fdf,how='outer')

groupa = groupa.rename(columns={0:'instant', 1:'dteday', 2:'season', 3:'yr', 4:'mnth', 5:'hr', 6:'holiday', 7:'weekday',
                       8:'workingday', 9:'weathersit', 10:'temp', 11:'atemp', 12:'hum', 13:'windspeed',
                       14:'casual', 15:'registered', 16:'cnt'})

#GROUP B
wednesdayb = rng.choice(post[post['weekday'] == 3],size=9,replace=False)

wbdf = pd.DataFrame(wednesdayb)

wbdf = wbdf.rename(columns={0:'instant', 1:'dteday', 2:'season', 3:'yr', 4:'mnth', 5:'hr', 6:'holiday', 7:'weekday',
                       8:'workingday', 9:'weathersit', 10:'temp', 11:'atemp', 12:'hum', 13:'windspeed',
                       14:'casual', 15:'registered', 16:'cnt'})

groupb = post[post['weekday'].isin({1,2,4,5})]

groupb = groupb.merge(wbdf,how='outer')

#TABLE OF BOTH
print(groupa.groupby('weekday').agg({'hr':'count','weathersit':'count','hum':'mean'}))
print(groupb.groupby('weekday').agg({'hr':'count','weathersit':'count','hum':'mean'}))

         hr  weathersit       hum
weekday                          
1         7           7  0.534286
2         8           8   0.55125
3         9           9  0.526667
4        11          11  0.423636
5        11          11  0.489091
         hr  weathersit       hum
weekday                          
1         7           7  0.397143
2         8           8   0.46875
3         9           9      0.48
4        11          11  0.611818
5        11          11  0.566364


In [305]:
# STATISTICAL TEST

# H 0 : There is no difference in average total bike count between Group A (PRE) and Group B (POST)

# H 1 : There is a difference in average total bike count between Group A (PRE) and Group B (POST)

alpha = 0.05

# T TEST
groupa['cnt'] = groupa['cnt'].astype(float)
groupb['cnt'] = groupb['cnt'].astype(float)
pg.ttest(x=groupa['cnt'],y=groupb['cnt'],confidence=1-alpha)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.971522,90,two-sided,0.051736,"[-111.47, 0.43]",0.411091,1.192,0.496257


With a p-value over 0.05, technically it is not statistically significant