# NYU PUI - HOMEWORK 4 - Part 2 Citibikes

Submitted by Zoe Martiniak (zem232)

## Citi Bikes IDEA: 
#### Younger people are more likely to use Citi Bike in the evenings.

## NULL HYPOTHESIS:
#### The daily proportion of young people (the younger half of the population) is the same or higher than the proportion of the older people (the older half of the population) biking in the evenings (after 7 pm).

# _$H_0$_ : $\frac{Y_{\mathrm{evening}}}{Y_{\mathrm{total}}} - \frac{O_{\mathrm{evening}}}{O_{\mathrm{total}}}>= 0 $
# _$H_1$_ : $\frac{Y_{\mathrm{evening}}}{Y_{\mathrm{total}}} - \frac{O_{\mathrm{evening}}}{O_{\mathrm{total}}} < 0$


#### I will use a significance level  $\alpha=0.05$

#### By calculating the daily proportion for a single month, I will obtain ~30 trials to test. Therefore, a t-test will suffice.

In [1]:
from __future__ import print_function
__author__ = "Zoe Martiniak, CUSP NYU 2018"
import numpy as np
import pandas as pd
import matplotlib
font = {'family' : 'normal',
        'weight' : 'bold',
        'size'   : 88}

matplotlib.rc('font', **font)
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
np.random.seed(999)
import warnings
warnings.filterwarnings(action='once')
import os
import re
from datetime import datetime
from copy import deepcopy
from pprint import pprint

In [None]:
# !pip install tqdm

In [None]:
# import time
# from tqdm import *

In [2]:
def getCitiBikeCSV(datestring):
    print("Downloading",datestring)
    if not os.path.isfile(os.getenv("PUIDATA") + "/" + datestring + "-citibike-tripdata.zip"):
        if not os.path.isfile(datestring + "-citibike-tripdata.zip"):
            os.system("curl -O https://s3.amazonaws.com/tripdata/" + datestring + "-citibike-tripdata.zip")
        ##  Unzip the folder
        os.system("unzip " + datestring + "-citibike-tripdata.zip")
    ## Move to PUIdata
    os.system("mv " + datestring + "-citibike-tripdata.csv " + os.getenv("PUIDATA"))
    ## Delete zip in directory
    os.system("rm "+ datestring + "-citibike-tripdata.zip")
    ## NOTE: old csv citibike data had a different name structure... and we don't want older data anyway.
    if '2014' in datestring:
        print("Please use a more recent dataset.")
    ### One final check:
    if not os.path.isfile(os.getenv("PUIDATA") + "/" + datestring + "-citibike-tripdata.csv"):
        print ("UH OH! There seems to be something wrong: the file is not there!")
    else:
        print('File is in place, please continue.')

In [3]:
datestring = '201609'
getCitiBikeCSV(datestring)

Downloading 201609
File is in place, please continue.


In [7]:
cb_df = pd.read_csv(os.getenv("PUIDATA") + "/" + datestring + '-citibike-tripdata.csv')

In [8]:
## adding age column
cb_df['age']= 2016-cb_df['birth year']
## cleaning datafile
cb_df.drop(['stoptime', 'start station id', 'start station name', 'start station latitude',
        'start station longitude', 'end station id', 'end station name',
        'end station latitude', 'end station longitude', 'bikeid', 'usertype', 
        'birth year','gender'], axis=1, inplace=True)
cb_df.age.dropna(axis=0,inplace=True)
cb_df['starttime'].dropna(axis=0,inplace=True)

In [9]:
# chopping up data
cb_df = cb_df[::500]
cb_df.describe()

Unnamed: 0,tripduration,age
count,3298.0,2871.0
mean,1070.954215,37.678858
std,7544.933207,11.663844
min,71.0,16.0
25%,407.0,28.0
50%,691.5,35.0
75%,1158.75,46.0
max,420830.0,76.0


In [None]:
# for i in tqdm(cb_df.index):
#    cb_df['date'][i]=pd.to_datetime(cb_df['starttime'][i])

In [10]:
## adding date and time column
cb_df['date'] = pd.to_datetime(cb_df['starttime'])
cb_df.drop(['starttime'],axis=1,inplace=True)
cb_df['after_7pm'] = cb_df['date'].apply(lambda dt: dt.hour >= 19)
cb_df.head(7)

Unnamed: 0,tripduration,age,date,after_7pm
0,975,31.0,2016-09-01 00:00:02,False
500,1388,43.0,2016-09-01 01:05:28,False
1000,187,25.0,2016-09-01 05:50:19,False
1500,456,58.0,2016-09-01 06:32:22,False
2000,373,54.0,2016-09-01 07:00:11,False
2500,107,24.0,2016-09-01 07:24:23,False
3000,270,58.0,2016-09-01 07:46:19,False


In [11]:
am = cb_df['age'].mean()
age_y=cb_df[cb_df.age < am]
age_o=cb_df[cb_df.age > am]

In [12]:
am = cb_df['age'].mean()
# get ix
age_y = cb_df['age']<am
age_o = cb_df['age']>am
# make boolean flag series'
night = cb_df['date'].apply(lambda dt: dt.hour >= 19)
day = cb_df['date'].apply(lambda dt: dt.hour < 19)

# group by date and then count rows to get trip counts
# total "young people" rides = age_y
_tot_y = cb_df[age_y].groupby(cb_df[age_y]['date'].map(lambda x: x.date())).count()
# total "old people" rides = age_o
_tot_o = cb_df[age_o].groupby(cb_df[age_o]['date'].map(lambda x: x.date())).count()
# isolating young people starting trips after 7pm = age_y & night
_y_7pm_a = cb_df[age_y & night].groupby(cb_df[age_y & night]['date'].map(lambda x: x.date())).count()
# isolating old people starting trips after 7pm = age_o & night
_o_7pm_a = cb_df[age_o & day].groupby(cb_df[age_o & day]['date'].map(lambda x: x.date())).count()

In [13]:
# using _df_cnts, merge the dataframe one by one
_df_cnts = pd.DataFrame(index=_tot_y.index)
_columns = _tot_y.columns


for df in ['_tot_y', '_tot_o', '_y_7pm_a', '_o_7pm_a']:
    _df_cnts = _df_cnts.merge(
        globals()[df] # get dataframe from global frame
        , how='inner'
        , left_index=True
        , right_index=True
        , sort=True
        , suffixes=['', df]
    )
    _df_cnts.drop(
        columns=[c for c in _df_cnts.columns if not c.startswith('after_7pm') or c.startswith('cnt')]
        , inplace=True
    )
    _df_cnts.rename(
        columns = {
            'after_7pm_%s' % df: 'cnt_%s' % df 
        }
        , inplace=True
    )

_df_cnts.rename(
        columns = {
            'after_7pm': 'total_young'
            , 'after_7pm_tot_o': 'total_old'
            , 'after_7pm_y_7pm_a': 'total_young_after_7pm'
            , 'after_7pm_o_7pm_a': 'total_old_after_7pm'
        }
        , inplace=True
    )
_df_cnts.head(5)

Unnamed: 0_level_0,total_young,total_old,total_young_after_7pm,total_old_after_7pm
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-09-01,51,25,12,18
2016-09-02,49,42,6,37
2016-09-03,42,21,9,18
2016-09-04,36,21,10,19
2016-09-05,47,26,11,23


In [14]:
_df_cnts['young_proportion'] = _df_cnts['total_young_after_7pm'] / _df_cnts['total_young']
_df_cnts['old_proportion'] = _df_cnts['total_old_after_7pm'] / _df_cnts['total_old']

In [15]:
_columns_i_want = ['young_proportion', 'old_proportion']
_df_cnts.drop(columns=[c for c in _df_cnts.columns if c not in _columns_i_want], inplace=True)
_df_cnts.head(3)

Unnamed: 0_level_0,young_proportion,old_proportion
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-09-01,0.235294,0.72
2016-09-02,0.122449,0.880952
2016-09-03,0.214286,0.857143


#### Testing Hypothesis: The *daily* proportion of young people (the younger half of the population) is the same or higher than the proportion of the older people (the older half of the population) biking in the evenings (after 7 pm).

# _$H_0$_ : $\frac{Y_{\mathrm{evening}}}{Y_{\mathrm{total}}} - \frac{O_{\mathrm{evening}}}{O_{\mathrm{total}}}<= 0 $
# _$H_1$_ : $\frac{Y_{\mathrm{evening}}}{Y_{\mathrm{total}}} - \frac{O_{\mathrm{evening}}}{O_{\mathrm{total}}} > 0$

#### My test will be conducted with **95% significance level**
Since we have only 30 samples, I will be performing t-test for the difference between means of two samples