In [2]:
from __future__  import print_function, division
import pylab as pl
import pandas as pd
import numpy as np
import os
%pylab inline

Populating the interactive namespace from numpy and matplotlib


#### Idea: Women are less likely than men to take long trips, such as trips across boroughs. (Sadly, borough is not in the data and geocoding is probably outside the scope of this assignment, so trip duration is used as a proxy for trip length.)

##### *Null Hypothesis:* The proportion of trips taken by women riders that are more than 15 minutes in duration is the same or higher than the proportion of trips taken by men riders that are more than 15 minutes in duration.

##### *Alternative Hypothesis:* The proportion of trips taken by women riders that are more than 15 minutes in duration is less than the proportion of trips taken by men riders that are more than 15 minutes in duration.

##### *Significance Level:* I will use a significance level $\alpha=0.05$ (i.e., a 95% confidence level)

#### Null Hypothesis Formula

$$\frac{\textrm{Women Rider > 15m Duration}}{\textrm{Women Rider All Trips}} - \frac{\textrm{Male Rider > 15m Duration}}{\textrm{Male Rider All Trips}} \geq 0$$

#### Alternative Hypothesis Formula

$$\frac{\textrm{Women Rider > 15m Duration}}{\textrm{Women Rider All Trips}} - \frac{\textrm{Male Rider > 15m Duration}}{\textrm{Male Rider All Trips}} < 0$$

##### *Data:* I will use three months' worth of Citibike data from March 2015 to May 2015

In [34]:
# Point PUIDATA at a "PUIData" folder under the user's home directory.
home_dir = os.getenv("HOME")
os.environ["PUIDATA"] = "%s/PUIData" % home_dir

In [35]:
# Make the PUIDATA directory the working directory, so the relative file
# names used by later cells resolve inside it.
os.chdir(os.getenv("PUIDATA"))

In [36]:
# Display the current working directory to confirm the chdir took effect.
os.getcwd()

'/nfshome/mrn291/PUIData'

In [37]:
# Monthly trip archives (March, April and May 2015) published on the
# Citibike website; each URL points at one zipped CSV.
march_url, april_url, may_url = (
    'https://s3.amazonaws.com/tripdata/201503-citibike-tripdata.zip',
    'https://s3.amazonaws.com/tripdata/201504-citibike-tripdata.zip',
    'https://s3.amazonaws.com/tripdata/201505-citibike-tripdata.zip',
)

In [38]:
# Gather the three monthly archive URLs so they can be handled in one loop.
url_set = [march_url, april_url, may_url]

In [44]:
# Download each monthly archive into $PUIDATA and extract its CSV there.
# Files that are already present are skipped, so re-running the cell is cheap
# and never stalls on unzip's interactive "replace file?" prompt.
data_dir = os.getenv("PUIDATA")
for url in url_set:
    # Take the archive name from the URL itself rather than a fixed-length
    # slice, which silently breaks if the file-name length ever changes.
    zip_path = os.path.join(data_dir, os.path.basename(url))
    # Assumes each archive extracts to a CSV with the same stem, as the
    # March-May 2015 files do -- TODO confirm before adding other months.
    csv_path = os.path.splitext(zip_path)[0] + ".csv"

    if not os.path.exists(zip_path):
        # -f: fail on HTTP errors instead of saving an error page as a .zip
        # -o: write to an explicit path so the result does not depend on cwd
        status = os.system('curl -f -o "%s" "%s"' % (zip_path, url))
        if status != 0:
            # Drop any partial download so the next run retries cleanly.
            if os.path.exists(zip_path):
                os.remove(zip_path)
            raise RuntimeError("curl failed (status %d) for %s" % (status, url))

    if not os.path.exists(csv_path):
        # -o: overwrite without prompting; -d: extract next to the archive
        status = os.system('unzip -o "%s" -d "%s"' % (zip_path, data_dir))
        if status != 0:
            raise RuntimeError("unzip failed (status %d) for %s" % (status, zip_path))

In [46]:
# List the contents of the current working directory.
os.listdir(os.getcwd())

['201503-citibike-tripdata.zip',
 '201505-citibike-tripdata.zip',
 '201504-citibike-tripdata.zip',
 '201503-citibike-tripdata.csv',
 '201505-citibike-tripdata.csv',
 'gittest_mrn',
 '201504-citibike-tripdata.csv']

In [48]:
# Load the March 2015 trip file into a DataFrame (path is relative to the cwd).
march_data = pd.read_csv('201503-citibike-tripdata.csv')

In [49]:
# Load the April 2015 trip file into a DataFrame (path is relative to the cwd).
april_data = pd.read_csv('201504-citibike-tripdata.csv')

In [50]:
# Load the May 2015 trip file into a DataFrame (path is relative to the cwd).
may_data = pd.read_csv('201505-citibike-tripdata.csv')

In [51]:
# Stack the three monthly frames row-wise into a single frame.
# ignore_index=True discards each month's own row labels and renumbers the
# combined rows from 0.
frames = [march_data, april_data, may_data]
citibike_data = pd.concat(objs=frames, ignore_index=True)

In [52]:
# Preview the first rows of the combined frame.
citibike_data.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,669,3/1/2015 0:00,3/1/2015 0:11,164,E 47 St & 2 Ave,40.753231,-73.970325,477,W 41 St & 8 Ave,40.756405,-73.990026,21409,Subscriber,1987.0,1
1,750,3/1/2015 0:01,3/1/2015 0:14,258,DeKalb Ave & Vanderbilt Ave,40.689407,-73.968855,436,Hancock St & Bedford Ave,40.682166,-73.95399,19397,Subscriber,1968.0,1
2,663,3/1/2015 0:01,3/1/2015 0:12,497,E 17 St & Broadway,40.73705,-73.990093,477,W 41 St & 8 Ave,40.756405,-73.990026,20998,Customer,,0
3,480,3/1/2015 0:02,3/1/2015 0:10,470,W 20 St & 8 Ave,40.743453,-74.00004,491,E 24 St & Park Ave S,40.740964,-73.986022,21565,Subscriber,1983.0,1
4,1258,3/1/2015 0:02,3/1/2015 0:23,345,W 13 St & 6 Ave,40.736494,-73.997044,473,Rivington St & Chrystie St,40.721101,-73.991925,14693,Subscriber,1970.0,1


In [55]:
# Keep only the columns the hypothesis test needs.
# Gender coding: 0 = unknown, 1 = male, 2 = female.
variables = ['tripduration', 'gender']

# Boolean mask over the column labels: True where the label is one of
# `variables`. Selecting with it keeps the frame's original column order.
is_wanted = citibike_data.columns.isin(variables)
citibike_subset = citibike_data.loc[:, is_wanted]

In [56]:
# Preview the first rows of the two-column subset.
citibike_subset.head()

Unnamed: 0,tripduration,gender
0,669,1
1,750,1
2,663,0
3,480,1
4,1258,1


In [None]:
#plot the variables of interest
# Pass the size through the `figsize` keyword. The previous `figsize(15,15)`
# was a function call to the helper that `%pylab inline` injects, which
# (as far as I can tell) changes the default size for every later figure and
# hands its return value to pl.figure() as the figure identifier.
fig = pl.figure(figsize=(15, 15))

