# Data Analysis: OLS regressions WITHOUT weather variables

## This notebook:
1. Fits several OLS regressions of trip counts on weekday, hour, season, and event type
2. Weather variables are added later in 'Anls_OLS_weather.ipynb'

In [3]:
import numpy as np
import pandas as pd
import geopandas as gpd
import statsmodels.formula.api as smf

In [4]:
# Read the trip table (one row per trip, with an Event_type column) from a CSV
# located relative to the notebook's working directory, then preview it.
TRIPS_CSV = 'merged2_event_CB2016_800m.csv'
df = pd.read_csv(TRIPS_CSV)
df.head()

Unnamed: 0.1,Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,...,usertype,birth year,gender,startdate,stopdate,Event_type,End_Time,weekday,O_date,O_hour
0,0,173,16:03:10,16:06:03,243,Fulton St & Rockwell Pl,40.688226,-73.979382,241,DeKalb Ave & S Portland Ave,...,Subscriber,1971.0,2,2016-01-01,2016-01-01,no-event,,,2016-01-01,16
1,1,136,16:05:54,16:08:11,420,Clermont Ave & Lafayette Ave,40.687645,-73.969689,270,Adelphi St & Myrtle Ave,...,Subscriber,1980.0,1,2016-01-01,2016-01-01,no-event,,,2016-01-01,16
2,2,653,16:13:47,16:24:40,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,278,Concord St & Bridge St,...,Subscriber,1976.0,1,2016-01-01,2016-01-01,no-event,,,2016-01-01,16
3,3,659,16:13:47,16:24:46,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,278,Concord St & Bridge St,...,Subscriber,1985.0,2,2016-01-01,2016-01-01,no-event,,,2016-01-01,16
4,4,1419,16:20:39,16:44:19,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,532,S 5 Pl & S 4 St,...,Subscriber,1993.0,1,2016-01-01,2016-01-01,no-event,,,2016-01-01,16


In [5]:
# Parse the start date once, then derive calendar features from it.
df['startdate'] = pd.to_datetime(df['startdate'])
df['month'] = df['startdate'].dt.month
# weekday is True for Monday-Friday: pandas numbers Saturday/Sunday as 5/6.
# The vectorised isin() replaces a per-row Python list comprehension and
# produces the same boolean column.
df['weekday'] = ~df['startdate'].dt.weekday.isin([5, 6])

In [6]:
# Preview the first five rows to check the month and weekday columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,...,birth year,gender,startdate,stopdate,Event_type,End_Time,weekday,O_date,O_hour,month
0,0,173,16:03:10,16:06:03,243,Fulton St & Rockwell Pl,40.688226,-73.979382,241,DeKalb Ave & S Portland Ave,...,1971.0,2,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1
1,1,136,16:05:54,16:08:11,420,Clermont Ave & Lafayette Ave,40.687645,-73.969689,270,Adelphi St & Myrtle Ave,...,1980.0,1,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1
2,2,653,16:13:47,16:24:40,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,278,Concord St & Bridge St,...,1976.0,1,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1
3,3,659,16:13:47,16:24:46,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,278,Concord St & Bridge St,...,1985.0,2,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1
4,4,1419,16:20:39,16:44:19,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,532,S 5 Pl & S 4 St,...,1993.0,1,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1


In [7]:
# List the distinct event categories present in the data.
df.loc[:, 'Event_type'].unique()

array(['no-event', 'basketball', 'boxing', 'concert', 'other', 'hockey',
       'family'], dtype=object)

In [8]:
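# Disabled: Event_type.unique() above shows no missing values, so no fill is needed.
# If re-enabled, use .loc to avoid chained assignment:
# df.loc[df['Event_type'].isnull(), 'Event_type'] = 'none'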

In [9]:
# Re-inspect the first five rows of the frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,...,birth year,gender,startdate,stopdate,Event_type,End_Time,weekday,O_date,O_hour,month
0,0,173,16:03:10,16:06:03,243,Fulton St & Rockwell Pl,40.688226,-73.979382,241,DeKalb Ave & S Portland Ave,...,1971.0,2,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1
1,1,136,16:05:54,16:08:11,420,Clermont Ave & Lafayette Ave,40.687645,-73.969689,270,Adelphi St & Myrtle Ave,...,1980.0,1,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1
2,2,653,16:13:47,16:24:40,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,278,Concord St & Bridge St,...,1976.0,1,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1
3,3,659,16:13:47,16:24:46,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,278,Concord St & Bridge St,...,1985.0,2,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1
4,4,1419,16:20:39,16:44:19,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,532,S 5 Pl & S 4 St,...,1993.0,1,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1


In [19]:
# define season: months are grouped three at a time into codes 1-4,
# with December wrapping around into season 1 alongside January and February.
months_by_season = {
    1: (12, 1, 2),
    2: (3, 4, 5),
    3: (6, 7, 8),
    4: (9, 10, 11),
}
# Invert to the month -> season lookup that Series.map expects.
season_of_month = {
    month: season
    for season, months in months_by_season.items()
    for month in months
}
df['season'] = df['month'].map(season_of_month)
df.head()

Unnamed: 0.1,Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,...,gender,startdate,stopdate,Event_type,End_Time,weekday,O_date,O_hour,month,season
0,0,173,16:03:10,16:06:03,243,Fulton St & Rockwell Pl,40.688226,-73.979382,241,DeKalb Ave & S Portland Ave,...,2,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1,1
1,1,136,16:05:54,16:08:11,420,Clermont Ave & Lafayette Ave,40.687645,-73.969689,270,Adelphi St & Myrtle Ave,...,1,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1,1
2,2,653,16:13:47,16:24:40,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,278,Concord St & Bridge St,...,1,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1,1
3,3,659,16:13:47,16:24:46,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,278,Concord St & Bridge St,...,2,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1,1
4,4,1419,16:20:39,16:44:19,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,532,S 5 Pl & S 4 St,...,1,2016-01-01,2016-01-01,no-event,,True,2016-01-01,16,1,1


In [27]:
# Aggregate trips into one row per (O_date, Event_type, O_hour, weekday, season)
# combination, with the number of trips in that group stored as `Count`.
#
# as_index=False is deliberately not used: on pandas >= 1.1 it makes size()
# return a DataFrame, so a following reset_index() adds an extra column and a
# six-name column assignment raises ValueError. Series.reset_index(name=...)
# yields exactly these six columns on both old and new pandas.
group_keys = ['O_date', 'Event_type', 'O_hour', 'weekday', 'season']
d = df.groupby(group_keys).size().reset_index(name='Count')
d.head()

Unnamed: 0,O_date,Event_type,O_hour,weekday,season,Count
0,2016-01-01,no-event,16,True,1,8
1,2016-01-01,no-event,17,True,1,7
2,2016-01-01,no-event,18,True,1,4
3,2016-01-01,no-event,19,True,1,2
4,2016-01-01,no-event,20,True,1,2


In [55]:
# Baseline model: Count on the weekday flag, hour, and event type, with
# 'no-event' as the reference level for the event-type dummies.
lm0 = smf.ols(
    formula="Count ~ weekday + O_hour + C(Event_type, Treatment(reference='no-event'))",
    data=d,
).fit()
lm0_summary = lm0.summary()

# Save the summary table; the context manager closes the file even if the write fails.
with open('lm0.txt', 'w') as f:
    f.write(lm0_summary.as_text())

# Keep the summary as the cell's last expression so it is rendered in the notebook.
lm0_summary

In [57]:
# Adds an O_hour x event-type interaction to the baseline specification
# ('no-event' remains the reference level).
lm1 = smf.ols(
    formula="Count ~ weekday + O_hour * C(Event_type, Treatment(reference='no-event'))",
    data=d,
).fit()
lm1_summary = lm1.summary()

# Save the summary table; the context manager closes the file even if the write fails.
with open('lm1.txt', 'w') as f:
    f.write(lm1_summary.as_text())

# Keep the summary as the cell's last expression so it is rendered in the notebook.
lm1_summary

In [58]:
# Same as the hour x event-type interaction model, plus a season main effect.
# NOTE(review): season is an integer code (1-4) and appears without C(), so it
# is presumably fit as a single numeric slope - confirm this is intended
# rather than C(season).
lm2 = smf.ols(
    formula="Count ~ season + weekday + O_hour * C(Event_type, Treatment(reference='no-event'))",
    data=d,
).fit()
lm2_summary = lm2.summary()

# Save the summary table; the context manager closes the file even if the write fails.
with open('lm2.txt', 'w') as f:
    f.write(lm2_summary.as_text())

# Keep the summary as the cell's last expression so it is rendered in the notebook.
lm2_summary

In [59]:
# Full three-way interaction of season, O_hour, and event type (the `*`
# operator also includes all lower-order terms), plus the weekday flag.
# NOTE(review): season appears without C(), so it is presumably treated as
# numeric in every interaction term - confirm this is intended.
lm3 = smf.ols(
    formula="Count ~ weekday + season * O_hour * C(Event_type, Treatment(reference='no-event'))",
    data=d,
).fit()
lm3_summary = lm3.summary()

# Save the summary table; the context manager closes the file even if the write fails.
with open('lm3.txt', 'w') as f:
    f.write(lm3_summary.as_text())

# Keep the summary as the cell's last expression so it is rendered in the notebook.
lm3_summary

## Run regression on Aug-Dec only

In [50]:
# Keep only rows whose O_date month number is above 7 (August through December).
order_month = pd.to_datetime(d['O_date']).dt.month
d2 = d.loc[order_month > 7]

In [60]:
# Three-way interaction specification (season x O_hour x event type, plus the
# weekday flag) fit on the d2 subset instead of the full aggregated table.
# NOTE(review): season appears without C(), so it is presumably treated as
# numeric - confirm this is intended.
lm4 = smf.ols(
    formula="Count ~ weekday + season * O_hour * C(Event_type, Treatment(reference='no-event'))",
    data=d2,
).fit()
lm4_summary = lm4.summary()

# Save the summary table; the context manager closes the file even if the write fails.
with open('lm4.txt', 'w') as f:
    f.write(lm4_summary.as_text())

# Keep the summary as the cell's last expression so it is rendered in the notebook.
lm4_summary