In [51]:
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

from tqdm import tqdm_notebook

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
# Default size (width, height) applied to every figure drawn in this notebook.
mpl.rcParams['figure.figsize'] = (15, 6)

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(123456)


def gdrive(f):
    """Return the path of file name `f` inside the project's Google Drive data folder."""
    return '/Users/bensteers/Google Drive (bs3639@nyu.edu)/Data Science/ADS Project/1_DATA/' + f


In [2]:
# Load the simplified MTA/weather CSV and discard its 'Unnamed: 0' column
# (presumably the index written out when the CSV was saved -- confirm).
# NOTE: `df` is rebound from a pickle in the next cell, so this frame is only previewed here.
df = (
    pd.read_csv('mta-weather-simplified.csv')
    .drop(['Unnamed: 0'], axis=1)
)
df.head()

Unnamed: 0,STATION,Date,TIMEINT,ENTRIES,Temperature,Humidity,Rain,Snow,Clear,Cloudy
0,1 AV,2016-01-01,1,1407774726,41.275,53.5,0.0,0.0,0.0,1.0
1,1 AV,2016-01-01,1,1407774726,34.0,48.5,0.0,0.0,0.75,0.25
2,1 AV,2016-01-01,1,1407774726,35.775,52.5,0.0,0.0,1.0,0.0
3,1 AV,2016-01-01,1,1407774726,35.275,55.0,0.0,0.0,0.0,1.0
4,103 ST,2016-01-01,1,124276833,41.275,53.5,0.0,0.0,0.0,1.0


In [92]:
# Load the normalized dataset, then discard rows containing missing values.
# The shape is printed before and after so the number of dropped rows is visible.
df_raw = pd.read_pickle('final_dataset_norm.pkl')
print(df_raw.columns)
print(df_raw.shape)

df = df_raw.dropna()
print(df.shape)
df.head()

Index([u'STATION', u'Date', u'TIMEINT', u'ENTRIES_hourly', u'Clear', u'Fog',
       u'Haze', u'Heavy Rain', u'Heavy Snow', u'Humidity',
       u'Light Freezing Fog', u'Light Freezing Rain', u'Light Rain',
       u'Light Snow', u'Mostly Cloudy', u'Overcast', u'Partly Cloudy', u'Rain',
       u'Scattered Clouds', u'Snow', u'Temperature', u'datetime',
       u'avg_entries', u'norm_entries'],
      dtype='object')
(765614, 24)
(750098, 24)


Unnamed: 0,STATION,Date,TIMEINT,ENTRIES_hourly,Clear,Fog,Haze,Heavy Rain,Heavy Snow,Humidity,...,Mostly Cloudy,Overcast,Partly Cloudy,Rain,Scattered Clouds,Snow,Temperature,datetime,avg_entries,norm_entries
0,1 AV,2016-01-01,1,3056.0,0.0,0.0,0.0,0.0,0.0,53.5,...,0.0,1.0,0.0,0.0,0.0,0.0,41.275,2016-01-01 00:00:00,1163.698113,1892.301887
1,1 AV,2016-01-01,2,1070.0,0.0,0.0,0.0,0.0,0.0,58.25,...,0.0,1.0,0.0,0.0,0.0,0.0,39.45,2016-01-01 04:00:00,1276.981132,-206.981132
2,1 AV,2016-01-01,3,972.0,0.0,0.0,0.0,0.0,0.0,53.5,...,0.0,0.75,0.25,0.0,0.0,0.0,39.675,2016-01-01 08:00:00,5167.962264,-4195.962264
3,1 AV,2016-01-01,4,3031.0,0.0,0.0,0.0,0.0,0.0,49.25,...,0.0,1.0,0.0,0.0,0.0,0.0,39.45,2016-01-01 12:00:00,4736.396226,-1705.396226
4,1 AV,2016-01-01,5,3812.0,0.75,0.0,0.0,0.0,0.0,48.5,...,0.0,0.0,0.25,0.0,0.0,0.0,37.45,2016-01-01 16:00:00,6959.692308,-3147.692308


In [93]:
# Columns that are not used as model inputs (identifiers, timestamps, entry counts).
nonfeatures = ['Date', 'STATION', 'TIMEINT', 'ENTRIES_hourly', 'datetime', 'avg_entries', 'norm_entries']

# Whatever remains after removing the non-features and the two continuous
# readings is the set of weather-condition columns, in their original order.
conditions = df.columns.drop(nonfeatures + ['Temperature', 'Humidity'])
conditions

Index([u'Clear', u'Fog', u'Haze', u'Heavy Rain', u'Heavy Snow',
       u'Light Freezing Fog', u'Light Freezing Rain', u'Light Rain',
       u'Light Snow', u'Mostly Cloudy', u'Overcast', u'Partly Cloudy', u'Rain',
       u'Scattered Clouds', u'Snow'],
      dtype='object')

In [94]:
# Collapse the detailed weather conditions into four groups.
# Each group is the sum of the detailed columns listed for it:
#   Rain   <- Rain, Heavy Rain, Light Rain, Light Freezing Rain
#   Snow   <- Snow, Heavy Snow, Light Snow
#   Cloudy <- Partly Cloudy, Mostly Cloudy, Overcast, Fog, Haze, Light Freezing Fog
#   Clear  <- Clear, Scattered Clouds
df = df.assign(
    Rain=df['Rain'] + (df['Heavy Rain'] + df['Light Rain'] + df['Light Freezing Rain']),
    Snow=df['Snow'] + (df['Heavy Snow'] + df['Light Snow']),
    Cloudy=(df['Partly Cloudy'] + df['Mostly Cloudy']
            + df['Overcast'] + df['Fog'] + df['Haze'] + df['Light Freezing Fog']),
    Clear=df['Clear'] + df['Scattered Clouds'],
)

# Keep only the four merged groups; every other original condition column is removed.
merged_groups = ['Rain', 'Snow', 'Cloudy', 'Clear']
df = df.drop([col for col in conditions if col not in merged_groups], axis=1)
df.head()

Unnamed: 0,STATION,Date,TIMEINT,ENTRIES_hourly,Clear,Humidity,Rain,Snow,Temperature,datetime,avg_entries,norm_entries,Cloudy
0,1 AV,2016-01-01,1,3056.0,0.0,53.5,0.0,0.0,41.275,2016-01-01 00:00:00,1163.698113,1892.301887,1.0
1,1 AV,2016-01-01,2,1070.0,0.0,58.25,0.0,0.0,39.45,2016-01-01 04:00:00,1276.981132,-206.981132,1.0
2,1 AV,2016-01-01,3,972.0,0.0,53.5,0.0,0.0,39.675,2016-01-01 08:00:00,5167.962264,-4195.962264,1.0
3,1 AV,2016-01-01,4,3031.0,0.0,49.25,0.0,0.0,39.45,2016-01-01 12:00:00,4736.396226,-1705.396226,1.0
4,1 AV,2016-01-01,5,3812.0,0.75,48.5,0.0,0.0,37.45,2016-01-01 16:00:00,6959.692308,-3147.692308,0.25


In [95]:
# Save the merged weather groups, the continuous readings, the timestamp and the
# target to a pickle at the path built by gdrive().
export_columns = ['Rain', 'Snow', 'Cloudy', 'Clear', 'Temperature', 'Humidity', 'datetime', 'norm_entries']
df[export_columns].to_pickle(gdrive('final_dataset_norm2.pkl'))

In [96]:
# Target: the 'norm_entries' column.
y = df['norm_entries']
# Features: every column of df that is not listed in `nonfeatures`.
X = df.drop(nonfeatures, axis=1)

print(X.shape)
print(y.head())
X.head()

(750098, 6)
0    1892.301887
1    -206.981132
2   -4195.962264
3   -1705.396226
4   -3147.692308
Name: norm_entries, dtype: float64


Unnamed: 0,Clear,Humidity,Rain,Snow,Temperature,Cloudy
0,0.0,53.5,0.0,0.0,41.275,1.0
1,0.0,58.25,0.0,0.0,39.45,1.0
2,0.0,53.5,0.0,0.0,39.675,1.0
3,0.0,49.25,0.0,0.0,39.45,1.0
4,0.75,48.5,0.0,0.0,37.45,0.25


In [48]:
# Sanity check: show every row of X that holds at least one NaN or infinite value.
all_finite = np.isfinite(X).all(axis=1)
X[~all_finite]

Unnamed: 0,Clear,Humidity,Rain,Snow,Temperature,Cloudy


In [81]:
# Baseline: Ridge regression constructed with no arguments, fit on the full X and y.
rm = Ridge().fit(X, y)

In [82]:
# Coefficients of the fitted Ridge model, one per column of X.
rm.coef_

array([ -34.12076069,   -1.31506125,    1.84988927, -439.2289521 ,
         -0.60570508,  -85.96788072])

In [83]:
# In-sample score: evaluated on the same X and y the model was fit on (no held-out data).
rm.score(X, y)

0.0021532798738603809

In [84]:
from sklearn.model_selection import train_test_split

In [87]:
# Hold out 33% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [88]:
# Small neural network: one hidden layer of 20 units, trained on the training split only.
nn = MLPRegressor(hidden_layer_sizes=(20,)).fit(X_train, y_train)

<IPython.core.display.Javascript object>

In [91]:
# Score the fitted network on the held-out test split.
nn.score(X_test, y_test)

0.0066630143088315386

In [102]:
# Candidate regressors to compare, as (short label, unfitted estimator) pairs.
# Every estimator is built with its default arguments except the linear-kernel SVR.
models = [
    ('LinR', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('Dtree', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('MLP', MLPRegressor()),  # 22s, did not converge w/ 10,000 samples
    ('lin SVR', SVR(kernel="linear")),
    # ('rbf SVR', SVR()),
]
# Labels only, in the same order as `models`.
names = [pair[0] for pair in models]

In [None]:
# View cross validation results - mean and standard deviation of the fold scores.
# cross_val_score is called without `scoring`, so each fold is scored with the
# estimator's own score() method; for these regressors that is R^2, not accuracy.
try:
    results = []
    for name, model in models:
        t = time.time()
        # One formatted string, so the line prints cleanly whether `print` is a
        # statement (Python 2) or a function.
        print('starting {}'.format(name))
        cv_result = cross_val_score(model, X, y, cv=KFold(n_splits=10), verbose=1, n_jobs=10)
        results.append(cv_result)
        print('{}: {:.4f} (std: {:.4f}) took {:.2f}s'.format(name, cv_result.mean(), cv_result.std(), time.time() - t))
except KeyboardInterrupt:
    # Deliberate: interrupting a long run keeps the models that already finished,
    # and the plot below shows only those.
    pass

# Boxplot comparison of the per-fold scores, one box per completed model.
fig, ax = plt.subplots()
ax.set_title('Algorithm Comparison')
ax.boxplot(results)
ax.set_ylabel('R^2 score')
# Boxes sit at positions 1..len(results); label only the models that completed.
ax.set_xticks(np.arange(len(results)) + 1)
ax.set_xticklabels(names[:len(results)])
fig.savefig('clsf-comparison-{}.png'.format(len(models)));

('starting', 'LinR')


Process PoolWorker-76:
Process PoolWorker-75:
Process PoolWorker-78:
Process PoolWorker-77:
Process PoolWorker-80:
Process PoolWorker-73:
Process PoolWorker-79:
Process PoolWorker-74:
Process PoolWorker-72:
Process PoolWorker-71:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "//anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _

In [101]:
import statsmodels.api as sm
# Ordinary least squares via statsmodels, displayed as a full summary table.
# statsmodels takes the response first (endog) and the design matrix second (exog).
# NOTE(review): no constant column is added to X here. In the rows displayed earlier the
# four condition fractions sum to 1, which would already span an intercept -- confirm
# before adding one.
lm = sm.OLS(endog=y, exog=X).fit()
lm.summary()

NameError: name 'ls' is not defined

In [None]:
# Ridge-style fit through statsmodels: L1_wt=0 selects a pure L2 penalty.
# fit_regularized() returns a regularized-results object that does not implement
# summary(), so the fitted coefficients are shown via .params instead.
# NOTE(review): alpha (the penalty weight) is left at its default, which the statsmodels
# documentation gives as 0, i.e. no shrinkage; pass alpha=... for an actual ridge fit.
lm = sm.OLS(y, X).fit_regularized(L1_wt=0)
lm.params