[View in Colaboratory](https://colab.research.google.com/github/zekunanu/uda-deeplearning-t2/blob/master/pm2.5.ipynb)

In [0]:
# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

#2. Get the file
downloaded_train = drive.CreateFile({'id':'14wOsMdYgen7MJPOnxwfx_d60ZX2sioGQ'}) # replace the id with id of file you want to access
downloaded_train.GetContentFile('train.csv')  
downloaded_test = drive.CreateFile({'id':'1bGyYFNINqVdDAyWw2w0uFrx4xGHNebxQ'}) # replace the id with id of file you want to access
downloaded_test.GetContentFile('test.csv')  


In [0]:
 #3. Read file as panda dataframe
import pandas as pd
import numpy as np
from pylab import *
import matplotlib.pyplot as plot
train_data= pd.read_csv('train.csv')
test_data = pd.read_csv("test.csv")

In [0]:
from sklearn.utils import shuffle
train_data = shuffle(train_data)

In [320]:
train_data.head(5)

Unnamed: 0,day,hour,pm2.5,dew_point,temperature,pressure,wind_speed,snow_time,rain_time,wind_ne,wind_nw,wind_se,wind_cv
1976,2010/5/2,0,75,3,21.0,1001.0,66.59,0,0,0,0,1,0
29166,2014/11/25,5,121,-4,0.0,1023.0,2.67,0,0,0,0,0,1
22862,2013/11/18,14,13,-20,10.0,1025.0,225.33,0,0,0,1,0,0
17439,2013/1/4,16,79,-23,-4.0,1029.0,3.13,0,0,0,0,1,0
13805,2012/5/18,3,35,8,15.0,1008.0,0.89,0,0,0,0,0,1


In [100]:
train_data.describe()

Unnamed: 0,hour,pm2.5,dew_point,temperature,pressure,wind_speed,snow_time,rain_time,wind_ne,wind_nw,wind_se,wind_cv,temp
count,29757.0,29757.0,29757.0,29757.0,29757.0,29757.0,29757.0,29757.0,29757.0,29757.0,29757.0,29757.0,29731.0
mean,11.504049,98.975737,1.884935,12.445643,1016.46374,23.080395,0.065564,0.217562,0.113755,0.315858,0.354102,0.216285,inf
std,6.923673,90.950492,14.263474,12.163588,10.217665,48.043313,0.853558,1.570684,0.317518,0.464865,0.478248,0.411718,
min,0.0,0.0,-40.0,-19.0,992.0,0.45,0.0,0.0,0.0,0.0,0.0,0.0,-32.0
25%,5.0,31.0,-10.0,2.0,1008.0,1.79,0.0,0.0,0.0,0.0,0.0,0.0,-0.0
50%,12.0,74.0,2.0,14.0,1016.0,5.37,0.0,0.0,0.0,0.0,0.0,0.0,1.055556
75%,18.0,138.0,15.0,23.0,1025.0,21.46,0.0,0.0,0.0,1.0,1.0,0.0,1.5
max,23.0,994.0,28.0,41.0,1043.0,565.49,27.0,36.0,1.0,1.0,1.0,1.0,inf


In [0]:
def preprocess_features(data):
    """
    Split day to year,month,Day three columns
    one-hot encoding year,month,date
    """
    data_copy = data.copy()
    data_copy["year"]=pd.to_datetime(data_copy["day"]).dt.strftime('%Y').astype('int64')
    data_copy["Month"]=pd.to_datetime(data_copy["day"]).dt.strftime('%m').astype('int64')
    data_copy["Day"]=pd.to_datetime(data_copy["day"]).dt.strftime('%d').astype('int64')
    data_copy["Season"]= pd.cut(data_copy["Month"],4)
    data_copy["hourbin"]= pd.cut(data_copy["hour"].astype("int64"),4)
    #data_copy=data_copy.drop("day",axis=1)
    #print(data_copy.columns)
    dummy_fields = ['Season','hourbin']
    for each in dummy_fields:
        dummies = pd.get_dummies(data_copy[each], prefix=each)
        data_copy = pd.concat([data_copy, dummies], axis=1)

   
    fields_to_drop =["day",'year','Season','hourbin']
    
    data_copy = data_copy.drop(fields_to_drop, axis=1)
    return data_copy
def preprocess_targets(data):
    output_targets=pd.DataFrame()
    output_targets["pm2.5"] = data["pm2.5"]
    return output_targets
stored_scale={}
def linear_scale(data,normalize_cols):
    data_copy = data.copy()
    for col in normalize_cols:

        min_val = data_copy[col].min()
        max_val = data_copy[col].max()
        stored_scale[col]=(max_val,min_val)
        scale = (max_val - min_val) / 2.0
        data_copy[col]=data_copy[col].apply(lambda x:((x - min_val) / scale) - 1.0)
    return data_copy
normalize_cols =['temperature','dew_point','Month','Day']
logarithm_cols=['wind_speed']
restrict_cols =['snow_time', 'rain_time']
def logarithm_dat(data,logarithm_cols):
    data_copy = data.copy()
    for col in logarithm_cols:
        data_copy[col]=np.log((data_copy[col])+1)
        
    return data_copy
def restrict_data(data,restrict_cols,min):
    data_copy = data.copy()
    for col in restrict_cols:
        data_copy[col] = np.minimum(data_copy[col],min)
    return data_copy


normalize_train = logarithm_dat(train_data,logarithm_cols)
normalize_train = restrict_data(normalize_train,restrict_cols,3)
normalize_train = preprocess_features(normalize_train).drop("pm2.5",axis=1)
normalize_train['pressure'] = normalize_train['pressure']/1000.0
X =linear_scale(normalize_train,normalize_cols) 
y = preprocess_targets(train_data)["pm2.5"]


In [0]:
def scale_test(data):
    data_copy = data.copy()
    for col,(max_val,min_val) in stored_scale.items():
        scale = (max_val - min_val) / 2.0
        data_copy[col]=data_copy[col].apply(lambda x:((x - min_val) / scale) - 1.0)
    return data_copy

In [0]:
normalize_test = logarithm_dat(test_data,logarithm_cols)
normalize_test = restrict_data(normalize_test,restrict_cols,3)
normalize_test = preprocess_features(normalize_test)
normalize_test['pressure'] = normalize_test['pressure']/1000.0
test_X =scale_test(normalize_test) 


In [191]:
normalize_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29757 entries, 0 to 29756
Data columns (total 23 columns):
hour                    29757 non-null int64
dew_point               29757 non-null int64
temperature             29757 non-null float64
pressure                29757 non-null float64
wind_speed              29757 non-null float64
snow_time               29757 non-null int64
rain_time               29757 non-null int64
wind_ne                 29757 non-null int64
wind_nw                 29757 non-null int64
wind_se                 29757 non-null int64
wind_cv                 29757 non-null int64
Month                   29757 non-null int64
Day                     29757 non-null int64
hourbin                 29757 non-null category
year_2010               29757 non-null uint8
year_2011               29757 non-null uint8
year_2012               29757 non-null uint8
year_2013               29757 non-null uint8
year_2014               29757 non-null uint8
Season_(0.989, 3.75]    297

In [220]:
print(preprocess_features(train_data).corr())

                             hour     pm2.5  dew_point  temperature  pressure  \
hour                     1.000000 -0.027619  -0.023740     0.148354 -0.045080   
pm2.5                   -0.027619  1.000000   0.161476    -0.094103 -0.038485   
dew_point               -0.023740  0.161476   1.000000     0.822720 -0.771440   
temperature              0.148354 -0.094103   0.822720     1.000000 -0.824131   
pressure                -0.045080 -0.038485  -0.771440    -0.824131  1.000000   
wind_speed               0.067023 -0.247125  -0.283657    -0.148315  0.173640   
snow_time               -0.002504  0.022895  -0.040184    -0.103848  0.076140   
rain_time               -0.008094 -0.057911   0.124639     0.046292 -0.083456   
wind_ne                 -0.063398 -0.035933  -0.030212    -0.055981  0.058472   
wind_nw                 -0.136038 -0.202180  -0.326668    -0.266245  0.226180   
wind_se                  0.216472  0.087972   0.266437     0.303565 -0.247223   
wind_cv                 -0.0

In [0]:
minimal=["dew_point","temperature","wind_speed","wind_nw","wind_se","wind_cv","Season_(0.989, 3.75]"]

In [239]:
rfc = RandomForestRegressor(max_features="auto",n_estimators=100)
rfc.fit(X_train[minimal],y_train)
rfc.score(X_test[minimal],y_test)

0.2934678112146306

In [168]:
X_train.describe()

Unnamed: 0,hour,dew_point,temperature,pressure,wind_speed,snow_time,rain_time,wind_ne,wind_nw,wind_se,...,year_2013,year_2014,"Season_(0.989, 3.75]","Season_(3.75, 6.5]","Season_(6.5, 9.25]","Season_(9.25, 12.0]","datebin_(-0.023, 5.75]","datebin_(5.75, 11.5]","datebin_(11.5, 17.25]","datebin_(17.25, 23.0]"
count,22317.0,22317.0,22317.0,22317.0,22317.0,22317.0,22317.0,22317.0,22317.0,22317.0,...,22317.0,22317.0,22317.0,22317.0,22317.0,22317.0,22317.0,22317.0,22317.0,22317.0
mean,11.525832,0.230232,0.047596,1.016503,2.176827,0.025586,0.09961,0.113008,0.317919,0.353049,...,0.206255,0.207331,0.247121,0.250168,0.247972,0.254739,0.24972,0.247748,0.251019,0.251512
std,6.927047,0.42095,0.405558,0.010235,1.319161,0.265275,0.501189,0.31661,0.465678,0.477929,...,0.404625,0.405404,0.431347,0.433119,0.431845,0.435724,0.432861,0.431715,0.433609,0.433892
min,0.0,-1.0,-1.0,0.992,0.371564,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,-0.117647,-0.3,1.008,1.026042,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,12.0,0.235294,0.1,1.016,1.851599,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,18.0,0.617647,0.4,1.025,3.131137,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
max,23.0,1.0,1.0,1.042,6.339459,3.0,3.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [0]:

from sklearn.cross_validation import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=33,test_size=0.25)


In [335]:
from sklearn.linear_model import SGDRegressor
sgdr = SGDRegressor()
sgdr.fit(X_train,y_train)
sgdr.score(X_test,y_test)



0.3094360080830917

In [336]:






from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV
rfc = RandomForestRegressor()

param_grid = { 
    
    'n_estimators': [400],
    'max_features': ['auto']
}

CV_rfr = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 3)
CV_rfr.fit(X, y)
print(CV_rfr.score(X_test,y_test))
print(CV_rfr.score(X_train,y_train))
print(CV_rfr.best_params_)

0.9785521785866266
0.976579097366061
{'max_features': 'auto', 'n_estimators': 400}


In [337]:
np.sum(CV_rfr.predict(X_test)-y_test)

4347.533416666667

In [306]:
X_test.head(5)

Unnamed: 0,hour,dew_point,temperature,pressure,wind_speed,snow_time,rain_time,wind_ne,wind_nw,wind_se,...,Day_22,Day_23,Day_24,Day_25,Day_26,Day_27,Day_28,Day_29,Day_30,Day_31
5488,7,-0.470588,-0.733333,1.034,3.028683,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
12685,9,-0.323529,-0.333333,1.03,1.026042,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
16700,14,0.205882,-0.1,1.02,1.519513,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28479,4,0.294118,-0.2,1.016,0.636577,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
12198,0,-0.147059,-0.6,1.024,1.418277,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [0]:
y_predict=CV_rfr.predict(test_X)

In [0]:
result = pd.DataFrame({"date":test_data["day"],"hour":test_data["hour"],"pm2.5":y_predict})

In [329]:
y_predict

array([52.935 , 82.785 , 26.375 , ..., 15.0925, 14.3925, 14.26  ])

In [292]:
y_predict1

array([ 55.59, 132.84,  41.11, ...,  28.97,  25.49,  27.4 ])

In [0]:
result.to_csv("resultrfr.csv",index=False)


In [0]:
y_predict1=y_predict

In [339]:
np.sum(y_predict1-y_predict)

7805.0964583333325

In [293]:

# Create GoogleDriveFile instance with title 'Hello.txt'.
file1 = drive.CreateFile({'title': 'resultrfr.csv'})
file1.SetContentString(result.to_csv(index=False))
file1.Upload() # Upload the file.
print('title: %s, id: %s' % (file1['title'], file1['id']))

title: resultrfr.csv, id: 1f-AuezezPczeIFVvwVs-6A3g1Qn3I_xc
