In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from time import time
import matplotlib.pyplot as plt

### Time feature data

In [2]:
model = GradientBoostingRegressor()

In [3]:
def return_data(fold,month,with_scaling):
    train_input = pd.read_csv('../data/beijing-18/time_feature'+'/fold'+str(fold)+'/train_data_'+month+'.csv.gz')
    test_input = pd.read_csv('../data/beijing-18/time_feature'+'/fold'+str(fold)+'/test_data_'+month+'.csv.gz')
    test_output = np.array(test_input['PM25_Concentration'])
    train_output = np.array(train_input['PM25_Concentration'])
    train_input= train_input.drop(['station_id','PM25_Concentration','time','filled'],axis=1)
    try:
        test_input= test_input.drop(['PM25_Concentration','station_id','time','filled'],axis=1)
    except:
        test_input= test_input.drop(['station_id','time','filled'],axis=1)
#     test_output= test_output.drop(['time'],axis=1)
    if with_scaling:
        scaler = MinMaxScaler().fit(train_input)
        train_input = scaler.transform(train_input)
        test_input = scaler.transform(test_input)
    return train_input,train_output,test_input,test_output

def run_model(model,train_input,train_output,test_input,test_output,ret_output):
    model.fit(np.array(train_input), train_output)
    test_pred = model.predict(np.array(test_input))
    err = mean_squared_error(test_pred, test_output, squared=False)
    mae = mean_absolute_error(test_pred, test_output)
    r2 = r2_score(test_pred, test_output)
    if ret_output:
        return (err, mae, r2), test_pred
    else:
        return (err, mae, r2)

### With scaling

In [4]:
mean_rmse, mean_mae, mean_r2 = 0, 0, 0

In [5]:
for fold in [0,1,2]:
    train_input,train_output,test_input,test_output = return_data(fold=fold,month='mar',with_scaling=True)
    print("Fold: ",fold)
    print("Data received")
    init = time()
    (rmse, mae, r2), test_pred = run_model(model,train_input,train_output,test_input,test_output,True)
    
    print("RMSE: ",rmse)
    print("MAE: ",mae)
    print("R2 score: ",r2)
    mean_rmse += rmse
    mean_mae += mae
    mean_r2 += r2
    print("Time taken: ",time()-init)

Fold:  0
Data received
RMSE:  33.71184258104653
MAE:  24.001388359784833
R2 score:  0.7417123694440926
Time taken:  1.4334113597869873
Fold:  1
Data received
RMSE:  34.137768565871006
MAE:  24.57827167837481
R2 score:  0.7124017942203692
Time taken:  1.5057954788208008
Fold:  2
Data received
RMSE:  38.73000201876054
MAE:  26.020552767505436
R2 score:  0.6326571556673242
Time taken:  1.442683458328247


In [6]:
np.array([mean_rmse, mean_mae, mean_r2]) / 3

array([35.52653772, 24.8667376 ,  0.69559044])