In [47]:
import pandas as pd
import numpy as np

%matplotlib inline
from matplotlib import pyplot as plt
plt.figure(figsize=(10,10))

import xgboost as xgb
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split

<Figure size 720x720 with 0 Axes>

In [48]:
# def preprocess_data(data_path, labels_path=None):
    
#     x_features = ['reanalysis_specific_humidity_g_per_kg', 
#                  'reanalysis_dew_point_temp_k', 
#                  'station_avg_temp_c', 
#                  'station_min_temp_c']
#     y_features = ['total_cases']
    
    
#     x_df = pd.read_csv(data_path)
#     x_df.fillna(method='ffill', inplace=True)
#     sj_x = x_df[x_df['city']=='sj'][x_features]
#     iq_x = x_df[x_df['city']=='iq'][x_features]
    

#     # labels
#     sj_y = None
#     iq_y = None
#     if labels_path:
#         y_df = pd.read_csv(labels_path)
#         sj_y = y_df[y_df['city']=='sj'][y_features]
#         iq_y = y_df[y_df['city']=='iq'][y_features]

#     return sj_x, iq_x, sj_y, iq_y

In [49]:
# def preprocess_data_test(data_path, labels_path=None):
    
#     x_features = ['reanalysis_specific_humidity_g_per_kg', 
#                  'reanalysis_dew_point_temp_k', 
#                  'station_avg_temp_c', 
#                  'station_min_temp_c']
#     y_features = ['total_cases']
    
    
#     df = pd.read_csv(data_path)


#     # labels
#     if labels_path:
#         labels = pd.read_csv(labels_path)
#         df = df.join(labels)
        
#     #remove rows with any null values
#     df.dropna(inplace=True)
                
#     sj_x = df[df['city']=='sj'][x_features]
#     iq_x = df[df['city']=='iq'][x_features]

#     sj_y = df[df['city']=='sj'][y_features]
#     iq_y = df[df['city']=='iq'][y_features]


#     return sj_x, iq_x, sj_y, iq_y

In [50]:
def preprocess_data(data_path, labels_path=None):
    # load data and set index to city, year, weekofyear
    df = pd.read_csv(data_path, index_col=[0, 1, 2])
    
    # select features we want
    features = ['reanalysis_specific_humidity_g_per_kg', 
                 'reanalysis_dew_point_temp_k', 
                 'station_avg_temp_c', 
                 'station_min_temp_c']
    labels = ['total_cases']
    
    df = df[features]
    
    # fill missing values
    # df.fillna(method='ffill', inplace=True)

    # add labels to dataframe
    if labels_path:
        labels_df = pd.read_csv(labels_path, index_col=[0, 1, 2])
        df = df.join(labels_df)
    
    #remove rows with any null values
    df.dropna(inplace=True)
        
    # separate san juan and iquitos
    sj = df.loc['sj']
    iq = df.loc['iq']
    
    sj_x = sj[features]
    iq_x = iq[features]
    
    sj_y = None
    iq_y = None
    if labels_path:
        sj_y = sj[labels]
        iq_y = iq[labels]

    
    return sj_x, iq_x, sj_y, iq_y

In [51]:
def preprocess_data_test(data_path, labels_path=None):
    # load data and set index to city, year, weekofyear
    df = pd.read_csv(data_path, index_col=[0, 1, 2])
    
    # select features we want
    features = ['reanalysis_specific_humidity_g_per_kg', 
                 'reanalysis_dew_point_temp_k', 
                 'station_avg_temp_c', 
                 'station_min_temp_c']
    labels = ['total_cases']
    
    df = df[features]
    
    # fill missing values
    df.fillna(method='ffill', inplace=True)

    # add labels to dataframe
    if labels_path:
        labels_df = pd.read_csv(labels_path, index_col=[0, 1, 2])
        df = df.join(labels_df)
    
    # remove rows with any null values
    #df.dropna(inplace=True)
        
    # separate san juan and iquitos
    sj = df.loc['sj']
    iq = df.loc['iq']
    
    sj_x = sj[features]
    iq_x = iq[features]
    
    sj_y = None
    iq_y = None
    if labels_path:
        sj_y = sj[labels]
        iq_y = iq[labels]

    
    return sj_x, iq_x, sj_y, iq_y

In [52]:
sj_x, iq_x, sj_y, iq_y = preprocess_data('./data-processed/dengue_features_train.csv', labels_path='./data-processed/dengue_labels_train.csv')

In [53]:
print('sj_x - ', sj_x.shape)
print('sj_y - ', sj_y.shape)
print('iq_x - ', iq_x.shape)
print('iq_y - ', iq_y.shape)

sj_x -  (930, 4)
sj_y -  (930, 1)
iq_x -  (483, 4)
iq_y -  (483, 1)


In [54]:
learning_rates = [ 0.05, 0.01, 0.1, 0.2, 0.25]
colsample_bytree = [0.6, 0.7, 0.8, 0.9, 0.95]
n_estimators = [5, 10, 15, 30, 40, 45]
max_depth = [3, 4, 5, 9, 10, 11]

def hyper_param_optimize(X, Y):
    
    lr_best = learning_rates[0]
    cs_best = colsample_bytree[0]
    es_best = n_estimators[0]
    md_best = max_depth[0]
    
    best_mse = float("inf")
    counter = 0
    
    for lr in learning_rates:
        for cs in colsample_bytree:
            for es in n_estimators:
                for md in max_depth:
                    counter += 1
                    print('step', counter, 'lr', lr, 'cs', cs, 'es', es, 'md', md)
                    
                    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)
                    
                    model = xgb.XGBRegressor(
                        objective ='reg:linear',
                        colsample_bytree = cs, learning_rate = lr, max_depth = md, n_estimators = es
                    )
                    model.fit(x_train, y_train)
                    y_predict = model.predict(x_test)
                    
                    mse = mean_squared_error(y_test, y_predict)**0.5
                    
                    if (mse < best_mse):
                        best_mse = mse
                        lr_best = lr
                        cs_best = cs
                        es_best = es
                        md_best = md
    print('** best model - lr', lr_best, 'cs', cs_best, 'es', es_best, 'md', md_best)
    print('** best MSE - ', best_mse)
    
    return {'learning_rate': lr_best, 'colsample_bytree': cs_best, 'max_depth': md_best, 'n_estimators': es_best}

In [55]:
sj_param = hyper_param_optimize(sj_x, sj_y)

step 1 lr 0.05 cs 0.6 es 5 md 3
step 2 lr 0.05 cs 0.6 es 5 md 4
step 3 lr 0.05 cs 0.6 es 5 md 5
step 4 lr 0.05 cs 0.6 es 5 md 9
step 5 lr 0.05 cs 0.6 es 5 md 10
step 6 lr 0.05 cs 0.6 es 5 md 11
step 7 lr 0.05 cs 0.6 es 10 md 3
step 8 lr 0.05 cs 0.6 es 10 md 4
step 9 lr 0.05 cs 0.6 es 10 md 5
step 10 lr 0.05 cs 0.6 es 10 md 9
step 11 lr 0.05 cs 0.6 es 10 md 10
step 12 lr 0.05 cs 0.6 es 10 md 11
step 13 lr 0.05 cs 0.6 es 15 md 3
step 14 lr 0.05 cs 0.6 es 15 md 4
step 15 lr 0.05 cs 0.6 es 15 md 5
step 16 lr 0.05 cs 0.6 es 15 md 9
step 17 lr 0.05 cs 0.6 es 15 md 10
step 18 lr 0.05 cs 0.6 es 15 md 11
step 19 lr 0.05 cs 0.6 es 30 md 3
step 20 lr 0.05 cs 0.6 es 30 md 4
step 21 lr 0.05 cs 0.6 es 30 md 5
step 22 lr 0.05 cs 0.6 es 30 md 9
step 23 lr 0.05 cs 0.6 es 30 md 10
step 24 lr 0.05 cs 0.6 es 30 md 11
step 25 lr 0.05 cs 0.6 es 40 md 3
step 26 lr 0.05 cs 0.6 es 40 md 4
step 27 lr 0.05 cs 0.6 es 40 md 5
step 28 lr 0.05 cs 0.6 es 40 md 9
step 29 lr 0.05 cs 0.6 es 40 md 10
step 30 lr 0.05 cs 0

step 240 lr 0.01 cs 0.7 es 30 md 11
step 241 lr 0.01 cs 0.7 es 40 md 3
step 242 lr 0.01 cs 0.7 es 40 md 4
step 243 lr 0.01 cs 0.7 es 40 md 5
step 244 lr 0.01 cs 0.7 es 40 md 9
step 245 lr 0.01 cs 0.7 es 40 md 10
step 246 lr 0.01 cs 0.7 es 40 md 11
step 247 lr 0.01 cs 0.7 es 45 md 3
step 248 lr 0.01 cs 0.7 es 45 md 4
step 249 lr 0.01 cs 0.7 es 45 md 5
step 250 lr 0.01 cs 0.7 es 45 md 9
step 251 lr 0.01 cs 0.7 es 45 md 10
step 252 lr 0.01 cs 0.7 es 45 md 11
step 253 lr 0.01 cs 0.8 es 5 md 3
step 254 lr 0.01 cs 0.8 es 5 md 4
step 255 lr 0.01 cs 0.8 es 5 md 5
step 256 lr 0.01 cs 0.8 es 5 md 9
step 257 lr 0.01 cs 0.8 es 5 md 10
step 258 lr 0.01 cs 0.8 es 5 md 11
step 259 lr 0.01 cs 0.8 es 10 md 3
step 260 lr 0.01 cs 0.8 es 10 md 4
step 261 lr 0.01 cs 0.8 es 10 md 5
step 262 lr 0.01 cs 0.8 es 10 md 9
step 263 lr 0.01 cs 0.8 es 10 md 10
step 264 lr 0.01 cs 0.8 es 10 md 11
step 265 lr 0.01 cs 0.8 es 15 md 3
step 266 lr 0.01 cs 0.8 es 15 md 4
step 267 lr 0.01 cs 0.8 es 15 md 5
step 268 lr 0.01 

step 486 lr 0.1 cs 0.9 es 15 md 11
step 487 lr 0.1 cs 0.9 es 30 md 3
step 488 lr 0.1 cs 0.9 es 30 md 4
step 489 lr 0.1 cs 0.9 es 30 md 5
step 490 lr 0.1 cs 0.9 es 30 md 9
step 491 lr 0.1 cs 0.9 es 30 md 10
step 492 lr 0.1 cs 0.9 es 30 md 11
step 493 lr 0.1 cs 0.9 es 40 md 3
step 494 lr 0.1 cs 0.9 es 40 md 4
step 495 lr 0.1 cs 0.9 es 40 md 5
step 496 lr 0.1 cs 0.9 es 40 md 9
step 497 lr 0.1 cs 0.9 es 40 md 10
step 498 lr 0.1 cs 0.9 es 40 md 11
step 499 lr 0.1 cs 0.9 es 45 md 3
step 500 lr 0.1 cs 0.9 es 45 md 4
step 501 lr 0.1 cs 0.9 es 45 md 5
step 502 lr 0.1 cs 0.9 es 45 md 9
step 503 lr 0.1 cs 0.9 es 45 md 10
step 504 lr 0.1 cs 0.9 es 45 md 11
step 505 lr 0.1 cs 0.95 es 5 md 3
step 506 lr 0.1 cs 0.95 es 5 md 4
step 507 lr 0.1 cs 0.95 es 5 md 5
step 508 lr 0.1 cs 0.95 es 5 md 9
step 509 lr 0.1 cs 0.95 es 5 md 10
step 510 lr 0.1 cs 0.95 es 5 md 11
step 511 lr 0.1 cs 0.95 es 10 md 3
step 512 lr 0.1 cs 0.95 es 10 md 4
step 513 lr 0.1 cs 0.95 es 10 md 5
step 514 lr 0.1 cs 0.95 es 10 md 9
s

step 733 lr 0.25 cs 0.6 es 15 md 3
step 734 lr 0.25 cs 0.6 es 15 md 4
step 735 lr 0.25 cs 0.6 es 15 md 5
step 736 lr 0.25 cs 0.6 es 15 md 9
step 737 lr 0.25 cs 0.6 es 15 md 10
step 738 lr 0.25 cs 0.6 es 15 md 11
step 739 lr 0.25 cs 0.6 es 30 md 3
step 740 lr 0.25 cs 0.6 es 30 md 4
step 741 lr 0.25 cs 0.6 es 30 md 5
step 742 lr 0.25 cs 0.6 es 30 md 9
step 743 lr 0.25 cs 0.6 es 30 md 10
step 744 lr 0.25 cs 0.6 es 30 md 11
step 745 lr 0.25 cs 0.6 es 40 md 3
step 746 lr 0.25 cs 0.6 es 40 md 4
step 747 lr 0.25 cs 0.6 es 40 md 5
step 748 lr 0.25 cs 0.6 es 40 md 9
step 749 lr 0.25 cs 0.6 es 40 md 10
step 750 lr 0.25 cs 0.6 es 40 md 11
step 751 lr 0.25 cs 0.6 es 45 md 3
step 752 lr 0.25 cs 0.6 es 45 md 4
step 753 lr 0.25 cs 0.6 es 45 md 5
step 754 lr 0.25 cs 0.6 es 45 md 9
step 755 lr 0.25 cs 0.6 es 45 md 10
step 756 lr 0.25 cs 0.6 es 45 md 11
step 757 lr 0.25 cs 0.7 es 5 md 3
step 758 lr 0.25 cs 0.7 es 5 md 4
step 759 lr 0.25 cs 0.7 es 5 md 5
step 760 lr 0.25 cs 0.7 es 5 md 9
step 761 lr 0.25

In [56]:
iq_param = hyper_param_optimize(iq_x, iq_y)

step 1 lr 0.05 cs 0.6 es 5 md 3
step 2 lr 0.05 cs 0.6 es 5 md 4
step 3 lr 0.05 cs 0.6 es 5 md 5
step 4 lr 0.05 cs 0.6 es 5 md 9
step 5 lr 0.05 cs 0.6 es 5 md 10
step 6 lr 0.05 cs 0.6 es 5 md 11
step 7 lr 0.05 cs 0.6 es 10 md 3
step 8 lr 0.05 cs 0.6 es 10 md 4
step 9 lr 0.05 cs 0.6 es 10 md 5
step 10 lr 0.05 cs 0.6 es 10 md 9
step 11 lr 0.05 cs 0.6 es 10 md 10
step 12 lr 0.05 cs 0.6 es 10 md 11
step 13 lr 0.05 cs 0.6 es 15 md 3
step 14 lr 0.05 cs 0.6 es 15 md 4
step 15 lr 0.05 cs 0.6 es 15 md 5
step 16 lr 0.05 cs 0.6 es 15 md 9
step 17 lr 0.05 cs 0.6 es 15 md 10
step 18 lr 0.05 cs 0.6 es 15 md 11
step 19 lr 0.05 cs 0.6 es 30 md 3
step 20 lr 0.05 cs 0.6 es 30 md 4
step 21 lr 0.05 cs 0.6 es 30 md 5
step 22 lr 0.05 cs 0.6 es 30 md 9
step 23 lr 0.05 cs 0.6 es 30 md 10
step 24 lr 0.05 cs 0.6 es 30 md 11
step 25 lr 0.05 cs 0.6 es 40 md 3
step 26 lr 0.05 cs 0.6 es 40 md 4
step 27 lr 0.05 cs 0.6 es 40 md 5
step 28 lr 0.05 cs 0.6 es 40 md 9
step 29 lr 0.05 cs 0.6 es 40 md 10
step 30 lr 0.05 cs 0

step 243 lr 0.01 cs 0.7 es 40 md 5
step 244 lr 0.01 cs 0.7 es 40 md 9
step 245 lr 0.01 cs 0.7 es 40 md 10
step 246 lr 0.01 cs 0.7 es 40 md 11
step 247 lr 0.01 cs 0.7 es 45 md 3
step 248 lr 0.01 cs 0.7 es 45 md 4
step 249 lr 0.01 cs 0.7 es 45 md 5
step 250 lr 0.01 cs 0.7 es 45 md 9
step 251 lr 0.01 cs 0.7 es 45 md 10
step 252 lr 0.01 cs 0.7 es 45 md 11
step 253 lr 0.01 cs 0.8 es 5 md 3
step 254 lr 0.01 cs 0.8 es 5 md 4
step 255 lr 0.01 cs 0.8 es 5 md 5
step 256 lr 0.01 cs 0.8 es 5 md 9
step 257 lr 0.01 cs 0.8 es 5 md 10
step 258 lr 0.01 cs 0.8 es 5 md 11
step 259 lr 0.01 cs 0.8 es 10 md 3
step 260 lr 0.01 cs 0.8 es 10 md 4
step 261 lr 0.01 cs 0.8 es 10 md 5
step 262 lr 0.01 cs 0.8 es 10 md 9
step 263 lr 0.01 cs 0.8 es 10 md 10
step 264 lr 0.01 cs 0.8 es 10 md 11
step 265 lr 0.01 cs 0.8 es 15 md 3
step 266 lr 0.01 cs 0.8 es 15 md 4
step 267 lr 0.01 cs 0.8 es 15 md 5
step 268 lr 0.01 cs 0.8 es 15 md 9
step 269 lr 0.01 cs 0.8 es 15 md 10
step 270 lr 0.01 cs 0.8 es 15 md 11
step 271 lr 0.01

step 487 lr 0.1 cs 0.9 es 30 md 3
step 488 lr 0.1 cs 0.9 es 30 md 4
step 489 lr 0.1 cs 0.9 es 30 md 5
step 490 lr 0.1 cs 0.9 es 30 md 9
step 491 lr 0.1 cs 0.9 es 30 md 10
step 492 lr 0.1 cs 0.9 es 30 md 11
step 493 lr 0.1 cs 0.9 es 40 md 3
step 494 lr 0.1 cs 0.9 es 40 md 4
step 495 lr 0.1 cs 0.9 es 40 md 5
step 496 lr 0.1 cs 0.9 es 40 md 9
step 497 lr 0.1 cs 0.9 es 40 md 10
step 498 lr 0.1 cs 0.9 es 40 md 11
step 499 lr 0.1 cs 0.9 es 45 md 3
step 500 lr 0.1 cs 0.9 es 45 md 4
step 501 lr 0.1 cs 0.9 es 45 md 5
step 502 lr 0.1 cs 0.9 es 45 md 9
step 503 lr 0.1 cs 0.9 es 45 md 10
step 504 lr 0.1 cs 0.9 es 45 md 11
step 505 lr 0.1 cs 0.95 es 5 md 3
step 506 lr 0.1 cs 0.95 es 5 md 4
step 507 lr 0.1 cs 0.95 es 5 md 5
step 508 lr 0.1 cs 0.95 es 5 md 9
step 509 lr 0.1 cs 0.95 es 5 md 10
step 510 lr 0.1 cs 0.95 es 5 md 11
step 511 lr 0.1 cs 0.95 es 10 md 3
step 512 lr 0.1 cs 0.95 es 10 md 4
step 513 lr 0.1 cs 0.95 es 10 md 5
step 514 lr 0.1 cs 0.95 es 10 md 9
step 515 lr 0.1 cs 0.95 es 10 md 10


step 733 lr 0.25 cs 0.6 es 15 md 3
step 734 lr 0.25 cs 0.6 es 15 md 4
step 735 lr 0.25 cs 0.6 es 15 md 5
step 736 lr 0.25 cs 0.6 es 15 md 9
step 737 lr 0.25 cs 0.6 es 15 md 10
step 738 lr 0.25 cs 0.6 es 15 md 11
step 739 lr 0.25 cs 0.6 es 30 md 3
step 740 lr 0.25 cs 0.6 es 30 md 4
step 741 lr 0.25 cs 0.6 es 30 md 5
step 742 lr 0.25 cs 0.6 es 30 md 9
step 743 lr 0.25 cs 0.6 es 30 md 10
step 744 lr 0.25 cs 0.6 es 30 md 11
step 745 lr 0.25 cs 0.6 es 40 md 3
step 746 lr 0.25 cs 0.6 es 40 md 4
step 747 lr 0.25 cs 0.6 es 40 md 5
step 748 lr 0.25 cs 0.6 es 40 md 9
step 749 lr 0.25 cs 0.6 es 40 md 10
step 750 lr 0.25 cs 0.6 es 40 md 11
step 751 lr 0.25 cs 0.6 es 45 md 3
step 752 lr 0.25 cs 0.6 es 45 md 4
step 753 lr 0.25 cs 0.6 es 45 md 5
step 754 lr 0.25 cs 0.6 es 45 md 9
step 755 lr 0.25 cs 0.6 es 45 md 10
step 756 lr 0.25 cs 0.6 es 45 md 11
step 757 lr 0.25 cs 0.7 es 5 md 3
step 758 lr 0.25 cs 0.7 es 5 md 4
step 759 lr 0.25 cs 0.7 es 5 md 5
step 760 lr 0.25 cs 0.7 es 5 md 9
step 761 lr 0.25

In [57]:
sj_test, iq_test, _, _ = preprocess_data_test('./data-processed/dengue_features_test.csv')

UnboundLocalError: local variable 'sj_y' referenced before assignment

In [None]:
sj_model = xgb.XGBRegressor(
    objective ='reg:linear',
    colsample_bytree = sj_param['colsample_bytree'], 
    learning_rate = sj_param['learning_rate'], 
    max_depth = sj_param['max_depth'], 
    n_estimators = sj_param['n_estimators']
)
iq_model = xgb.XGBRegressor(
    objective ='reg:linear',
    colsample_bytree = iq_param['colsample_bytree'], 
    learning_rate = iq_param['learning_rate'], 
    max_depth = iq_param['max_depth'], 
    n_estimators = iq_param['n_estimators']
)
sj_model.fit(sj_x, sj_y)
iq_model.fit(iq_x, iq_y)

sj_predict = sj_model.predict(sj_test)
iq_predict = iq_model.predict(iq_test)

In [13]:
test_df = pd.read_csv('./data-processed/dengue_features_test.csv')[['city', 'year', 'weekofyear']]

total_cases = pd.concat([pd.Series(sj_predict), pd.Series(iq_predict)])

total_cases.index = test_df.index
test_df['total_cases'] = total_cases.astype(int)

test_df.to_csv('./data-processed/benchmark.csv', index=False)

In [14]:
test_df['total_cases'].unique()

array([ 23,  19,  16,  20,  11,  61,  25,  41,  32,  34,  39,  31,  76,
        50,  30,  38,  37,  51,  59,  35,  36,  27,  40,  15,  21,  22,
        13,  12,  14,  77,  48,  33,  29,  44,  43,  57,  45,  24,  17,
        66,  28,  54,  42,  46,  72, 144,  65, 146, 138,  49,  56,  83,
        63,  68,  55,  71,  18,  58,   6,   8,   4,   1,   3,   2,   5,
        10,   9,   7], dtype=int64)