In [3]:
import pandas as pd

import seaborn as sb

from sklearn import linear_model, tree, ensemble, model_selection, metrics

from xgboost import XGBRFRegressor

In [4]:
df_submission = pd.read_csv('./data/submission_format.csv')
train_labels = pd.read_csv('./data/dengue_labels_train.csv')
train = pd.read_csv('./data/dengue_features_train.csv')
test = pd.read_csv('./data/dengue_features_test.csv')

In [5]:
def prepare_data():

    df_train = pd.merge(train, train_labels)
    df_train = df_train.drop('week_start_date', axis = 1)
    df_train = df_train.fillna(method = 'ffill', axis = 1)

    df_test = test
    df_test = df_test.drop('week_start_date', axis = 1)
    df_test = df_test.fillna(method = 'ffill', axis = 1)

    return df_train, df_test

In [6]:
df_train, df_test = prepare_data()

In [7]:
df_train.head()

Unnamed: 0,city,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,...,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
0,sj,1990,18,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,...,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,4
1,sj,1990,19,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,...,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,5
2,sj,1990,20,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,...,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,4
3,sj,1990,21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,...,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,3
4,sj,1990,22,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,...,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,6


In [8]:
df_train['city'].value_counts()

sj    936
iq    520
Name: city, dtype: int64

In [9]:
def split_and_train(df, generic_model):

    df_sj = df[df['city'] == 'sj'].drop('city', axis = 1)
    df_iq = df[df['city'] == 'iq'].drop('city', axis = 1)

    #training SJ
    X_train_sj, X_test_sj, y_train_sj, y_test_sj = model_selection.train_test_split(df_sj.drop('total_cases', axis = 1), df_sj['total_cases'])
    reg_sj = generic_model.fit(X_train_sj, y_train_sj)

    print('r2 for SJ : ' + str(metrics.r2_score(y_test_sj, reg_sj.predict(X_test_sj))))
    print('MSE for SJ : ' + str(metrics.mean_squared_error(y_test_sj, reg_sj.predict(X_test_sj))))

    #training IQ
    X_train_iq, X_test_iq, y_train_iq, y_test_iq = model_selection.train_test_split(df_iq.drop('total_cases', axis = 1), df_iq['total_cases'])
    reg_iq = generic_model.fit(X_train_iq, y_train_iq)

    print('r2 for IQ : ' + str(metrics.r2_score(y_test_iq, reg_sj.predict(X_test_iq))))
    print('MSE for IQ : ' + str(metrics.mean_squared_error(y_test_iq, reg_sj.predict(X_test_iq))))

    return reg_sj, reg_iq

In [10]:
reg_sj, reg_iq = split_and_train(df_train, linear_model.LinearRegression())

r2 for SJ : 0.579738832722028
MSE for SJ : 1127.963880388433
r2 for IQ : -0.13733779161226445
MSE for IQ : 122.76578908279392


In [11]:
reg_iq