In [1]:
import numpy as np
import pandas as pd
import seaborn as sns # for visualiation
import statsmodels.formula.api as smf # linear modeling
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

# Load data
data_features = pd.read_csv('./data/dengue_features_train.csv')
data_labels = pd.read_csv('./data/dengue_labels_train.csv')
data_test = pd.read_csv('./data/dengue_features_test.csv')

In [2]:
df = pd.DataFrame(columns=['Weeks', 'Error'])

In [3]:
for n in range(50, 107):
    
    ###################################################
    # Data Prep
    ###################################################
    
    #Features
    data_features_iq = data_features[data_features.city=='sj'].reset_index(drop = True)
    data_labels_iq = data_labels[data_labels.city=='sj'].reset_index(drop = True)
    data_iq = pd.merge(data_features_iq, data_labels_iq)
    data_iq = data_iq.drop(columns=['reanalysis_sat_precip_amt_mm', 'reanalysis_specific_humidity_g_per_kg'])

    data_iq = data_iq.fillna(method = 'ffill')
    data_iq['month'] = pd.to_datetime(data_iq['week_start_date']).dt.month
    data_iq['odd_year'] = data_iq.year.astype('int64') % 2 == 1
    data_iq['ndvi_mean'] = (data_iq['ndvi_ne'] + data_iq['ndvi_nw'] + data_iq['ndvi_se'] + data_iq['ndvi_sw']) / 4.0

    # data_iq = data_iq[data_iq['year'].isin(['1990','1991','1992','1993','1995','1996','1997','1999','2000','2001','2002','2003','2004','2005','2006','2007','2008'])]

    data_iq['ndvi_mean_rolling_avg'] = data_iq['ndvi_mean'].rolling(window = n).mean()
    data_iq['ndvi_ne_rolling_avg'] = data_iq['ndvi_ne'].rolling(window = n).mean()
    data_iq['ndvi_nw_rolling_avg'] = data_iq['ndvi_nw'].rolling(window = n).mean()
    data_iq['ndvi_se_rolling_avg'] = data_iq['ndvi_se'].rolling(window = n).mean()
    data_iq['ndvi_sw_rolling_avg'] = data_iq['ndvi_sw'].rolling(window = n).mean()
    data_iq['precipitation_amt_mm_rolling_avg'] = data_iq['precipitation_amt_mm'].rolling(window = n).mean()
    data_iq['reanalysis_air_temp_k_rolling_avg'] = data_iq['reanalysis_air_temp_k'].rolling(window = n).mean()
    data_iq['reanalysis_avg_temp_k_rolling_avg'] = data_iq['reanalysis_avg_temp_k'].rolling(window = n).mean()
    data_iq['reanalysis_dew_point_temp_k_rolling_avg'] = data_iq['reanalysis_dew_point_temp_k'].rolling(window = n).mean()
    data_iq['reanalysis_max_air_temp_k_rolling_avg'] = data_iq['reanalysis_max_air_temp_k'].rolling(window = n).mean()
    data_iq['reanalysis_min_air_temp_k_rolling_avg'] = data_iq['reanalysis_min_air_temp_k'].rolling(window = n).mean()
    data_iq['reanalysis_precip_amt_kg_per_m2_rolling_avg'] = data_iq['reanalysis_precip_amt_kg_per_m2'].rolling(window = n).mean()
    data_iq['reanalysis_relative_humidity_percent_rolling_avg'] = data_iq['reanalysis_relative_humidity_percent'].rolling(window = n).mean()
    data_iq['reanalysis_tdtr_k_rolling_avg'] = data_iq['reanalysis_tdtr_k'].rolling(window = n).mean()
    data_iq['station_avg_temp_c_rolling_avg'] = data_iq['station_avg_temp_c'].rolling(window = n).mean()
    data_iq['station_diur_temp_rng_c_rolling_avg'] = data_iq['station_diur_temp_rng_c'].rolling(window = n).mean()
    data_iq['station_max_temp_c_rolling_avg'] = data_iq['station_max_temp_c'].rolling(window = n).mean()
    data_iq['station_min_temp_c_rolling_avg'] = data_iq['station_min_temp_c'].rolling(window = n).mean()
    data_iq['station_precip_mm_rolling_avg'] = data_iq['station_precip_mm'].rolling(window = n).mean()


    data_iq['week_start_date'] = pd.to_datetime(data_iq['week_start_date']).dt.month


    data_iq.drop(data_iq.head(n).index, inplace=True)

    # data_labels_iq_year = data_labels_iq.groupby(['year'])['total_cases'].stdev().to_frame(name = 'annual_cases').reset_index()
    # data_labels_iq = pd.merge(data_labels_iq, data_labels_iq_year)
    # data_labels_iq['pct_cases_year'] = data_labels_iq['total_cases'] / data_labels_iq['annual_cases']


    # Features - Normalize
    data_iq_n = MinMaxScaler().fit_transform(data_iq[data_iq.columns[4:43]])
    data_iq_n = pd.DataFrame(data_iq_n, columns = data_iq.columns[4:43], index=data_iq.index)
    data_iq_n = data_iq_n.drop(columns=['total_cases'])
    data_iq_n['month'] = data_iq['month']
    data_iq_n['odd_year'] = data_iq['odd_year']


    # Features Test
    data_iq_lastnweek = data_features[data_features.city=='sj'].reset_index(drop = True).tail(n)
    data_test_iq = data_test[data_test.city=='sj'].reset_index(drop = True)
    frames = [data_iq_lastnweek, data_test_iq]
    data_test_iq = pd.concat(frames).reset_index(drop = True)

    data_test_iq = data_test_iq.drop(columns=['reanalysis_sat_precip_amt_mm', 'reanalysis_specific_humidity_g_per_kg'])
    data_test_iq = data_test_iq.fillna(method = 'ffill')
    data_test_iq['month'] = pd.to_datetime(data_test_iq['week_start_date']).dt.month
    data_test_iq['odd_year'] = data_test_iq.year.astype('int64') % 2 == 1
    data_test_iq['ndvi_mean'] = (data_test_iq['ndvi_ne'] + data_test_iq['ndvi_nw'] + data_test_iq['ndvi_se'] + data_test_iq['ndvi_sw']) / 4.0

    data_test_iq['ndvi_mean_rolling_avg'] = data_test_iq['ndvi_mean'].rolling(window = n).mean()
    data_test_iq['ndvi_ne_rolling_avg'] = data_test_iq['ndvi_ne'].rolling(window = n).mean()
    data_test_iq['ndvi_nw_rolling_avg'] = data_test_iq['ndvi_nw'].rolling(window = n).mean()
    data_test_iq['ndvi_se_rolling_avg'] = data_test_iq['ndvi_se'].rolling(window = n).mean()
    data_test_iq['ndvi_sw_rolling_avg'] = data_test_iq['ndvi_sw'].rolling(window = n).mean()
    data_test_iq['precipitation_amt_mm_rolling_avg'] = data_test_iq['precipitation_amt_mm'].rolling(window = n).mean()
    data_test_iq['reanalysis_air_temp_k_rolling_avg'] = data_test_iq['reanalysis_air_temp_k'].rolling(window = n).mean()
    data_test_iq['reanalysis_avg_temp_k_rolling_avg'] = data_test_iq['reanalysis_avg_temp_k'].rolling(window = n).mean()
    data_test_iq['reanalysis_dew_point_temp_k_rolling_avg'] = data_test_iq['reanalysis_dew_point_temp_k'].rolling(window = n).mean()
    data_test_iq['reanalysis_max_air_temp_k_rolling_avg'] = data_test_iq['reanalysis_max_air_temp_k'].rolling(window = n).mean()
    data_test_iq['reanalysis_min_air_temp_k_rolling_avg'] = data_test_iq['reanalysis_min_air_temp_k'].rolling(window = n).mean()
    data_test_iq['reanalysis_precip_amt_kg_per_m2_rolling_avg'] = data_test_iq['reanalysis_precip_amt_kg_per_m2'].rolling(window = n).mean()
    data_test_iq['reanalysis_relative_humidity_percent_rolling_avg'] = data_test_iq['reanalysis_relative_humidity_percent'].rolling(window = n).mean()
    data_test_iq['reanalysis_tdtr_k_rolling_avg'] = data_test_iq['reanalysis_tdtr_k'].rolling(window = n).mean()
    data_test_iq['station_avg_temp_c_rolling_avg'] = data_test_iq['station_avg_temp_c'].rolling(window = n).mean()
    data_test_iq['station_diur_temp_rng_c_rolling_avg'] = data_test_iq['station_diur_temp_rng_c'].rolling(window = n).mean()
    data_test_iq['station_max_temp_c_rolling_avg'] = data_test_iq['station_max_temp_c'].rolling(window = n).mean()
    data_test_iq['station_min_temp_c_rolling_avg'] = data_test_iq['station_min_temp_c'].rolling(window = n).mean()
    data_test_iq['station_precip_mm_rolling_avg'] = data_test_iq['station_precip_mm'].rolling(window = n).mean()

    data_test_iq.drop(data_test_iq.head(n).index, inplace=True)

    # Features Test - Normalized
    data_test_iq_n = MinMaxScaler().fit_transform(data_test_iq[data_test_iq.columns[4:43]])
    data_test_iq_n = pd.DataFrame(data_test_iq_n, columns = data_test_iq.columns[4:43], index=data_test_iq.index)
    data_test_iq_n['month'] = data_test_iq['month']
    data_test_iq_n['odd_year'] = data_test_iq['odd_year']
    
    
    ###############################################################
    # Training Data
    ###############################################################

    train_features_iq, test_features_iq, train_outcome_iq, test_outcome_iq = train_test_split(
        data_iq_n[['month', 'odd_year', 'ndvi_sw_rolling_avg',
           'precipitation_amt_mm_rolling_avg',
           'reanalysis_dew_point_temp_k_rolling_avg',
           'reanalysis_precip_amt_kg_per_m2_rolling_avg',
           'reanalysis_relative_humidity_percent_rolling_avg',
           'station_diur_temp_rng_c_rolling_avg',
           'station_max_temp_c_rolling_avg']],
        data_iq['total_cases'],
        test_size = 0.3
    )
    
    ###############################################################
    # Grid Search
    ###############################################################
    
    params = {'n_neighbors':range(1, 50), 'weights':['uniform', 'distance']}
    folds = KFold(n_splits = 10, shuffle=True)
    grid_search = GridSearchCV(KNeighborsRegressor(), param_grid=params, cv=folds, scoring='neg_mean_absolute_error')
    grid_search.fit(train_features_iq, train_outcome_iq)
    
    grid_search.score(test_features_iq, test_outcome_iq)
    grid_search.cv_results_['params'][grid_search.best_index_]
    
    knr_reg = KNeighborsRegressor(n_neighbors = 5, weights = 'distance')
    knr_preds_iq = knr_reg.fit(train_features_iq, train_outcome_iq).predict(test_features_iq)
    
    print("done")
    #### ###################################
    #Adding to the DataFrame
    
    error = mean_absolute_error(test_outcome_iq, knr_preds_iq)
    print(n)
    print(error)

done
50
8.69891879839679
done
51
8.390135280315942
done
52
8.459309671091683
done
53
8.53747294307873
done
54
8.270487230797796
done
55
9.888118743512683
done
56
9.394894764547217
done
57
8.472587219221111
done
58
8.44378285541248
done
59
9.274486468596193
done
60
8.79386520049468
done
61
9.076113346557108
done
62
8.169555499685845
done
63
9.95443378545255
done
64
8.696496581574232
done
65
8.151893151203295
done
66
9.353318355670188
done
67
10.669718522232113
done
68
9.91190450069965
done
69
8.841986275641945
done
70
9.02533105811282
done
71
11.971891534720084
done
72
8.135757829321673
done
73
8.605491470420336
done
74
8.176972226111873
done
75
10.21217123238709
done
76
8.261927394911398
done
77
8.739052074486047
done
78
9.41188427912122
done
79
11.485764020967617
done
80
7.556257309080113
done
81
9.822179464023295
done
82
9.221126574735807
done
83
7.139809092139034
done
84
9.275780497603098
done
85
8.51980187003053
done
86
7.721438410138937
done
87
9.437124469679052
done
88
7.58793745