In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [None]:
# Read the training and testing feature tables from the parent directory.
training_records, testing_records = (
    pd.read_csv(csv_path)
    for csv_path in ('../dengue_features_train.csv', '../dengue_features_test.csv')
)

In [None]:
# Show the column names of the training feature table.
training_records.columns

In [None]:
# Summary statistics (DataFrame.describe) of the training features.
training_records.describe()

In [None]:
# Summary statistics (DataFrame.describe) of the testing features,
# for comparison with the training summary.
testing_records.describe()

In [None]:
# Per-city mean of every numeric training feature.
# numeric_only=True skips non-numeric columns; pandas >= 2.0 no longer drops
# them silently and would raise when asked to average text.
training_records.groupby('city').mean(numeric_only=True)

In [None]:
# Per-city mean of every numeric testing feature.
# numeric_only=True skips non-numeric columns; pandas >= 2.0 no longer drops
# them silently and would raise when asked to average text.
testing_records.groupby('city').mean(numeric_only=True)

The data for the two cities look very different from each other, but within each city the train and test datasets show the same trends.

In [None]:
# Split both feature tables by city code ('sj' / 'iq') and discard the now
# constant 'city' column. The column is named via `columns=` because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
training_records_sj = training_records[training_records['city'] == 'sj'].drop(columns='city')
training_records_iq = training_records[training_records['city'] == 'iq'].drop(columns='city')
testing_records_sj = testing_records[testing_records['city'] == 'sj'].drop(columns='city')
testing_records_iq = testing_records[testing_records['city'] == 'iq'].drop(columns='city')

In [None]:
# Last rows of the 'iq' training features.
training_records_iq.tail()

In [None]:
# First rows of the 'iq' testing features, to compare with the training tail.
testing_records_iq.head()

In [None]:
# Last rows of the 'sj' training features.
training_records_sj.tail()

In [None]:
# First rows of the 'sj' testing features, to compare with the training tail.
testing_records_sj.head()

It looks like the train and test datasets can be combined for purposes such as smoothing and outlier detection.

In [None]:
# Row counts for 'iq' as a (training, testing) pair.
(len(training_records_iq), len(testing_records_iq))

In [None]:
# Row counts for 'sj' as a (training, testing) pair.
(len(training_records_sj), len(testing_records_sj))

In [None]:
# Stack each city's training rows on top of its testing rows and renumber
# the index from zero so the combined table has one continuous row index.
records_iq = pd.concat(
    (training_records_iq, testing_records_iq),
    axis=0,
    ignore_index=True,
)
records_sj = pd.concat(
    (training_records_sj, testing_records_sj),
    axis=0,
    ignore_index=True,
)

In [None]:
# Kernel density estimate of every remaining plottable column for 'iq',
# one subplot per column on a 10 x 2 grid with independent x axes.
kde_inputs_iq = records_iq.drop(columns=['year', 'weekofyear'])
kde_inputs_iq.plot.kde(subplots=True, layout=(10, 2), figsize=(14, 30), sharex=False)

In [None]:
# Kernel density estimate of every remaining plottable column for 'sj',
# one subplot per column on a 10 x 2 grid with independent x axes.
kde_inputs_sj = records_sj.drop(columns=['year', 'weekofyear'])
kde_inputs_sj.plot.kde(subplots=True, layout=(10, 2), figsize=(14, 30), sharex=False)

In [None]:
# Pairwise correlation matrices of the numeric columns, one per city.
# Numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# non-numeric columns inside corr() and raises on text data instead, while
# select_dtypes works the same way on older pandas releases too.
corr_iq = records_iq.select_dtypes(include='number').corr()
corr_sj = records_sj.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the 'iq' correlation matrix on an explicitly created 8 x 6 axes.
fig_iq, ax_iq = plt.subplots(figsize=(8, 6))
corr_heatmap_iq = sns.heatmap(corr_iq, ax=ax_iq)
ax_iq.set_title('Iquitos correlations')

In [None]:
# Heatmap of the 'sj' correlation matrix on an explicitly created 8 x 6 axes.
fig_sj, ax_sj = plt.subplots(figsize=(8, 6))
corr_heatmap_sj = sns.heatmap(corr_sj, ax=ax_sj)
ax_sj.set_title('San Juan correlations')

In [None]:
# Keep only the strict upper triangle of the correlation matrix (everything on
# or below the diagonal becomes NaN), so each pair of columns is looked at once.
# The mask uses the builtin bool: the np.bool alias was removed in NumPy 1.24.
corr_upper_iq = corr_iq.where(np.triu(np.ones(corr_iq.shape, dtype=bool), k=1))

# Columns whose absolute correlation with an earlier column exceeds 0.95.
# NaN entries compare as False, so the masked cells are ignored.
to_drop_iq = [column for column in corr_upper_iq.columns
              if (corr_upper_iq[column].abs() > 0.95).any()]
to_drop_iq

In [None]:
# Keep only the strict upper triangle of the 'sj' correlation matrix.
# The mask uses the builtin bool: the np.bool alias was removed in NumPy 1.24.
corr_upper_sj = corr_sj.where(np.triu(np.ones(corr_sj.shape, dtype=bool), k=1))

# Columns whose absolute correlation with an earlier column exceeds 0.95.
# Both signs are tested against the 'sj' matrix; the negative side previously
# read the 'iq' matrix by mistake.
to_drop_sj = [column for column in corr_upper_sj.columns
              if (corr_upper_sj[column].abs() > 0.95).any()]
to_drop_sj

In [None]:
# Remove, in place, the highly correlated columns selected for each city.
# NOTE: not idempotent - running this cell twice raises because the columns
# are already gone.
for city_records, redundant_columns in ((records_iq, to_drop_iq), (records_sj, to_drop_sj)):
    city_records.drop(columns=redundant_columns, inplace=True)

We don't want to use the same feature twice, so we drop every column whose correlation with another column is above 0.95 or below -0.95.

In [None]:
# Remove the 'year' column from both city tables, in place.
for city_records in (records_iq, records_sj):
    city_records.drop(columns=['year'], inplace=True)

Since we have `week_start_date`, `year` and `weekofyear` are technically redundant. However, `weekofyear` can be thought of as capturing all unobserved seasonal variation, so we keep it. `year`, on the other hand, is too large a unit to be a trend predictor, so we drop it.

In [None]:
# Columns left in the 'iq' table after the drops above.
records_iq.columns

In [None]:
# Shift the four reanalysis temperature columns by -273.15 (Kelvin -> Celsius)
# in both city tables. The column names keep their `_k` suffix.
# NOTE: not idempotent - every extra run subtracts 273.15 again.
kelvin_columns = [
    'reanalysis_air_temp_k',
    'reanalysis_dew_point_temp_k',
    'reanalysis_max_air_temp_k',
    'reanalysis_min_air_temp_k',
]
for city_records in (records_iq, records_sj):
    city_records[kelvin_columns] = city_records[kelvin_columns] - 273.15

Convert the Kelvin temperature columns to Celsius (the column names keep their `_k` suffix).

In [None]:
# Line plot of the four NDVI columns for 'iq' over the row index.
# DataFrame.plot creates its own figure when no `ax` is passed, so the former
# plt.figure() call only produced an extra empty figure and has been removed.
# The trailing semicolon hides the Axes repr in the cell output.
records_iq[['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw']].plot(
    figsize=(100, 5), title='Iquitos NDVI');

In [None]:
# Collapse the four NDVI columns of 'iq' into their row-wise mean ('ndvi'),
# then remove the originals in place.
ndvi_columns_iq = ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw']
records_iq['ndvi'] = records_iq[ndvi_columns_iq].mean(axis='columns')
records_iq.drop(columns=ndvi_columns_iq, inplace=True)

In [None]:
# Line plot of the four NDVI columns for 'sj' over the row index.
# DataFrame.plot creates its own figure when no `ax` is passed, so the former
# plt.figure() call only produced an extra empty figure and has been removed.
# The trailing semicolon hides the Axes repr in the cell output.
records_sj[['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw']].plot(
    figsize=(100, 5), title='San Juan NDVI');

In [None]:
# For 'sj', average the NDVI columns pairwise: ne/nw -> 'ndvi_n' and
# se/sw -> 'ndvi_s'. The four originals are then removed in place.
ndvi_north_sj = ['ndvi_ne', 'ndvi_nw']
ndvi_south_sj = ['ndvi_se', 'ndvi_sw']
records_sj['ndvi_n'] = records_sj[ndvi_north_sj].mean(axis='columns')
records_sj['ndvi_s'] = records_sj[ndvi_south_sj].mean(axis='columns')
records_sj.drop(columns=['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw'], inplace=True)

In [None]:
# Line plot of the four reanalysis temperature columns for 'iq'.
# DataFrame.plot creates its own figure when no `ax` is passed, so the former
# plt.figure() call only produced an extra empty figure and has been removed.
# The trailing semicolon hides the Axes repr in the cell output.
records_iq[['reanalysis_air_temp_k', 'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
            'reanalysis_min_air_temp_k']].plot(
    figsize=(100, 5), title='Iquitos reanalysis temperatures');

These columns don't look further reducible.

In [None]:
# Line plot of the four reanalysis temperature columns for 'sj'.
# DataFrame.plot creates its own figure when no `ax` is passed, so the former
# plt.figure() call only produced an extra empty figure and has been removed.
# The trailing semicolon hides the Axes repr in the cell output.
records_sj[['reanalysis_air_temp_k', 'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
            'reanalysis_min_air_temp_k']].plot(
    figsize=(100, 5), title='San Juan reanalysis temperatures');

In [None]:
# For 'sj', replace the four reanalysis temperature columns by their row-wise
# mean ('reanalysis_temp'); the originals are removed in place.
reanalysis_temp_columns_sj = [
    'reanalysis_air_temp_k',
    'reanalysis_dew_point_temp_k',
    'reanalysis_max_air_temp_k',
    'reanalysis_min_air_temp_k',
]
records_sj['reanalysis_temp'] = records_sj[reanalysis_temp_columns_sj].mean(axis='columns')
records_sj.drop(columns=reanalysis_temp_columns_sj, inplace=True)

In [None]:
# Line plot of the four station temperature columns for 'iq'.
# DataFrame.plot creates its own figure when no `ax` is passed, so the former
# plt.figure() call only produced an extra empty figure and has been removed.
# The trailing semicolon hides the Axes repr in the cell output.
records_iq[['station_avg_temp_c', 'station_diur_temp_rng_c',
            'station_max_temp_c', 'station_min_temp_c']].plot(
    figsize=(50, 10), title='Iquitos station temperatures');

In [None]:
# Number of missing values in each station temperature column for 'iq'.
station_temp_columns_iq = [
    'station_avg_temp_c',
    'station_diur_temp_rng_c',
    'station_max_temp_c',
    'station_min_temp_c',
]
records_iq[station_temp_columns_iq].isna().sum()

The max, min and avg series vary in a very similar way, while the diurnal temperature range looks essentially random. Avg looks very much like the average of max and min, and whenever either max or min is missing, avg is missing too. Hence we can keep only min.

In [None]:
# Keep only 'station_min_temp_c' from the station temperature group for 'iq';
# the other three columns are removed in place.
records_iq.drop(
    columns=['station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c'],
    inplace=True,
)

In [None]:
# Line plot of the four station temperature columns for 'sj'.
# DataFrame.plot creates its own figure when no `ax` is passed, so the former
# plt.figure() call only produced an extra empty figure and has been removed.
# The trailing semicolon hides the Axes repr in the cell output.
records_sj[['station_avg_temp_c', 'station_diur_temp_rng_c',
            'station_max_temp_c', 'station_min_temp_c']].plot(
    figsize=(50, 10), title='San Juan station temperatures');

In [None]:
# Number of missing values in each station temperature column for 'sj'.
station_temp_columns_sj = [
    'station_avg_temp_c',
    'station_diur_temp_rng_c',
    'station_max_temp_c',
    'station_min_temp_c',
]
records_sj[station_temp_columns_sj].isna().sum()

In [None]:
# Keep only 'station_min_temp_c' from the station temperature group for 'sj';
# the other three columns are removed in place.
records_sj.drop(
    columns=['station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c'],
    inplace=True,
)

In [None]:
# Final column set of the 'iq' feature table before it is written out.
records_iq.columns

In [None]:
# Final column set of the 'sj' feature table before it is written out.
records_sj.columns

In [None]:
# Write the reduced feature tables, one CSV per city, without the row index.
# The output directory is created first so the cell also runs on a fresh
# checkout where ./generated does not exist yet.
os.makedirs('./generated', exist_ok=True)
records_iq.to_csv('./generated/1-features-iq.csv', index=False)
records_sj.to_csv('./generated/1-features-sj.csv', index=False)

In [None]:
# Read the training labels from the parent directory.
training_targets = pd.read_csv('../dengue_labels_train.csv')

In [None]:
# Remove the 'year' column from the labels table, in place.
training_targets.drop(columns='year', inplace=True)

In [None]:
# Copy week_start_date from the training features onto the labels table.
# The assignment aligns on the row index, not on city/week keys.
# NOTE(review): assumes both CSVs list the same rows in the same order - confirm.
training_targets['week_start_date'] = training_records['week_start_date']

In [None]:
# Split the labels by city code ('sj' / 'iq') and discard the now constant
# 'city' column. The column is named via `columns=` because the positional
# axis argument of DataFrame.drop was removed in pandas 2.0.
training_targets_sj = training_targets[training_targets['city'] == 'sj'].drop(columns='city')
training_targets_iq = training_targets[training_targets['city'] == 'iq'].drop(columns='city')

In [None]:
# Write the per-city training labels, one CSV per city, without the row index.
# The output directory is created first so the cell also runs on a fresh
# checkout where ./generated does not exist yet.
os.makedirs('./generated', exist_ok=True)
training_targets_iq.to_csv('./generated/1-labels-train-iq.csv', index=False)
training_targets_sj.to_csv('./generated/1-labels-train-sj.csv', index=False)

Mosquito life cycle: 8-10 days.
https://www.cdc.gov/dengue/entomologyecology/m_lifecycle.html

Incubation of the dengue virus in the human body can take up to 14 days.
https://en.wikipedia.org/wiki/Dengue_fever

Incubation inside mosquito: 8-12 days. 
https://www.cdc.gov/dengue/epidemiology/index.html

A mosquito lives up to 4 weeks.

Given these time scales, it looks safe to take a 5-week window.