# Preprocess

In [None]:
%matplotlib inline

import os

import matplotlib.pyplot as plotter
import numpy
import pandas
import seaborn

## Import train and test data selectively (without labels)

We import the data, skipping only the `week_start_date` column for now; `year` is kept because later cells use it. 

In [None]:
# Column positions to read from the feature files: 0-23 without index 3
# (week_start_date). Index 1 (year) stays in the selection.
selected_columns = [position for position in range(24) if position != 3]

In [None]:
# Training features, limited to the column positions listed in `selected_columns`.
features_train = pandas.read_csv('dengue_features_train.csv', usecols=selected_columns)

In [None]:
# Test features, read with the same column selection as the training features.
features_test = pandas.read_csv('dengue_features_test.csv', usecols=selected_columns)

In [None]:
# Show which columns were actually loaded for the training features.
features_train.columns

Celsius to Kelvin conversion:

In [None]:
# Shift the absolute station temperatures from Celsius to Kelvin, in place, for both
# the training and the test features. The column names keep their `_c` suffix because
# later cells refer to them by these names.
#
# `station_diur_temp_rng_c` is deliberately left out: judging by its name it is a
# diurnal temperature *range* (a difference between two temperatures), and a
# difference has the same magnitude in Celsius and Kelvin, so the 273.15 offset must
# not be added to it. NOTE(review): confirm against the dataset description.
#
# The update is in place, so re-running this cell shifts the values a second time.
absolute_station_temperatures = ['station_avg_temp_c', 'station_max_temp_c',
                                 'station_min_temp_c']
features_train[absolute_station_temperatures] += 273.15
features_test[absolute_station_temperatures] += 273.15

In [None]:
# Peek at the first five training rows after the temperature conversion.
features_train.head(5)

## Separating data based on apparent differences

In [None]:
# Per-city mean of every feature, to compare the two cities side by side.
features_train.groupby(by='city').mean()

The two cities `sj` and `iq` show different properties. So we separate them.  
Since we split the dataframe by 'city', we don't need it anymore. 

In [None]:
# Split the training features by city and drop the `city` column, which is constant
# within each split. `axis` is passed by keyword: the positional form
# `.drop('city', 1)` is deprecated and no longer accepted by pandas 2.x.
features_train_sj = features_train[features_train['city'] == 'sj'].drop('city', axis=1)
features_train_iq = features_train[features_train['city'] == 'iq'].drop('city', axis=1)

In [None]:
# Split the test features by city and drop the `city` column, which is constant
# within each split. `axis` is passed by keyword: the positional form
# `.drop('city', 1)` is deprecated and no longer accepted by pandas 2.x.
features_test_sj = features_test[features_test['city'] == 'sj'].drop('city', axis=1)
features_test_iq = features_test[features_test['city'] == 'iq'].drop('city', axis=1)

## Feature engineering

In [None]:
# Pairwise column correlations per city: suffix _1 is computed from the training
# features, suffix _2 from the test features.
correlations_sj_1, correlations_iq_1 = features_train_sj.corr(), features_train_iq.corr()
correlations_sj_2, correlations_iq_2 = features_test_sj.corr(), features_test_iq.corr()

### Iquitos

In [None]:
# Kernel-density plot of every Iquitos training feature except the two calendar
# columns, one subplot per feature on a 10 x 2 grid.
iq_kde_input = features_train_iq.drop(['year', 'weekofyear'], axis=1)
iq_kde_input.plot(kind='kde', subplots=True, figsize=(24, 40), layout=(10, 2), sharex=False)

In [None]:
# Heatmap of the Iquitos correlation matrix computed from the training features.
plotter.figure(figsize=(8, 6))
correlations_iq_heatmap_1 = seaborn.heatmap(data=correlations_iq_1)
correlations_iq_heatmap_1.set_title('Iquitos correlations')

In [None]:
# Heatmap of the Iquitos correlation matrix computed from the test features.
plotter.figure(figsize=(8, 6))
correlations_iq_heatmap_2 = seaborn.heatmap(data=correlations_iq_2)
correlations_iq_heatmap_2.set_title('Iquitos correlations')

In [None]:
# The four NDVI columns of the Iquitos training features, one line each.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_iq[['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw']].plot(figsize=(100, 5))

In [None]:
# The four NDVI columns of the Iquitos test features, one line each.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_iq[['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw']].plot(figsize=(100, 5))

All `ndvi`s are awfully similar and this pattern is consistent even with the test data set. 

In [None]:
# New feature: row-wise mean of the four NDVI columns.
features_train_iq['f_ndvi'] = features_train_iq[['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw']].mean(axis=1)

# Overlay every NDVI column and the new mean against week of year, one colour per
# column. Each column gets its own scatter call on a shared axes: passing lists for
# x / y / c relied on the values being flattened and on the colour list being cycled
# over all points, and recent matplotlib versions reject a colour list whose length
# differs from the number of points.
iq_train_ndvi_axes = None
for ndvi_column, ndvi_color in zip(['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw', 'f_ndvi'],
                                   ['red', 'green', 'blue', 'yellow', 'black']):
    iq_train_ndvi_axes = features_train_iq.plot.scatter(
        x='weekofyear', y=ndvi_column, color=ndvi_color, label=ndvi_column,
        figsize=(100, 5), ax=iq_train_ndvi_axes)
# The y axis is shared by all columns, so label it generically.
iq_train_ndvi_axes.set_ylabel('ndvi')

In [None]:
# New feature: row-wise mean of the four NDVI columns.
features_test_iq['f_ndvi'] = features_test_iq[['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw']].mean(axis=1)

# Overlay every NDVI column and the new mean against week of year, one colour per
# column. Each column gets its own scatter call on a shared axes: passing lists for
# x / y / c relied on the values being flattened and on the colour list being cycled
# over all points, and recent matplotlib versions reject a colour list whose length
# differs from the number of points.
iq_test_ndvi_axes = None
for ndvi_column, ndvi_color in zip(['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw', 'f_ndvi'],
                                   ['red', 'green', 'blue', 'yellow', 'black']):
    iq_test_ndvi_axes = features_test_iq.plot.scatter(
        x='weekofyear', y=ndvi_column, color=ndvi_color, label=ndvi_column,
        figsize=(100, 5), ax=iq_test_ndvi_axes)
# The y axis is shared by all columns, so label it generically.
iq_test_ndvi_axes.set_ylabel('ndvi')

In [None]:
# The merged `f_ndvi` feature of the Iquitos training features as a black line.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_iq.plot(y='f_ndvi', color='black', figsize=(100, 5))

In [None]:
# The merged `f_ndvi` feature of the Iquitos test features as a black line.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_iq.plot(y='f_ndvi', color='black', figsize=(100, 5))

It looks like we can merge the four and just get an average. 

In [None]:
# The four NDVI columns are now represented by `f_ndvi`; remove them from both
# Iquitos frames.
iq_ndvi_quadrants = ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw']
features_train_iq = features_train_iq.drop(iq_ndvi_quadrants, axis=1)
features_test_iq = features_test_iq.drop(iq_ndvi_quadrants, axis=1)

In [None]:
# Compare the two precipitation columns of the Iquitos training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_iq[['precipitation_amt_mm', 'reanalysis_sat_precip_amt_mm']].plot(figsize=(100, 5))

In [None]:
# Compare the two precipitation columns of the Iquitos test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_iq[['precipitation_amt_mm', 'reanalysis_sat_precip_amt_mm']].plot(figsize=(100, 5))

They overlap perfectly!

In [None]:
# Keep `precipitation_amt_mm` only; remove the duplicate precipitation column from
# both Iquitos frames.
iq_duplicate_precip = 'reanalysis_sat_precip_amt_mm'
features_train_iq = features_train_iq.drop(iq_duplicate_precip, axis=1)
features_test_iq = features_test_iq.drop(iq_duplicate_precip, axis=1)

In [None]:
# The five reanalysis temperature columns of the Iquitos training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_iq[['reanalysis_air_temp_k', 'reanalysis_avg_temp_k',
                   'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
                   'reanalysis_min_air_temp_k']].plot(figsize=(100, 5))

In [None]:
# The five reanalysis temperature columns of the Iquitos test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_iq[['reanalysis_air_temp_k', 'reanalysis_avg_temp_k',
                  'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
                  'reanalysis_min_air_temp_k']].plot(figsize=(100, 5))

Patterns are similar. Especially, `reanalysis_air_temp_k` and `reanalysis_avg_temp_k` are very similar. `reanalysis_dew_point_temp_k` and `reanalysis_min_air_temp_k` are similar too. The pattern of `reanalysis_max_air_temp_k` is also roughly similar but has higher values. 

In [None]:
# Merge the two similar pairs of reanalysis temperatures into row-wise means:
# air + avg -> f_air_avg_temp, dew point + min air -> f_dew_point_min_air_temp.
features_train_iq['f_air_avg_temp'] = features_train_iq[
    ['reanalysis_air_temp_k', 'reanalysis_avg_temp_k']].mean(axis=1)
features_train_iq['f_dew_point_min_air_temp'] = features_train_iq[
    ['reanalysis_dew_point_temp_k', 'reanalysis_min_air_temp_k']].mean(axis=1)

# Plot the two merged features next to the max air temperature that was left as is.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_iq[['reanalysis_max_air_temp_k', 'f_air_avg_temp',
                   'f_dew_point_min_air_temp']].plot(figsize=(100, 5))

In [None]:
# Merge the two similar pairs of reanalysis temperatures into row-wise means:
# air + avg -> f_air_avg_temp, dew point + min air -> f_dew_point_min_air_temp.
features_test_iq['f_air_avg_temp'] = features_test_iq[
    ['reanalysis_air_temp_k', 'reanalysis_avg_temp_k']].mean(axis=1)
features_test_iq['f_dew_point_min_air_temp'] = features_test_iq[
    ['reanalysis_dew_point_temp_k', 'reanalysis_min_air_temp_k']].mean(axis=1)

# Plot the two merged features next to the max air temperature that was left as is.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_iq[['reanalysis_max_air_temp_k', 'f_air_avg_temp',
                  'f_dew_point_min_air_temp']].plot(figsize=(100, 5))

In [None]:
# The four source temperatures are now covered by the two merged features; remove
# them from both Iquitos frames (reanalysis_max_air_temp_k is kept).
iq_merged_temperature_sources = ['reanalysis_air_temp_k', 'reanalysis_avg_temp_k',
                                 'reanalysis_dew_point_temp_k', 'reanalysis_min_air_temp_k']
features_train_iq = features_train_iq.drop(iq_merged_temperature_sources, axis=1)
features_test_iq = features_test_iq.drop(iq_merged_temperature_sources, axis=1)

In [None]:
# `reanalysis_precip_amt_kg_per_m2` for the Iquitos training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_iq[['reanalysis_precip_amt_kg_per_m2']].plot(figsize=(100, 5))

In [None]:
# `reanalysis_precip_amt_kg_per_m2` for the Iquitos test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_iq[['reanalysis_precip_amt_kg_per_m2']].plot(figsize=(100, 5))

In [None]:
# `reanalysis_relative_humidity_percent` for the Iquitos training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_iq[['reanalysis_relative_humidity_percent']].plot(figsize=(100, 5))

In [None]:
# `reanalysis_relative_humidity_percent` for the Iquitos test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_iq[['reanalysis_relative_humidity_percent']].plot(figsize=(100, 5))

In [None]:
# `reanalysis_specific_humidity_g_per_kg` for the Iquitos training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_iq[['reanalysis_specific_humidity_g_per_kg']].plot(figsize=(100, 5))

In [None]:
# `reanalysis_specific_humidity_g_per_kg` for the Iquitos test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_iq[['reanalysis_specific_humidity_g_per_kg']].plot(figsize=(100, 5))

We won't need this since this is highly correlated with the dew point, already included in temperature readings. 

In [None]:
# Remove the specific-humidity column from both Iquitos frames.
iq_specific_humidity = 'reanalysis_specific_humidity_g_per_kg'
features_train_iq = features_train_iq.drop(iq_specific_humidity, axis=1)
features_test_iq = features_test_iq.drop(iq_specific_humidity, axis=1)

In [None]:
# `reanalysis_tdtr_k` for the Iquitos training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_iq[['reanalysis_tdtr_k']].plot(figsize=(100, 5))

In [None]:
# `reanalysis_tdtr_k` for the Iquitos test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_iq[['reanalysis_tdtr_k']].plot(figsize=(100, 5))

In [None]:
# The four station temperature columns of the Iquitos training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_iq[['station_avg_temp_c', 'station_diur_temp_rng_c',
                   'station_max_temp_c', 'station_min_temp_c']].plot(figsize=(100, 5))

In [None]:
# The four station temperature columns of the Iquitos test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_iq[['station_avg_temp_c', 'station_diur_temp_rng_c',
                  'station_max_temp_c', 'station_min_temp_c']].plot(figsize=(100, 5))

Looks like straight lines with noise. Probably all of these can be eliminated -- or can be combined to make a new feature. **Look at the names! You can just use `avg`.** But since some data is missing from `avg`, a new combined average can be used. Some interpolation is also needed. 

In [None]:
# New feature `f_station_temp`: interpolate the gaps in the avg / max / min station
# temperatures column by column, then average the three per row.
iq_station_temperature_columns = ['station_avg_temp_c', 'station_max_temp_c', 'station_min_temp_c']
features_train_iq['f_station_temp'] = (
    features_train_iq[iq_station_temperature_columns].interpolate().mean(axis=1))
features_test_iq['f_station_temp'] = (
    features_test_iq[iq_station_temperature_columns].interpolate().mean(axis=1))

In [None]:
# The combined station temperature next to the diurnal range, Iquitos training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_iq[['f_station_temp', 'station_diur_temp_rng_c']].plot(figsize=(100, 5))

In [None]:
# The combined station temperature next to the diurnal range, Iquitos test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_iq[['f_station_temp', 'station_diur_temp_rng_c']].plot(figsize=(100, 5))

In [None]:
# The three source columns are now covered by `f_station_temp`; remove them from both
# Iquitos frames.
iq_station_temperature_sources = ['station_avg_temp_c', 'station_max_temp_c',
                                  'station_min_temp_c']
features_train_iq = features_train_iq.drop(iq_station_temperature_sources, axis=1)
features_test_iq = features_test_iq.drop(iq_station_temperature_sources, axis=1)

In [None]:
# `station_precip_mm` for the Iquitos training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_iq[['station_precip_mm']].plot(figsize=(100, 5))

In [None]:
# `station_precip_mm` for the Iquitos test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_iq[['station_precip_mm']].plot(figsize=(100, 5))

In [None]:
# Columns left in the Iquitos training features after feature engineering.
features_train_iq.columns

In [None]:
# Columns left in the Iquitos test features after feature engineering.
features_test_iq.columns

### San Juan

In [None]:
# Kernel-density plot of every San Juan training feature except the two calendar
# columns, one subplot per feature on a 10 x 2 grid.
sj_kde_input = features_train_sj.drop(['year', 'weekofyear'], axis=1)
sj_kde_input.plot(kind='kde', subplots=True, figsize=(24, 40), layout=(10, 2), sharex=False)

In [None]:
# Heatmap of the San Juan correlation matrix computed from the training features.
plotter.figure(figsize=(8, 6))
correlations_sj_heatmap_1 = seaborn.heatmap(data=correlations_sj_1)
correlations_sj_heatmap_1.set_title('San Juan correlations')

In [None]:
# Heatmap of the San Juan correlation matrix computed from the test features.
plotter.figure(figsize=(8, 6))
correlations_sj_heatmap_2 = seaborn.heatmap(data=correlations_sj_2)
correlations_sj_heatmap_2.set_title('San Juan correlations')

In [None]:
# The four NDVI columns of the San Juan training features, one line each.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_sj[['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw']].plot(figsize=(100, 5))

In [None]:
# The four NDVI columns of the San Juan test features, one line each.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_sj[['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw']].plot(figsize=(100, 5))

Unlike in the Iquitos data set, the four features cannot all be combined into one. Instead, `ndvi_ne` + `ndvi_nw` and `ndvi_se` + `ndvi_sw` can be paired. The data looks inaccurate, and much of it is missing. 

In [None]:
# New features: interpolate the gaps in each NDVI column, then average the northern
# pair (ne, nw) and the southern pair (se, sw) per row.
features_train_sj['f_ndvi_n'] = features_train_sj[['ndvi_ne', 'ndvi_nw']].interpolate().mean(axis=1)
features_train_sj['f_ndvi_s'] = features_train_sj[['ndvi_se', 'ndvi_sw']].interpolate().mean(axis=1)

# Overlay every NDVI column and the two new means against week of year, one colour
# per column. Each column gets its own scatter call on a shared axes: passing lists
# for x / y / c relied on the values being flattened and on the colour list being
# cycled over all points, and recent matplotlib versions reject a colour list whose
# length differs from the number of points.
sj_train_ndvi_axes = None
for ndvi_column, ndvi_color in zip(
        ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw', 'f_ndvi_n', 'f_ndvi_s'],
        ['red', 'green', 'blue', 'yellow', 'black', 'brown']):
    sj_train_ndvi_axes = features_train_sj.plot.scatter(
        x='weekofyear', y=ndvi_column, color=ndvi_color, label=ndvi_column,
        figsize=(100, 5), ax=sj_train_ndvi_axes)
# The y axis is shared by all columns, so label it generically.
sj_train_ndvi_axes.set_ylabel('ndvi')

In [None]:
# New features: interpolate the gaps in each NDVI column, then average the northern
# pair (ne, nw) and the southern pair (se, sw) per row.
features_test_sj['f_ndvi_n'] = features_test_sj[['ndvi_ne', 'ndvi_nw']].interpolate().mean(axis=1)
features_test_sj['f_ndvi_s'] = features_test_sj[['ndvi_se', 'ndvi_sw']].interpolate().mean(axis=1)

# Overlay every NDVI column and the two new means against week of year, one colour
# per column. Each column gets its own scatter call on a shared axes: passing lists
# for x / y / c relied on the values being flattened and on the colour list being
# cycled over all points, and recent matplotlib versions reject a colour list whose
# length differs from the number of points.
sj_test_ndvi_axes = None
for ndvi_column, ndvi_color in zip(
        ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw', 'f_ndvi_n', 'f_ndvi_s'],
        ['red', 'green', 'blue', 'yellow', 'black', 'brown']):
    sj_test_ndvi_axes = features_test_sj.plot.scatter(
        x='weekofyear', y=ndvi_column, color=ndvi_color, label=ndvi_column,
        figsize=(100, 5), ax=sj_test_ndvi_axes)
# The y axis is shared by all columns, so label it generically.
sj_test_ndvi_axes.set_ylabel('ndvi')

In [None]:
# The two merged NDVI features of the San Juan training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_sj[['f_ndvi_n', 'f_ndvi_s']].plot(figsize=(100, 5))

In [None]:
# The two merged NDVI features of the San Juan test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_sj[['f_ndvi_n', 'f_ndvi_s']].plot(figsize=(100, 5))

In [None]:
# The four NDVI columns are now represented by `f_ndvi_n` / `f_ndvi_s`; remove them
# from both San Juan frames.
sj_ndvi_quadrants = ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw']
features_train_sj = features_train_sj.drop(sj_ndvi_quadrants, axis=1)
features_test_sj = features_test_sj.drop(sj_ndvi_quadrants, axis=1)

In [None]:
# Compare the two precipitation columns of the San Juan training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_sj[['precipitation_amt_mm', 'reanalysis_sat_precip_amt_mm']].plot(figsize=(100, 5))

In [None]:
# Compare the two precipitation columns of the San Juan test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_sj[['precipitation_amt_mm', 'reanalysis_sat_precip_amt_mm']].plot(figsize=(100, 5))

Redundant data -- all the same. 

In [None]:
# Keep `precipitation_amt_mm` only; remove the duplicate precipitation column from
# both San Juan frames.
sj_duplicate_precip = 'reanalysis_sat_precip_amt_mm'
features_train_sj = features_train_sj.drop(sj_duplicate_precip, axis=1)
features_test_sj = features_test_sj.drop(sj_duplicate_precip, axis=1)

In [None]:
# The five reanalysis temperature columns of the San Juan training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_sj[['reanalysis_air_temp_k', 'reanalysis_avg_temp_k',
                   'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
                   'reanalysis_min_air_temp_k']].plot(figsize=(100, 5))

In [None]:
# The five reanalysis temperature columns of the San Juan test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_sj[['reanalysis_air_temp_k', 'reanalysis_avg_temp_k',
                  'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
                  'reanalysis_min_air_temp_k']].plot(figsize=(100, 5))

Since the patterns are very similar, merging them should do no harm. 

In [None]:
# New feature `f_temp`: row-wise mean of all five reanalysis temperature columns.
features_train_sj['f_temp'] = features_train_sj[
    ['reanalysis_air_temp_k', 'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
     'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k']].mean(axis=1)

# Plot the merged feature.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_sj[['f_temp']].plot(figsize=(100, 5))

In [None]:
# New feature `f_temp`: row-wise mean of all five reanalysis temperature columns.
features_test_sj['f_temp'] = features_test_sj[
    ['reanalysis_air_temp_k', 'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
     'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k']].mean(axis=1)

# Plot the merged feature.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_sj[['f_temp']].plot(figsize=(100, 5))

In [None]:
# All five reanalysis temperatures are now covered by `f_temp`; remove them from both
# San Juan frames.
sj_merged_temperature_sources = ['reanalysis_air_temp_k', 'reanalysis_avg_temp_k',
                                 'reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k',
                                 'reanalysis_min_air_temp_k']
features_train_sj = features_train_sj.drop(sj_merged_temperature_sources, axis=1)
features_test_sj = features_test_sj.drop(sj_merged_temperature_sources, axis=1)

In [None]:
# `reanalysis_precip_amt_kg_per_m2` for the San Juan training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_sj[['reanalysis_precip_amt_kg_per_m2']].plot(figsize=(100, 5))

In [None]:
# `reanalysis_precip_amt_kg_per_m2` for the San Juan test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_sj[['reanalysis_precip_amt_kg_per_m2']].plot(figsize=(100, 5))

In [None]:
# `reanalysis_relative_humidity_percent` for the San Juan training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_sj[['reanalysis_relative_humidity_percent']].plot(figsize=(100, 5))

In [None]:
# `reanalysis_relative_humidity_percent` for the San Juan test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_sj[['reanalysis_relative_humidity_percent']].plot(figsize=(100, 5))

In [None]:
# `reanalysis_specific_humidity_g_per_kg` for the San Juan training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_sj[['reanalysis_specific_humidity_g_per_kg']].plot(figsize=(100, 5))

In [None]:
# `reanalysis_specific_humidity_g_per_kg` for the San Juan test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_sj[['reanalysis_specific_humidity_g_per_kg']].plot(figsize=(100, 5))

Similar to the parameters in Iquitos, we can omit this. 

In [None]:
# Remove the specific-humidity column from both San Juan frames.
sj_specific_humidity = ['reanalysis_specific_humidity_g_per_kg']
features_train_sj = features_train_sj.drop(sj_specific_humidity, axis=1)
features_test_sj = features_test_sj.drop(sj_specific_humidity, axis=1)

In [None]:
# `reanalysis_tdtr_k` for the San Juan training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_sj[['reanalysis_tdtr_k']].plot(figsize=(100, 5))

In [None]:
# `reanalysis_tdtr_k` for the San Juan test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_sj[['reanalysis_tdtr_k']].plot(figsize=(100, 5))

Looks very random. 

In [None]:
# The four station temperature columns of the San Juan training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_sj[['station_avg_temp_c', 'station_diur_temp_rng_c',
                   'station_max_temp_c', 'station_min_temp_c']].plot(figsize=(100, 5))

In [None]:
# The four station temperature columns of the San Juan test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_sj[['station_avg_temp_c', 'station_diur_temp_rng_c',
                  'station_max_temp_c', 'station_min_temp_c']].plot(figsize=(100, 5))

Unlike in Iquitos, these lines don't look like straight lines with noise (except perhaps `station_diur_temp_rng_c`). Three of them can be combined. **On a second look, the three that can be combined don't need combining: instead of `max`, `avg` and `min`, `avg` alone can be used.** This should also hold for Iquitos. 

In [None]:
# New feature `f_station_temp`: the interpolated average station temperature alone.
features_train_sj['f_station_temp'] = features_train_sj[['station_avg_temp_c']].interpolate()
features_test_sj['f_station_temp'] = features_test_sj[['station_avg_temp_c']].interpolate()

# The avg / max / min source columns are no longer needed in either San Juan frame.
sj_station_temperature_sources = ['station_avg_temp_c', 'station_max_temp_c', 'station_min_temp_c']
features_train_sj = features_train_sj.drop(sj_station_temperature_sources, axis=1)
features_test_sj = features_test_sj.drop(sj_station_temperature_sources, axis=1)

In [None]:
# `station_precip_mm` for the San Juan training features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_train_sj[['station_precip_mm']].plot(figsize=(100, 5))

In [None]:
# `station_precip_mm` for the San Juan test features.
# DataFrame.plot opens its own figure when no `ax` is passed, so the former
# plotter.figure() call only left an empty figure behind and is dropped.
features_test_sj[['station_precip_mm']].plot(figsize=(100, 5))

In [None]:
# Columns left in the San Juan training features after feature engineering.
features_train_sj.columns

In [None]:
# Columns left in the San Juan test features after feature engineering.
features_test_sj.columns

## Processing outliers

## Impute missing values

In [None]:
# Fill the remaining gaps in all four frames by linear interpolation down each column.
# limit_direction='both' also fills gaps at the very start of a column; the default
# direction only fills forward, so leading NaNs would survive into the saved
# datasets.
features_train_sj = features_train_sj.interpolate(limit_direction='both')
features_train_iq = features_train_iq.interpolate(limit_direction='both')
features_test_iq = features_test_iq.interpolate(limit_direction='both')
features_test_sj = features_test_sj.interpolate(limit_direction='both')

## Preprocess training labels

In [None]:
# Training labels: only column positions 0 and 3 are read. The cells below use them
# under the names `city` and `total_cases`.
labels_train = pandas.read_csv('dengue_labels_train.csv', usecols=[0, 3])

In [None]:
# Split the labels by city and drop the `city` column, which is constant within each
# split. `axis` is passed by keyword: the positional form `.drop('city', 1)` is
# deprecated and no longer accepted by pandas 2.x.
labels_train_sj = labels_train[labels_train['city'] == 'sj'].drop('city', axis=1)
labels_train_iq = labels_train[labels_train['city'] == 'iq'].drop('city', axis=1)

## Combine data and labels

In [None]:
# Attach the labels to the features of each city as extra columns; concat aligns the
# two frames on their row index.
features_train_sj_labels = pandas.concat(
    [features_train_sj, labels_train_sj],
    axis=1,
)
features_train_iq_labels = pandas.concat(
    [features_train_iq, labels_train_iq],
    axis=1,
)

## Get a little more insight

In [None]:
# Correlation matrices of the engineered features together with the label column.
correlations_sj, correlations_iq = (
    features_train_sj_labels.corr(),
    features_train_iq_labels.corr(),
)

In [None]:
# Heatmap of the Iquitos correlations between engineered features and labels.
plotter.figure(figsize=(8, 6))
correlations_iq_heatmap = seaborn.heatmap(data=correlations_iq)
correlations_iq_heatmap.set_title('Iquitos correlations')

In [None]:
# Heatmap of the San Juan correlations between engineered features and labels.
plotter.figure(figsize=(8, 6))
correlations_sj_heatmap = seaborn.heatmap(data=correlations_sj)
correlations_sj_heatmap.set_title('San Juan correlations')

In [None]:
# Horizontal bars of each Iquitos feature's correlation with `total_cases`, sorted in
# descending order; the label's correlation with itself is left out.
plotter.figure(figsize=(10, 5))
iq_case_correlations = correlations_iq['total_cases'].drop('total_cases')
iq_case_correlations.sort_values(ascending=False).plot.barh()

In [None]:
# Horizontal bars of each San Juan feature's correlation with `total_cases`, sorted
# in descending order; the label's correlation with itself is left out.
plotter.figure(figsize=(10, 5))
sj_case_correlations = correlations_sj['total_cases'].drop('total_cases')
sj_case_correlations.sort_values(ascending=False).plot.barh()

## Save the resultant datasets

In [None]:
# Make sure the output directory exists before the first write; on a fresh checkout
# without it, to_csv fails because it does not create missing directories.
os.makedirs('./generated', exist_ok=True)

# Write the engineered features per city and split, without the row index.
features_train_sj.to_csv('./generated/dengue_features_train_sj.csv', index=False)
features_test_sj.to_csv('./generated/dengue_features_test_sj.csv', index=False)
features_train_iq.to_csv('./generated/dengue_features_train_iq.csv', index=False)
features_test_iq.to_csv('./generated/dengue_features_test_iq.csv', index=False)

In [None]:
# Write the per-city training labels, without the row index.
for city_labels, labels_target in (
        (labels_train_sj, './generated/dengue_labels_train_sj.csv'),
        (labels_train_iq, './generated/dengue_labels_train_iq.csv')):
    city_labels.to_csv(labels_target, index=False)

In [None]:
# Write the combined feature + label frames per city, without the row index.
for labelled_features, labelled_target in (
        (features_train_sj_labels, './generated/dengue_features_labels_train_sj.csv'),
        (features_train_iq_labels, './generated/dengue_features_labels_train_iq.csv')):
    labelled_features.to_csv(labelled_target, index=False)

## Smoothing and considering time series

In [None]:
# Build the smoothed training sets. Per city, the result holds side by side:
#   1. `year` and `weekofyear`, unchanged;
#   2. every other feature shifted down one row, renamed with an `s1_` prefix and
#      smoothed with a 5-row rolling mean;
#   3. every other feature smoothed with a 5-row rolling mean;
#   4. the labels.
# The NaNs that shift / rolling leave in the first rows are back-filled at the end.
calendar_columns = ['year', 'weekofyear']

iq_train_measurements = features_train_iq[features_train_iq.columns.difference(calendar_columns)]
features_train_iq_s1_labels = pandas.concat(
    [
        features_train_iq[calendar_columns],
        iq_train_measurements.shift(1).rename(columns=lambda name: 's1_' + name).rolling(5).mean(),
        iq_train_measurements.rolling(5).mean(),
        labels_train_iq,
    ],
    axis=1,
).bfill()

sj_train_measurements = features_train_sj[features_train_sj.columns.difference(calendar_columns)]
features_train_sj_s1_labels = pandas.concat(
    [
        features_train_sj[calendar_columns],
        sj_train_measurements.shift(1).rename(columns=lambda name: 's1_' + name).rolling(5).mean(),
        sj_train_measurements.rolling(5).mean(),
        labels_train_sj,
    ],
    axis=1,
).bfill()

In [None]:
# Correlation matrices of the smoothed / shifted training sets, labels included.
correlations_iq_s1, correlations_sj_s1 = (
    features_train_iq_s1_labels.corr(),
    features_train_sj_s1_labels.corr(),
)

In [None]:
# Heatmap of the Iquitos correlations for the smoothed / shifted training set.
plotter.figure(figsize=(8, 6))
correlations_iq_s1_heatmap = seaborn.heatmap(data=correlations_iq_s1)
correlations_iq_s1_heatmap.set_title('Iquitos correlations')

In [None]:
# Heatmap of the San Juan correlations for the smoothed / shifted training set.
plotter.figure(figsize=(8, 6))
correlations_sj_s1_heatmap = seaborn.heatmap(correlations_sj_s1)
# The data shown is San Juan's; the title previously read 'Iquitos correlations'.
plotter.title('San Juan correlations')

In [None]:
# Horizontal bars of each smoothed / shifted Iquitos feature's correlation with
# `total_cases`, sorted in descending order; the label itself is left out.
plotter.figure(figsize=(10, 8))
iq_s1_case_correlations = correlations_iq_s1['total_cases'].drop('total_cases')
iq_s1_case_correlations.sort_values(ascending=False).plot.barh()

In [None]:
# Horizontal bars of each smoothed / shifted San Juan feature's correlation with
# `total_cases`, sorted in descending order; the label itself is left out.
plotter.figure(figsize=(10, 8))
sj_s1_case_correlations = correlations_sj_s1['total_cases'].drop('total_cases')
sj_s1_case_correlations.sort_values(ascending=False).plot.barh()

In [None]:
# All columns of the smoothed Iquitos training set except `year`, one line each.
iq_s1_without_year = features_train_iq_s1_labels.drop('year', axis=1)
iq_s1_without_year.plot(figsize=(100, 8))

In [None]:
# All columns of the smoothed San Juan training set except `year`, one line each.
sj_s1_without_year = features_train_sj_s1_labels.drop('year', axis=1)
sj_s1_without_year.plot(figsize=(100, 8))

In [None]:
# Build the smoothed test sets. Per city, the result holds side by side:
#   1. `year` and `weekofyear`, unchanged;
#   2. every other feature shifted down one row, renamed with an `s1_` prefix and
#      smoothed with a 5-row rolling mean;
#   3. every other feature smoothed with a 5-row rolling mean.
# The NaNs that shift / rolling leave in the first rows are back-filled at the end.
test_calendar_columns = ['year', 'weekofyear']

iq_test_measurements = features_test_iq[features_test_iq.columns.difference(test_calendar_columns)]
features_test_iq_s1 = pandas.concat(
    [
        features_test_iq[test_calendar_columns],
        iq_test_measurements.shift(1).rename(columns=lambda name: 's1_' + name).rolling(5).mean(),
        iq_test_measurements.rolling(5).mean(),
    ],
    axis=1,
).bfill()

sj_test_measurements = features_test_sj[features_test_sj.columns.difference(test_calendar_columns)]
features_test_sj_s1 = pandas.concat(
    [
        features_test_sj[test_calendar_columns],
        sj_test_measurements.shift(1).rename(columns=lambda name: 's1_' + name).rolling(5).mean(),
        sj_test_measurements.rolling(5).mean(),
    ],
    axis=1,
).bfill()

In [None]:
# Write the smoothed / shifted datasets, without the row index: first the two test
# sets, then the two labelled training sets.
for smoothed_frame, smoothed_target in (
        (features_test_sj_s1, './generated/dengue_features_test_s1_sj.csv'),
        (features_test_iq_s1, './generated/dengue_features_test_s1_iq.csv'),
        (features_train_sj_s1_labels, './generated/dengue_features_labels_train_s1_sj.csv'),
        (features_train_iq_s1_labels, './generated/dengue_features_labels_train_s1_iq.csv')):
    smoothed_frame.to_csv(smoothed_target, index=False)