Most of these data are time series. We can use R's STL to separate the seasonal, trend and residual components. (The result looks slightly better and smoother than that of `statsmodels`'s `seasonal_decompose`.)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from rpy2.robjects import r

from IPython.display import display, HTML
%matplotlib inline

In [None]:
# Load the `iq` and `sj` feature tables. Both files are read the same way:
# `week_start_date` is parsed as a datetime and becomes the index.
_feature_csv_options = {
    'parse_dates': ['week_start_date'],
    'index_col': 'week_start_date',
}
records_iq = pd.read_csv('./generated/1-features-iq.csv', **_feature_csv_options)
records_sj = pd.read_csv('./generated/1-features-sj.csv', **_feature_csv_options)

Impute data using linear interpolation to get a first approximation. 

In [None]:
# First-pass imputation: fill gaps with pandas' default (linear)
# interpolation, then discard the `weekofyear` column.
records_iq_i1 = records_iq.interpolate().drop(columns=['weekofyear'])
records_sj_i1 = records_sj.interpolate().drop(columns=['weekofyear'])

Apply a moving average (window of 5) to smooth things out.

In [None]:
# Smooth every column with a 5-row rolling mean. The rolling mean is NaN
# until the window is full, so back-fill those rows from the next valid value.
records_iq_i1, records_sj_i1 = (
    frame.rolling(window=5).mean().bfill()
    for frame in (records_iq_i1, records_sj_i1)
)

In [None]:
def stl_decompose(df, column, freq=52):
    """Split one column of ``df`` into trend, seasonal and residual parts.

    The column is passed to R as a ``ts`` object and fitted with
    ``stl(..., 'periodic', robust=True)`` through rpy2.

    Args:
        df: DataFrame holding the series; its index is reused for the result.
        column: Name of the column to decompose.
        freq: ``frequency`` handed to R's ``ts`` (defaults to 52).

    Returns:
        A DataFrame indexed like ``df`` with the columns ``'trend'``,
        ``'seasonal'`` and ``'residuals'`` (in that order).
    """
    values = list(df[column].values)
    n_obs = len(values)

    fit = r.stl(r.ts(values, frequency=freq), 'periodic', robust=True)

    # The fit's 'time.series' component flattens into three consecutive runs
    # of n_obs values, read here as seasonal, trend and remainder in that
    # order (R stores matrices column by column; see R's `stl` documentation).
    flat = list(fit.rx2('time.series'))
    seasonal, trend, residuals = (
        flat[i * n_obs:(i + 1) * n_obs] for i in range(3)
    )

    return pd.DataFrame(
        {'trend': trend, 'seasonal': seasonal, 'residuals': residuals},
        index=df.index,
    )

In [None]:
def stl_multi_decompose(df):
    """Run ``stl_decompose`` on every column of ``df`` and regroup by component.

    As a side effect, an HTML table with the per-column mean and standard
    deviation of each component is displayed.

    Args:
        df: DataFrame whose columns are each decomposed in turn.

    Returns:
        A dict with the keys ``'trend'``, ``'seasonal'`` and ``'residuals'``;
        each value is a DataFrame indexed like ``df`` with one column per
        column of ``df``.
    """
    components = {
        name: pd.DataFrame(index=df.index)
        for name in ('trend', 'seasonal', 'residuals')
    }

    for col in df.columns:
        decomposed = stl_decompose(df, col)
        for name, frame in components.items():
            frame[col] = decomposed[name]

    # Summary columns: the three means first, then the three standard
    # deviations, each in trend / seasonal / residuals order.
    summary = pd.DataFrame({
        name + '_' + stat: getattr(frame, stat)()
        for stat in ('mean', 'std')
        for name, frame in components.items()
    })
    display(HTML(summary.to_html()))

    return components

In [None]:
def correlation_heatmap_drop(df, title, f):
    """Plot the correlation heatmap of ``df`` and list its redundant columns.

    Despite the name, nothing is dropped here: the function only returns the
    candidate column names and leaves ``df`` untouched.

    Args:
        df: DataFrame whose pairwise column correlations are computed.
        title: Title for the heatmap figure.
        f: Correlation threshold; a column is flagged when its correlation
            with any column that comes before it is above ``f`` or below
            ``-f``.

    Returns:
        A list of the flagged column names, in column order.
    """
    corr = df.corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr)
    plt.title(title)

    # Keep only the strict upper triangle so each pair is examined once and a
    # column is never compared with itself. The builtin `bool` is used because
    # the `np.bool` alias was removed from NumPy.
    upper_mask = np.triu(np.ones(corr.shape), k=1).astype(bool)
    corr_upper = corr.where(upper_mask)

    # NaN entries (the masked lower triangle) compare as False, so they never
    # flag a column.
    exceeds = (corr_upper.abs() > f).any()
    return corr_upper.columns[exceeds].tolist()

In [None]:
def drop_extreme_correlated(dec_df, f=0.95, city='Iquitos'):
    """Drop highly correlated columns from every frame in ``dec_df``, in place.

    For each entry, a correlation heatmap is plotted, the columns flagged by
    ``correlation_heatmap_drop`` are printed, and those columns are removed
    from the frame itself (the frames in ``dec_df`` are mutated).

    Args:
        dec_df: Mapping of component name to DataFrame.
        f: Correlation threshold forwarded to ``correlation_heatmap_drop``.
        city: Label used as the heatmap title prefix. Defaults to
            ``'Iquitos'``, the label that used to be hard-coded; pass another
            label when the frames belong to a different data set.

    Returns:
        None.
    """
    for component, frame in dec_df.items():
        # Separate the label from the component name so the title reads
        # "Iquitos trend" rather than "Iquitostrend".
        to_drop = correlation_heatmap_drop(frame, city + ' ' + component, f)
        print('Dropping ' + str(to_drop) + ' in ' + component)
        frame.drop(to_drop, axis=1, inplace=True)

In [None]:
# Decompose every column of the smoothed `iq` features; this also displays the
# per-column mean/std summary table.
dec_iq = stl_multi_decompose(records_iq_i1)

In [None]:
# Plot one correlation heatmap per component and remove, in place, the columns
# flagged at the default 0.95 threshold.
drop_extreme_correlated(dec_iq)

In [None]:
# Decompose every column of the smoothed `sj` features; this also displays the
# per-column mean/std summary table.
dec_sj = stl_multi_decompose(records_sj_i1)

In [None]:
# Plot one correlation heatmap per component and remove, in place, the columns
# flagged at the default 0.95 threshold.
# NOTE(review): the heatmap titles produced by this call are prefixed with
# 'Iquitos' even though the frames come from the `sj` data.
drop_extreme_correlated(dec_sj)

NDVIs show correlations inside residuals. Maybe this is not exactly a time series. 

In [None]:
# Merge the component frames side by side, prefixing each column with the
# name of the component it came from (e.g. 'trend_').
stl_dec_iq = pd.concat(
    [frame.add_prefix(component + '_') for component, frame in dec_iq.items()],
    axis=1)
stl_dec_sj = pd.concat(
    [frame.add_prefix(component + '_') for component, frame in dec_sj.items()],
    axis=1)

In [None]:
# Summary statistics of the combined `iq` decomposition columns.
stl_dec_iq.describe()

In [None]:
# Summary statistics of the combined `sj` decomposition columns.
stl_dec_sj.describe()

In [None]:
# One subplot per column, each with its own y-axis; the figure is very tall
# (12 x 120) to leave room for all of them.
stl_dec_iq.plot(subplots=True, sharey=False, figsize=(12, 120))

In [None]:
# One subplot per column, each with its own y-axis; the figure is very tall
# (12 x 120) to leave room for all of them.
stl_dec_sj.plot(subplots=True, sharey=False, figsize=(12, 120))

In [None]:
# Persist the decomposed feature tables, labelling the index column
# `week_start_date` in the CSV header.
stl_dec_iq.to_csv(
    './generated/2-time-dec-features-iq.csv',
    index_label='week_start_date',
)
stl_dec_sj.to_csv(
    './generated/2-time-dec-features-sj.csv',
    index_label='week_start_date',
)

---

 Try loading total cases:

In [None]:
# Load the `iq` and `sj` training label tables. Both files are read the same
# way: `week_start_date` is parsed as a datetime and becomes the index.
_label_csv_options = {
    'parse_dates': ['week_start_date'],
    'index_col': 'week_start_date',
}
training_targets_iq = pd.read_csv('./generated/1-labels-train-iq.csv', **_label_csv_options)
training_targets_sj = pd.read_csv('./generated/1-labels-train-sj.csv', **_label_csv_options)

In [None]:
# int not supported - float only
training_targets_iq['total_cases'] += 0.0
training_targets_sj['total_cases'] += 0.0

In [None]:
# Decompose the `total_cases` column of each label table into trend, seasonal
# and residual columns (default frequency of 52).
dec_targets_iq = stl_decompose(training_targets_iq, 'total_cases')
dec_targets_sj = stl_decompose(training_targets_sj, 'total_cases')

In [None]:
# Plot the three decomposed `total_cases` components for `iq` on shared axes.
dec_targets_iq.plot(figsize=(14, 10))

In [None]:
# Plot the three decomposed `total_cases` components for `sj` on shared axes.
dec_targets_sj.plot(figsize=(14, 10))

Looks pointless. 

In [None]:
# Persist the decomposed targets. The index label is passed explicitly, as it
# is for the feature files, so the CSV header does not depend on the index
# still carrying its `week_start_date` name.
dec_targets_iq.to_csv('./generated/1-labels-iq.csv', index_label='week_start_date')
dec_targets_sj.to_csv('./generated/1-labels-sj.csv', index_label='week_start_date')