In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the long-format observations (pivoted later on date / series_id / value).
raw = pd.read_csv(filepath_or_buffer="datasets/viz_eda.csv")

In [None]:
# Preview the first five rows of the raw observations.
raw.head(n=5)

In [None]:
# Reshape to wide format: one row per date, one column per series_id.
df = raw.pivot(index="date", columns="series_id", values="value")
# Carry the last known observation forward over gaps.
# `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
df = df.ffill()

## Growth

In [None]:
# Per-series first difference of `value`, computed one series at a time so a
# difference never spans two different series.
ls = []
for _, group in raw.groupby("series_id"):
    # `assign` builds a new frame instead of writing a column onto the groupby
    # slice, which avoids pandas' SettingWithCopyWarning and makes an extra
    # `.copy()` unnecessary.
    ls.append(group.assign(diff=group["value"].diff()).drop(columns="value"))

In [None]:
# Stack the per-series differences, discard rows containing missing values
# (this includes each series' first row, whose diff is NaN), reshape to wide
# format and leave out the SP500 column.
diff = (
    pd.concat(ls)
    .dropna()
    .pivot(index="date", columns="series_id", values="diff")
    .drop(columns="SP500")
)

In [None]:
# Preview the first five rows of the wide difference table.
diff.head(n=5)

In [None]:
# Forward-fill gaps in the differenced series.
# `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
diff = diff.ffill()

## Technical Indicators

In [None]:
def get_technical(sp500, column="SP500"):
    """Build lagged technical-indicator features from a price column.

    Parameters
    ----------
    sp500 : pandas.DataFrame
        Frame holding the price series in ``column``. It is not modified.
    column : str, default "SP500"
        Name of the price column to derive the indicators from.

    Returns
    -------
    pandas.DataFrame
        Same index as ``sp500`` with the columns ``ma7``, ``ma21``, ``26ema``,
        ``12ema``, ``MACD``, ``ema``, ``momentum``, ``upper_band`` and
        ``lower_band``. Every column is shifted down by one row, so the value
        on a given row is computed only from earlier rows.
    """
    price = sp500[column]
    tech = sp500.copy()

    # Simple moving averages.
    tech['ma7'] = price.rolling(7).mean()
    tech['ma21'] = price.rolling(21).mean()

    # Exponential moving averages and their spread (MACD).
    tech['26ema'] = price.ewm(span=26).mean()
    tech['12ema'] = price.ewm(span=12).mean()
    tech['MACD'] = tech['12ema'] - tech['26ema']

    # Bands: 21-row mean +/- two 20-row standard deviations (windows kept as in
    # the original feature definition).
    rolling_sd = price.rolling(20).std()
    tech['upper_band'] = tech['ma21'] + (rolling_sd * 2)
    tech['lower_band'] = tech['ma21'] - (rolling_sd * 2)

    tech['ema'] = price.ewm(com=0.5).mean()
    tech['momentum'] = price.diff()

    feature_cols = ["ma7", "ma21", "26ema", "12ema", "MACD", "ema",
                    "momentum", "upper_band", "lower_band"]
    # Lag by one row so that a row's features never use that row's own price.
    return tech[feature_cols].shift(periods=1)

In [None]:
# Training observations, restricted to the SP500 series.
train = pd.read_csv("datasets/observations_train.csv")
train = train.loc[train["series_id"].eq("SP500")]

In [None]:
# Test observations, restricted to the SP500 series.
test = pd.read_csv("datasets/observations_test.csv")
test = test.loc[test["series_id"].eq("SP500")]

In [None]:
# Stack train and test rows, reshape to one column per series_id indexed by
# date, and drop dates with missing values.
merged = (
    pd.concat([train, test])
    .pivot(index="date", columns="series_id", values="value")
    .dropna()
)

In [None]:
# Keep only the first 10 characters of each date label
# (presumably the YYYY-MM-DD part - confirm against the CSV).
merged.index = merged.index.str[:10]

In [None]:
# Preview the last five rows after the index labels were shortened.
merged.tail(n=5)

In [None]:
# Lagged technical indicators for the combined train + test SP500 history.
tech = get_technical(sp500=merged)

In [None]:
# Persist the indicators as computed (this runs before they are standardized).
tech.to_csv(path_or_buf="datasets/technical.csv")

In [None]:
# Line plot of every indicator column against the date index.
tech.plot(kind="line")

In [None]:
# Standardize each indicator column to zero mean and unit standard deviation.
# NOTE(review): the statistics are taken over every row of `tech`, which was
# built from train and test rows combined - confirm this is intended.
tech_mean = tech.mean()
tech_std = tech.std()
tech = (tech - tech_mean) / tech_std

## Fourier transforms 

In [None]:
import numpy as np
import pylab as pl
from numpy import fft

In [None]:
def fourierExtrapolation(x, n_predict, n_harm):
    """Extend a 1-D signal with a truncated Fourier series plus a linear trend.

    Parameters
    ----------
    x : numpy.ndarray
        One-dimensional array of samples.
    n_predict : int
        Number of extra samples to generate past the end of ``x``.
    n_harm : int
        Number of harmonics to keep; the ``1 + 2 * n_harm`` FFT components
        with the smallest absolute frequency are used.

    Returns
    -------
    numpy.ndarray
        Array of length ``x.size + n_predict``: the reconstruction of ``x``
        followed by the extrapolated samples.
    """
    n = x.size
    steps = np.arange(0, n)

    # Slope of the least-squares line; only the slope is removed, so the
    # intercept remains in the zero-frequency component.
    slope = np.polyfit(steps, x, 1)[0]
    spectrum = fft.fft(x - slope * steps)
    freqs = fft.fftfreq(n)

    # Component indices ordered from lowest to highest absolute frequency
    # (stable sort, so ties keep their original index order).
    by_frequency = sorted(range(n), key=lambda k: np.absolute(freqs[k]))
    kept = by_frequency[:1 + n_harm * 2]

    horizon = np.arange(0, n + n_predict)
    signal = np.zeros(horizon.size)
    for k in kept:
        amplitude = np.absolute(spectrum[k]) / n
        phase = np.angle(spectrum[k])
        signal += amplitude * np.cos(2 * np.pi * freqs[k] * horizon + phase)

    # Put the linear trend back, extended over the prediction horizon.
    return signal + slope * horizon

In [None]:
# Number of rows in the combined SP500 history.
merged.shape[0]

In [None]:
# Independent copy of the price table; the Fourier forecast columns are added to it.
ft = merged.copy(deep=True)

In [None]:
# Rolling one-step-ahead Fourier forecasts for several harmonic counts.
# For row i the model sees only the (up to) 1000 rows before it, so the value
# stored on row i never uses row i itself.
price_values = merged.values[:, 0]  # first column of `merged`, hoisted out of the loops
for harm in [3, 5, 10, 100]:
    pred = []
    for i in range(2, len(merged)):
        window = price_values[max(0, i - 1000):i]
        pred.append(fourierExtrapolation(window, 1, harm)[-1])
    # The first two rows have no forecast. Use `np.nan`: the `np.NaN` alias
    # was removed in NumPy 2.0 and raises AttributeError there.
    ft["ft" + str(harm)] = [np.nan, np.nan] + pred

In [None]:
# Plot the last 500 rows of the price and its Fourier forecasts.
ft.iloc[-500:].plot()

In [None]:
# Persist the Fourier features as computed (this runs before they are standardized).
ft.to_csv(path_or_buf="datasets/FT.csv")

In [None]:
# Standardize each column of the Fourier feature table to zero mean and unit
# standard deviation.
ft_mean = ft.mean()
ft_std = ft.std()
ft = (ft - ft_mean) / ft_std

## Date

In [None]:
# Parse the index labels as timestamps, then take each one's day of the year.
parsed_dates = pd.to_datetime(df.index.values)
doy = parsed_dates.dayofyear

In [None]:
# Display the raw index labels of `df` (the values the day-of-year is parsed from).
df.index.values

In [None]:
# Encode the day of the year as a point on the unit circle so that late
# December and early January end up close together.
# Note: the period is fixed at 365, so day 366 of a leap year maps slightly
# past one full turn.
year_angle = 2 * np.pi * doy / 365
df['SIN_DATE'] = np.sin(year_angle)
df['COS_DATE'] = np.cos(year_angle)

## Merge Everything

In [None]:
# Left-join every feature table onto `df` by index, in order:
#   1. Fourier forecasts (without their SP500 column)
#   2. technical indicators
#   3. the SP500 price table
#   4. differenced series - columns that clash with existing names get "_DIFF"
df = (
    df
    .merge(ft.drop(columns="SP500"), how="left", left_index=True, right_index=True)
    .merge(tech, how="left", left_index=True, right_index=True)
    .merge(merged, how="left", left_index=True, right_index=True)
    .merge(diff, how="left", suffixes=[None, "_DIFF"], left_index=True, right_index=True)
)

In [None]:
# Write the combined feature table to the working directory.
df.to_csv(path_or_buf="all_features.csv")