In [1]:
import pandas as pd
import numpy as np
import os
import sys


In [2]:
# Load the raw training series (comma-separated, which is read_csv's default
# delimiter).
df = pd.read_csv('Train_timeseries.csv')
# Numeric-only copy of the columns, taken before any derived columns are
# added to df further down.
df_num = df.select_dtypes(include='number')

In [3]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,Date,Series1,Series2,Series3,Series4,Series5,Series6
0,2012-01-01 00:00:00,0.458882,-1.358177,1.893863,1.022005,-1.810638,-0.792524
1,2012-01-01 00:01:00,0.398794,-1.356111,2.011162,1.104435,-1.807173,-0.72688
2,2012-01-01 00:02:00,0.480416,-1.35405,2.116713,1.134004,-1.803721,-0.816924
3,2012-01-01 00:03:00,0.319643,-1.351994,1.987292,0.99625,-1.800283,-1.010524
4,2012-01-01 00:04:00,0.361104,-1.349943,2.059825,1.041442,-1.796859,


In [4]:
# Parse 'Date' into real datetimes, then expose each calendar component as its
# own column (pandas numbers DayOfWeek from Monday=0 to Sunday=6).
df['Date'] = pd.to_datetime(df['Date'])
for column, component in [
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Hour', 'hour'),
    ('Minute', 'minute'),
    ('DayOfWeek', 'dayofweek'),
]:
    df[column] = getattr(df['Date'].dt, component)

In [5]:
# Show the frame with the derived calendar columns (pandas elides the middle
# rows of a long frame).
df

Unnamed: 0,Date,Series1,Series2,Series3,Series4,Series5,Series6,Year,Month,Day,Hour,Minute,DayOfWeek
0,2012-01-01 00:00:00,0.458882,-1.358177,1.893863,1.022005,-1.810638,-0.792524,2012,1,1,0,0,6
1,2012-01-01 00:01:00,0.398794,-1.356111,2.011162,1.104435,-1.807173,-0.726880,2012,1,1,0,1,6
2,2012-01-01 00:02:00,0.480416,-1.354050,2.116713,1.134004,-1.803721,-0.816924,2012,1,1,0,2,6
3,2012-01-01 00:03:00,0.319643,-1.351994,1.987292,0.996250,-1.800283,-1.010524,2012,1,1,0,3,6
4,2012-01-01 00:04:00,0.361104,-1.349943,2.059825,1.041442,-1.796859,,2012,1,1,0,4,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
938367,2013-10-13 15:27:00,-0.763961,-1.670369,1.632822,0.327631,-1.465975,-2.002986,2013,10,13,15,27,6
938368,2013-10-13 15:28:00,-0.929016,-1.799173,1.903091,0.607440,-1.464328,-2.056986,2013,10,13,15,28,6
938369,2013-10-13 15:29:00,-1.058803,-1.798737,1.809294,0.673262,-1.462685,-1.881761,2013,10,13,15,29,6
938370,2013-10-13 15:30:00,-0.996506,-1.798323,1.675207,0.526339,-1.461045,-1.879009,2013,10,13,15,30,6


In [6]:
def fill_nans_with_running_mean(series, window=10):
    """Replace missing values with a centered rolling mean of their neighbours.

    Parameters
    ----------
    series : pandas.Series
        Series that may contain NaNs.
    window : int, default 10
        Width of the centered rolling window, in rows.

    Returns
    -------
    pandas.Series
        A new series in which each NaN is replaced by the mean of the non-NaN
        values inside the window centered on it. Existing values are left
        unchanged.

    Notes
    -----
    ``min_periods=1`` lets a window produce a mean from a single observation,
    but a window holding no observations at all still yields NaN, so gaps
    wider than the window are only filled near their edges. Because the window
    is centered, values after the gap contribute to the fill as well.
    """
    centered_window = series.rolling(window=window, min_periods=1, center=True)
    local_mean = centered_window.mean()
    filled = series.fillna(value=local_mean)
    return filled


# Impute every numeric column independently, using the same 10-row window.
# apply() hands each column to the helper as a Series and assembles the
# results into a new frame, leaving df_num untouched.
df_filled = df_num.apply(fill_nans_with_running_mean, window=10)

In [7]:
# Inspect the imputed numeric columns.
df_filled

Unnamed: 0,Series1,Series2,Series3,Series4,Series5,Series6
0,0.458882,-1.358177,1.893863,1.022005,-1.810638,-0.792524
1,0.398794,-1.356111,2.011162,1.104435,-1.807173,-0.726880
2,0.480416,-1.354050,2.116713,1.134004,-1.803721,-0.816924
3,0.319643,-1.351994,1.987292,0.996250,-1.800283,-1.010524
4,0.361104,-1.349943,2.059825,1.041442,-1.796859,-0.882452
...,...,...,...,...,...,...
938367,-0.763961,-1.670369,1.632822,0.327631,-1.465975,-2.002986
938368,-0.929016,-1.799173,1.903091,0.607440,-1.464328,-2.056986
938369,-1.058803,-1.798737,1.809294,0.673262,-1.462685,-1.881761
938370,-0.996506,-1.798323,1.675207,0.526339,-1.461045,-1.879009


In [8]:
# Overwrite the numeric columns of df with their imputed versions. Columns
# that are not in df_filled (Date and the derived calendar columns) are left
# as they are.
df[df_filled.columns] = df_filled.copy()


In [9]:
# Count the remaining missing values per column after the imputed series were
# written back.
df.isna().sum()

Date         0
Series1      0
Series2      0
Series3      0
Series4      0
Series5      0
Series6      0
Year         0
Month        0
Day          0
Hour         0
Minute       0
DayOfWeek    0
dtype: int64

In [10]:

def create_lag_features(df, lags=(), target_offset=240):
    """Build a lagged feature matrix and the matching future targets.

    For every anchor row ``i`` the feature vector is the concatenation of the
    rows ``i - lag`` for each lag (in the order given, each row contributing
    all of its columns), and the target is the row ``i + target_offset``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; every column is used both as a feature and as a target.
    lags : sequence of int
        Row offsets to look back by. Any list/tuple/array is accepted and the
        values do not need to be sorted. Must not be empty.
    target_offset : int, default 240
        Number of rows ahead of the anchor at which the target is read.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, len(lags) * n_columns)
    y : numpy.ndarray, shape (n_samples, n_columns)
        ``n_samples`` is ``len(df) - max(lags) - target_offset`` (or 0 when
        the frame is too short).

    Raises
    ------
    ValueError
        If ``lags`` is empty.
    """
    lag_arr = np.asarray(lags, dtype=int).ravel()
    if lag_arr.size == 0:
        raise ValueError("lags must contain at least one lag")

    values = df.to_numpy()
    n_rows, n_cols = values.shape

    # Start at the largest lag (not merely the last one listed) so that
    # i - lag never goes negative and silently wraps to the end of the frame.
    start = max(int(lag_arr.max()), 0)
    anchors = np.arange(start, n_rows - target_offset)

    # One gather instead of a per-row Python loop: rows[k, j] is the source row
    # for sample k and lag j, so the result is (n_samples, n_lags, n_cols).
    rows = anchors[:, None] - lag_arr[None, :]
    # Explicit target shape: reshape(..., -1) is ambiguous for empty arrays.
    X = values[rows].reshape(len(anchors), lag_arr.size * n_cols)
    y = values[anchors + target_offset]
    return X, y

In [11]:
# Lag offsets are counted in rows. In the rows previewed above the samples are
# one minute apart, which would make 1440 one day and 10080 one week back.
lag_steps = np.array([
    0, 1, 2, 4, 6, 8, 20, 50,
    1440, 5963, 10080, 23853,
])
X, Y = create_lag_features(df_filled, lags=lag_steps, target_offset=240)

In [None]:

# Pairwise ratio features; 1e-6 is added to the denominator so an exact zero
# does not divide by zero.
# NOTE(review): the series take negative values elsewhere in the data, so if
# the denominators can cross zero the epsilon does not bound the ratio - confirm.
df['series2_3_ratio'] = df['Series2'].div(df['Series3'].add(1e-6))
# Author's note on this pair: positive correlation.
df['series3_4_ratio'] = df['Series3'].div(df['Series4'].add(1e-6))


In [None]:
# Confirm X and Y have the same number of rows before extra columns are added.
X.shape, Y.shape

((914279, 72), (914279, 6))

In [None]:
# Append the ratio features, the calendar features and the targets to the lag
# matrix, giving one combined array laid out as
#   [ lag block | ratio features | date features | targets ].

# These must mirror the arguments used when X and Y were built with
# create_lag_features, otherwise the rows below will not line up.
lags = np.array([0, 1, 2, 4, 6, 8, 20, 50, 1440, 5963, 10080, 23853])
target_offset = 240

date_cols = ['Year', 'Month', 'Day', 'Hour', 'Minute','DayOfWeek']
date_vals = df[date_cols].values

corr_feat = ['series2_3_ratio', 'series3_4_ratio']
corr_val = df[corr_feat].values

# Row positions of the anchor samples, i.e. the `i` values that
# create_lag_features iterates over.
idx = np.arange(lags.max(), len(df) - target_offset)
assert len(idx) == len(X) == len(Y), "feature rows are misaligned with X/Y"

# Date and ratio features are read at the anchor row i, NOT at the prediction
# row i + target_offset.
# NOTE(review): if prediction-time calendar features were intended, index with
# idx + target_offset instead - confirm before changing the exported data.
date_feats = date_vals[idx]
corr_vals = corr_val[idx]

# Y has one column per input series, so the lag block is len(lags) * Y.shape[1]
# columns wide. Slicing back to it makes this cell safe to re-run: the extra
# blocks are replaced rather than appended a second time.
n_lag_cols = len(lags) * Y.shape[1]

# NOTE(review): the targets Y become the last columns of X. That is fine for a
# combined export, but they must be split off again before X is used as model
# input, or the model will see its own targets.
X = np.hstack([X[:, :n_lag_cols], corr_vals, date_feats, Y])

In [None]:
# Inspect the combined feature/target array.
X

array([[-1.06539941, -0.93573413,  1.25622948, ..., -2.19551309,
        -2.79460914, -1.24475608],
       [-1.3534831 ,  1.57090173, -0.36077586, ..., -2.10111224,
        -2.78500828, -1.14449598],
       [-1.31216567,  0.81070845, -0.2306762 , ..., -2.11537699,
        -2.77549087, -1.12422014],
       ...,
       [-0.45087813, -0.99646228,  0.92121822, ...,  0.67326176,
        -1.46268458, -1.88176112],
       [-0.19030709, -0.99537579,  0.87500766, ...,  0.52633879,
        -1.46104473, -1.87900895],
       [-0.33389104, -0.99429225,  0.90247307, ...,  0.59974706,
        -1.45940833, -1.93355122]])

In [None]:
# Column count should equal the lag columns plus the appended ratio, date and
# target columns.
X.shape

(914279, 86)

In [None]:
# Re-check for missing values now that the ratio columns have been added.
df.isna().sum()

Date               0
Series1            0
Series2            0
Series3            0
Series4            0
Series5            0
Series6            0
Year               0
Month              0
Day                0
Hour               0
Minute             0
DayOfWeek          0
series2_3_ratio    0
series3_4_ratio    0
dtype: int64

In [None]:
# Persist the combined array as CSV. The columns get positional integer
# headers and the row index is not written.
combined_frame = pd.DataFrame(X)
combined_frame.to_csv('X.csv', index=False)