In [None]:
import numpy as np

import pandas as pd
from matplotlib import pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

# Profiling notebook performance.
# `time.clock` was removed in Python 3.8, so bind the wall-clock
# `perf_counter` under the same name; interpreters that lack it fall back
# to the legacy `clock`.
try:
    from time import perf_counter as clock
except ImportError:
    from time import clock
start_notebook = clock()

In [None]:
# Read the training set from its HDF5 store and keep every 10th row only,
# which keeps the rest of the notebook fast while exploring.
# (Indexing by 'id' was tried and left disabled.)
# df.set_index('id', inplace=True)
df = pd.read_hdf('data/train.h5')[::10]

In [None]:
# df.to_csv('data/train.csv')

In [None]:
# Columns that must not be used as model inputs; everything else is a feature
excl = ['id', 'sample', 'y', 'timestamp']
cols = list(filter(lambda name: name not in excl, df.columns))

Thanks to [wangruixin](https://www.kaggle.com/wangruixin/two-sigma-financial-modeling/randomforestregressor)

# Exploration

In [None]:
# Display the first rows of the (subsampled) frame
df.head()

In [None]:
from collections import Counter

# Size and column labels of the frame
print(df.shape)
print(df.columns)

# Tally the columns by the part of their name preceding the first underscore
col_prefix = [name.partition('_')[0] for name in df.columns]
counter = Counter(col_prefix)

print(counter)

In [None]:
# df.set_index('timestamp')['fundamental_0'].plot()
# df.set_index('timestamp')['derived_0'].plot()
# df.set_index('timestamp')['technical_0'].plot()
# df.set_index('timestamp')['derived_3'].plot()
# df.set_index('timestamp')['technical_41'].plot()
# df.set_index('fundamental_0')['fundamental_1'].plot()

# Seasonal pattern?
# series = df.set_index('timestamp')['fundamental_0'].ffill()
# series = series.rolling(window=1000).mean()
# series.plot()

# Hexagonal-bin density of the target 'y' against 'timestamp'
df.plot.hexbin(x='timestamp', y='y')

In [None]:
# Summary statistics over the rows where none of the four selected columns is missing
df[['timestamp', 'fundamental_0', 'derived_0', 'technical_0']].dropna().describe()

Observation from [anokas](https://www.kaggle.com/anokas/two-sigma-financial-modeling/two-sigma-time-travel-eda).

In [None]:
# How does the number of rows per timestamp evolve?
# `diff` is the change in row count from one timestamp to the next.
diff = df.groupby('timestamp')['timestamp'].count().diff()
diff.plot()

# What is the frequency of the large peaks?
# Timestamps at which the row count jumps by more than 10 ...
peaks = diff[diff > 10].index
print(peaks)
# ... and the spacing between consecutive jumps. It is printed explicitly
# because a bare expression in the middle of a cell is never displayed.
print(pd.Series(peaks).diff())

In [None]:
# Count unique per columns
# nuniq = df.apply(pd.Series.nunique)
# nuniq = df.apply(lambda x: len(x.unique()))  # faster?
# print(nuniq)

# Round number before counting
# df.apply(lambda x: round(x, 3)).nunique()

# Count number of unique per column
# df[['fundamental_0', 'derived_0', 'technical_0']].apply(pd.Series.nunique)

Observation from [sudalairajkumar](https://www.kaggle.com/sudalairajkumar/two-sigma-financial-modeling/univariate-analysis-regression-lb-0-006)

In [None]:
# Correlation of every candidate feature with the target
feature_frame = df[cols]
corr = feature_frame.corrwith(df['y'], drop=True)
corr.plot.barh(figsize=(6, 15))

# Keep on display only the features whose absolute correlation exceeds 0.008
corr.loc[corr.abs() > 0.008]

# Cleaning

In [None]:
# target = df['y']
# df = df[col]

In [None]:
def remove_outliers(col, whisker=1.5):
    """Blank out the values of a column lying outside its inter-quartile fences.

    Missing values are dropped first, so the result only holds the entries
    of ``col`` that were not null. Among those, every value below
    ``Q1 - whisker * IQR`` or above ``Q3 + whisker * IQR`` is replaced by NaN.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to filter. It is left unmodified.
    whisker : float, default 1.5
        Multiple of the inter-quartile range added on each side of the
        quartiles to form the accepted range.

    Returns
    -------
    pandas.Series
        The non-null entries of ``col`` (original labels kept) with the
        out-of-range values set to NaN.
    """

    # Ignore missing values
    col = col.dropna()

    # First and third quartiles, and the inter-quartile range between them
    q_low = col.quantile(.25)
    q_high = col.quantile(.75)
    q_diff = q_high - q_low

    # Widen the quartile range by the whisker length on each side
    low = q_low - whisker * q_diff
    high = q_high + whisker * q_diff

    # `mask` builds a new Series and upcasts integer data to float when
    # needed, whereas assigning NaN in place into an integer column is
    # rejected (or warned about) by recent pandas versions.
    return col.mask((col > high) | (col < low))

# Histogram of every column once its out-of-range values have been blanked
hist_options = dict(
    layout=(-1, 4), figsize=(10, 60), bins=20, sharex=False, sharey=False
)
df.apply(remove_outliers).hist(**hist_options)

# The frame itself is untouched by the plot above
print(df.shape)

In [None]:
# Columns with thin histograms, grouped by how many categories they are
# treated as having (some clustering could be used to find them instead)
cols_one = [
    'technical_13', 'technical_16', 'technical_18', 'technical_20',
    'technical_30', 'technical_42', 'technical_9', 'technical_0',
    'technical_12', 'technical_37', 'technical_38', 'technical_39',
]
cols_two = [
    'technical_10', 'technical_29', 'technical_14', 'technical_43',
    'technical_6',
]
cols_three = ['technical_22', 'technical_34']

# One category: the column is dropped
df.drop(cols_one, inplace=True, axis=1)

# Two categories: the column becomes a boolean flag (value above -1)
df[cols_two] = df[cols_two] > -1

# Three categories: two boolean flags replace the column
# ('_A' for values above 2.5, '_B' for values below -1.5)
for c in cols_three:
    df[c + '_A'] = df[c].gt(2.5)
    df[c + '_B'] = df[c].lt(-1.5)
df.drop(cols_three, inplace=True, axis=1)

print(df.shape)

In [None]:
# Labels of the rows whose target value is blanked by remove_outliers
is_outlier = remove_outliers(df['y']).isnull()
outliers = df.loc[is_outlier].index
print(len(outliers))

# Drop those rows from the frame
df.drop(outliers, inplace=True)

# Same target-versus-time density plot as before, now without the outliers
df.plot.hexbin('timestamp', 'y')
print(df.shape)

In [None]:
# Fill missing values

# Alternatives left disabled: sort the rows, then forward-/backward-fill
# df = df.sort_values(by='id')
# df = df.sort_values(by='timestamp')
# df = df.sort_values(by='y')  # Assume similarity between nearby targets

# df = df.fillna(method='ffill')
# df = df.fillna(method='bfill')

# Impute each column with its own mean.
# NOTE: the means are taken over all rows, before any train/test split.
mean_values = df.mean()
df.fillna(value=mean_values, inplace=True)

In [None]:
# Display the first rows of the frame after cleaning and imputation
df.head()

# Prediction

In [None]:
ind = 'timestamp'

# Every column outside the exclusion list is a model input
cols = [name for name in df.columns if name not in excl]
# cols = ['fundamental_17', 'fundamental_41', 'technical_19', 'fundamental_62', 'fundamental_48']

# Index the frame by timestamp once, then take the target and the features from it
df_by_time = df.set_index(ind)
target = df_by_time['y']
feature = df_by_time[cols]

In [None]:
def split_train_test(feature, target, cutoff_test = 1000):
    """
    Divide features and targets into train and test by index value.

    Rows whose index label is greater than or equal to ``cutoff_test`` form
    the test set; all remaining rows form the train set.

    Parameters
    ----------
    feature : pandas.DataFrame
        Feature rows; its index is compared with ``cutoff_test``.
    target : pandas.Series
        Target values, row-aligned with ``feature`` (same length and order).
    cutoff_test : scalar, default 1000
        First index value that belongs to the test set.

    Returns
    -------
    tuple
        ``(feature_train, feature_test, target_train, target_test)``
    """

    # Build the mask from the data passed in rather than from a global
    # frame: the split must follow the index of `feature` itself.
    ind_test = feature.index >= cutoff_test
    feature_test = feature[ind_test]
    target_test = target[ind_test]

    # Everything before the cutoff is used for training
    ind_train = ~ind_test
    feature_train = feature[ind_train]
    target_train = target[ind_train]

    return feature_train, feature_test, target_train, target_test

# Apply split with the default cutoff (cutoff_test = 1000)
feature_train, feature_test, target_train, target_test = split_train_test(feature, target)

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# A Lasso is used as the selector's estimator because its L1 penalty
# promotes sparse coefficients
clf = LassoCV()
# Alternative threshold tried:
# sfm = SelectFromModel(clf, threshold = 1e-7)
sfm = SelectFromModel(estimator=clf, threshold="mean")
sfm.fit(feature_train, target_train)
# NOTE had to disable mkl as discussed here: https://github.com/BVLC/caffe/issues/3884

# Names of the columns retained by the selector
support_mask = sfm.get_support()
feature_kept = feature.columns[support_mask]
print("Features: {}".format(feature_kept))

In [None]:
# Keep only most important features
# feature_train = pd.DataFrame(sfm.transform(feature_train), 
#                              columns = feature_kept, index = feature_train.index)
# feature_test = pd.DataFrame(sfm.transform(feature_test), 
#                             columns = feature_kept, index = feature_test.index)

In [None]:
# Linear regression
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

# Quick cross validation.
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection`. The fallback keeps old installs working.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# 5-fold scores over the whole feature/target set; the spread is reported
# as two standard deviations
scores = cross_val_score(reg, feature, target, cv = 5)
print("R^2 during CV: {:.2f} +/- {:.2f}".format(scores.mean(), scores.std() * 2))
print(scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Fix the seed so that the scores reported below are reproducible from run to run
reg = RandomForestRegressor(n_estimators = 100, random_state = 0)

reg.fit(feature_train, target_train)

# Score on train/test set
pred_train = reg.predict(feature_train)
print('train score: {}'.format(r2_score(target_train, pred_train)))

pred_test = reg.predict(feature_test)
print('test score: {}'.format(r2_score(target_test, pred_test)))

# Big difference! Overfitting?

In [None]:
# Horizontal bar chart of the fitted model's feature importances, one bar per feature
importances = pd.DataFrame(reg.feature_importances_, index=feature_train.columns)
importances.plot.barh(figsize=(6, 15))

In [None]:
def mape(outcome, predict):
    """
    Compute Mean Absolute Percentage Error (MAPE) score. Positive, but lower is better.

    The score is returned as a fraction (0.5 means 50 %). It is averaged
    over the elements where both inputs are non-NaN and the magnitude of
    the outcome exceeds 1e-5, so that no division by (near) zero occurs.

    Parameters
    ----------
    outcome : array-like
        Observed values; flattened before use.
    predict : array-like
        Predicted values, same number of elements as ``outcome``.

    Returns
    -------
    float
        Mean of ``|outcome - predict| / |outcome|`` over the retained
        elements, or NaN when no element is retained.
    """

    outcome = np.asarray(outcome, dtype=float).ravel()
    predict = np.asarray(predict, dtype=float).ravel()

    # Keep an element only if it is NONZERO *and* NON-NAN in both arrays.
    # The conditions must be AND-ed: OR-ing them lets zeros and NaNs through.
    EPSILON = pow(10, -5)
    idx = (np.abs(outcome) > EPSILON) & ~np.isnan(outcome) & ~np.isnan(predict)

    # Nothing usable: report NaN rather than averaging an empty array
    if not idx.any():
        return np.nan

    # Extract those elements
    outcome = outcome[idx]
    predict = predict[idx]

    return np.mean(np.abs((outcome - predict) / outcome))

from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error

# Test-set metrics for the last fitted model, gathered in display order.
# The mean square error is computed once and reused for its square root.
mse = mean_squared_error(target_test, pred_test)
scores = {
    'MAPE': mape(target_test, pred_test),
    'R2': r2_score(target_test, pred_test),
    'Explained Variance': explained_variance_score(target_test, pred_test),
    'Mean Square Error': mse,
    'Root Mean Square Error': np.sqrt(mse),
    'Median Absolute Error': median_absolute_error(target_test, pred_test),
}

print(pd.Series(scores))

In [None]:
# Time elapsed since the first cell recorded `start_notebook`, in minutes
elapsed_minutes = (clock() - start_notebook) / 60
print("Notebook ran in {:.1f} minutes".format(elapsed_minutes))