In [None]:
import numpy as np

import pandas as pd
from matplotlib import pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

# Profiling notebook performance
# time.clock() was removed in Python 3.8. perf_counter() is the supported
# high-resolution timer; it is bound to the old name `clock` so that any cell
# calling `clock() - start_notebook` keeps working.
try:
    from time import perf_counter as clock
except ImportError:  # legacy interpreters without perf_counter
    from time import clock
start_notebook = clock()

In [None]:
# Load the training set and use its 'id' column as the row index
df = pd.read_hdf('data/train.h5').set_index('id')

In [None]:
# Dump the loaded frame (index included) to CSV
df.to_csv(path_or_buf='data/train.csv')

In [None]:
# Columns left out of the model inputs; everything else is kept in `cols`
excl = ['id', 'sample', 'y', 'timestamp']
cols = list(filter(lambda name: name not in excl, df.columns))

# Exploration

In [None]:
# Peek at the first five rows
df.head(n=5)

In [None]:
from collections import Counter

# Overall size and the column labels
print(df.shape)
print(df.columns)

# Group columns by the part of their name before the first underscore
# and count how many columns share each prefix
col_prefix = [name.partition('_')[0] for name in df.columns]
counter = Counter(col_prefix)
print(counter)

In [None]:
# Summary statistics for three columns, restricted to rows where
# all three are present
(
    df.loc[:, ['fundamental_0', 'derived_0', 'technical_0']]
      .dropna()
      .describe()
)

In [None]:
# Count unique values per column
# nuniq = df.apply(pd.Series.nunique)
# print(nuniq)

In [None]:
# Count the number of unique values per column (selected columns only)
# df[['fundamental_0', 'derived_0', 'technical_0']].apply(pd.Series.nunique)

In [None]:
# Number of missing values
n = df.shape[0]
nas = df.isnull().sum()/n
print("total: {:.0%}".format(nas.mean()))

print(nas * 100)

In [None]:
# target = df['y']
# df = df[col]

In [None]:
# fundamental_0 drawn as a line against timestamp
df.set_index('timestamp')['fundamental_0'].plot(kind='line')

In [None]:
# derived_0 drawn as a line against timestamp
df.set_index('timestamp')['derived_0'].plot(kind='line')

In [None]:
# technical_0 drawn as a line against timestamp
df.set_index('timestamp')['technical_0'].plot(kind='line')

In [None]:
# derived_3 drawn as a line against timestamp
df.set_index('timestamp')['derived_3'].plot(kind='line')

In [None]:
# technical_41 drawn as a line against timestamp
df.set_index('timestamp')['technical_41'].plot(kind='line')

In [None]:
# fundamental_1 drawn as a line against fundamental_0
# NOTE(review): the x values are a feature column in row order, not sorted;
# a scatter plot may be what was intended here - confirm.
df.set_index('fundamental_0')['fundamental_1'].plot(kind='line')

In [None]:
# Seasonal pattern?
series = df.set_index('timestamp')['fundamental_0'].ffill()
series = series.rolling(window=1000).mean()
series.plot()

In [None]:
# Debugging aid: list the names currently defined in the notebook namespace
dir()

# Prediction

In [None]:
# Regression target
target = df.loc[:, 'y']

# Model inputs: forward-fill missing values down the rows, then replace
# whatever is still missing (leading gaps) with 0
feature = df.loc[:, cols].ffill().fillna(value=0)

In [None]:
def split_train_test(feature, target, cutoff_test = 1000):
    """
    Divide features and targets into train and test.

    Rows whose index label is >= `cutoff_test` go to the test set; all
    remaining rows go to the training set.

    Parameters
    ----------
    feature : pandas.DataFrame
        Model inputs. Its index decides which side of the split a row lands on.
    target : pandas.Series
        Target values, row-aligned with `feature` (same length and order).
    cutoff_test : scalar, default 1000
        First index value that belongs to the test set.

    Returns
    -------
    feature_train, feature_test, target_train, target_test
    """

    # Build the mask from the argument itself rather than from the global `df`,
    # so the function also works on frames other than the notebook-level one.
    ind_test = feature.index >= cutoff_test
    ind_train = ~ind_test

    feature_test = feature[ind_test]
    target_test = target[ind_test]

    feature_train = feature[ind_train]
    target_train = target[ind_train]

    return feature_train, feature_test, target_train, target_test

# Split features and target with the default cutoff
(feature_train, feature_test,
 target_train, target_test) = split_train_test(feature, target)

In [None]:
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel

# The L1 penalty of the Lasso promotes sparse coefficients, which makes it a
# natural base model for feature selection. The "mean" threshold is used here;
# a fixed threshold of 1e-7 was tried as an alternative.
clf = LassoCV()
sfm = SelectFromModel(estimator=clf, threshold="mean").fit(feature_train, target_train)
# NOTE had to disable mkl as discussed here: https://github.com/BVLC/caffe/issues/3884

# Labels of the columns the selector retained
feature_kept = feature.columns[sfm.get_support(indices=False)]
print("Features: {}".format(feature_kept))

In [None]:
# Keep only most important features
feature_train = pd.DataFrame(sfm.transform(feature_train), 
                             columns = feature_kept, index = feature_train.index)
feature_test = pd.DataFrame(sfm.transform(feature_test), 
                            columns = feature_kept, index = feature_test.index)

In [None]:
# Linear regression
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

# Quick cross validation
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(reg, feature, target, cv = 5)
print("R^2 during CV: {:.2f} +/- {:.2f}".format(scores.mean(), scores.std() * 2))

scores

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so that Restart & Run All reproduces the same forest
reg = RandomForestRegressor(n_estimators = 10, random_state = 0)

# scikit-learn estimators are trained with fit(); there is no train() method
reg.fit(feature_train, target_train)
pred_test = reg.predict(feature_test)

In [None]:
def mape(outcome, predict):
    """
    Compute Mean Absolute Percentage Error (MAPE) score. Positive, but lower is better.

    Pairs are skipped when the outcome is (numerically) zero or when either
    the outcome or the prediction is NaN, so that a single bad element cannot
    turn the whole score into inf/NaN.

    Parameters
    ----------
    outcome : array-like
        Observed values.
    predict : array-like
        Predicted values, same number of elements as `outcome`.

    Returns
    -------
    float
        Mean of |(outcome - predict) / outcome| over the usable pairs,
        or NaN when no usable pair remains.
    """

    outcome = np.asarray(outcome, dtype=float).ravel()
    predict = np.asarray(predict, dtype=float).ravel()

    # Keep only elements that are NONZERO *and* NON-NAN on both sides.
    # All three conditions must hold, hence `&` rather than `|`.
    EPSILON = pow(10, -5)
    with np.errstate(invalid='ignore'):  # NaN comparisons are expected here
        nonzero = np.abs(outcome) > EPSILON
    idx = nonzero & ~np.isnan(outcome) & ~np.isnan(predict)

    # Nothing usable: report NaN explicitly instead of averaging an empty array
    if not idx.any():
        return np.nan

    # Extract those elements
    outcome = outcome[idx]
    predict = predict[idx]

    return np.mean(np.abs((outcome - predict) / outcome))

from pprint import pprint

from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

# Start from a fresh dict: the `scores` left over from the cross-validation
# cell is a NumPy array and cannot be indexed with string keys.
scores = {}

scores['MAPE'] = mape(target_test, pred_test)
scores['R2'] = r2_score(target_test, pred_test)
scores['Explained Variance'] = explained_variance_score(target_test, pred_test)
scores['Mean Square Error'] = mean_squared_error(target_test, pred_test)
# RMSE is derived from the MSE computed on the line above
scores['Root Mean Square Error'] = np.sqrt(scores['Mean Square Error'])
scores['Median Absolute Error'] = median_absolute_error(target_test, pred_test)

pprint(scores)
# print(pd.Series(scores, name = 'Scores'))