In [None]:
from __future__ import print_function
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
from treeinterpreter import treeinterpreter as ti

mpl.style.use('ggplot')
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, f1_score, precision_score, \
                            recall_score, accuracy_score, recall_score, \
                            roc_curve, auc, roc_auc_score, precision_recall_curve

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

import config

from collections import Counter, OrderedDict, defaultdict
import operator
import time
from pprint import pprint as pp

%matplotlib inline
# Seed NumPy's global random generator (presumably so the unseeded sampler and
# model further down repeat on a fresh top-to-bottom run — TODO confirm)
np.random.seed(42)
# Do not truncate columns when a DataFrame is displayed
pd.options.display.max_columns = None
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# deprecation notices from the libraries used below
warnings.filterwarnings('ignore')

In [None]:
# Read the training set from CSV and preview its first rows
train_csv_path = 'nifty_train_data.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Separate the 30-day return label (y) from the feature columns (df).
# The membership guard keeps this cell safe to re-run: an unconditional
# in-place drop raises KeyError on a second execution because the column has
# already been removed.
if 'Returns_30day' in df.columns:
    # Label the model is trained to predict
    y = df['Returns_30day']
    # The label must not remain among the features, otherwise the model would
    # be handed the very value it is supposed to predict.
    df = df.drop(['Returns_30day'], axis=1)

# Class distribution of the label; kept as the last expression so the
# notebook actually displays it
Counter(y)


In [None]:
# Feature matrix: the remaining DataFrame columns as a NumPy array
X = df.values
# Dimensions of the feature matrix: (rows, columns)
X.shape

In [None]:
# Dimensions of the label vector y
y.shape

In [None]:
# Train data on 80% and test of rest 20%
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
#from sklearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
#from tpot.builtins import StackingEstimator

# Development slice: only the first 80000 rows take part in the train/test
# split below.
x1 = X[:80000]
y1 = y[:80000]

# Seed handed explicitly to every stochastic step, so re-running this cell on
# its own (not only a fresh Restart & Run All) reproduces the same model.
RANDOM_STATE = 42

# Stratified 80/20 split: both parts keep the class proportions of y1
training_features, testing_features, training_target, testing_target = train_test_split(
    x1, y1,
    train_size=0.80,
    test_size=0.20,
    stratify=y1,
    random_state=RANDOM_STATE)

# Over-sample the training part only; the test part keeps the real class mix.
# fit_resample is the supported API: fit_sample was deprecated and later
# removed from imbalanced-learn.
X_resampled, y_resampled = SMOTE(random_state=RANDOM_STATE).fit_resample(training_features,
                                                                         training_target)

# Standardise features. The scaler is fitted on the (resampled) training data
# and merely applied to the test data, so no test statistics leak into it.
# NOTE(review): SMOTE interpolates between nearest neighbours, so running it on
# unscaled features lets large-magnitude columns dominate the neighbour search;
# consider scaling before resampling.
normalizer = StandardScaler()
X_resampled = normalizer.fit_transform(X_resampled)
testing_features = normalizer.transform(testing_features)

# Training Model
# max_features='sqrt' is what 'auto' meant for classifiers; recent
# scikit-learn releases reject 'auto'.
exported_pipeline = RandomForestClassifier(bootstrap=False, max_features='sqrt',
                                           min_samples_leaf=1, min_samples_split=5,
                                           n_estimators=80,
                                           criterion='gini',
                                           n_jobs=-1,
                                           random_state=RANDOM_STATE)

exported_pipeline.fit(X_resampled, y_resampled)
y_pred = exported_pipeline.predict(testing_features)

# Timestamp marking when training finished
print(time.ctime())

# Accuracy on the 20% test part of the development slice
print('Training test accuracy:', accuracy_score(testing_target, y_pred))

In [None]:
# Validate on rows the model has not been trained on.
# Slicing X[:80000] here would reuse exactly the rows that fed the train/test
# split, so most of the "validation" set would be training data and the
# accuracy (and every trading figure derived from it) would be inflated.
holdout_start = len(x1)
if X.shape[0] > holdout_start:
    # Rows after the development slice were never seen during training
    validation_testing_features = normalizer.transform(X[holdout_start:])
    validation_y_actual = y[holdout_start:]
else:
    # No rows beyond the development slice: the stratified test part is the
    # only unseen data. It was already scaled when the model was trained.
    validation_testing_features = testing_features
    validation_y_actual = testing_target

validation_y_pred = exported_pipeline.predict(validation_testing_features)
print('Validation test accuracy:', accuracy_score(validation_y_actual, validation_y_pred))

In [None]:
# Display the fitted RandomForestClassifier and the parameters it was built with
exported_pipeline

In [None]:
# Print indicator importance value

def full_form(x):
    """Return the entry stored for ``x`` in ``config.tickers``.

    Args:
        x: Lookup key (applied below to feature/column names).

    Returns:
        ``config.tickers[x]`` when the lookup succeeds, otherwise ``x``
        itself, unchanged.
    """
    try:
        return config.tickers[x]
    # Only a failed lookup means "no entry for this key". A bare ``except``
    # would also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    except (LookupError, TypeError):
        return x

# Pair every feature column with the importance the fitted forest gave it
importance = exported_pipeline.feature_importances_
feature_set = {
    'Feature_name': df.columns,
    'Importance': importance
}

# Most important features first
f = pd.DataFrame(feature_set)
f = f.sort_values(['Importance'], ascending=False)
f['full_name'] = f['Feature_name'].apply(full_form)
# Round to 4 decimals for display
f['Importance'] = f['Importance'].round(4)

# Features whose (rounded) importance is below 0.01
low_importance_mask = f['Importance'] < 0.01
bad_features = f.loc[low_importance_mask, 'Feature_name']
print('Low importance features:', bad_features.values)

# Persist the ranking, then show it
f.to_csv('feature_importance.csv')
f



In [None]:
# Print confusion plot

def confusion_plot(y_test, y_predicted, classes):
    """Plot a row-normalised confusion matrix as a heat map and save it.

    Each cell shows the percentage of samples of an actual class (row) that
    were assigned a given predicted class (column). The figure is written to
    'confusion_matrix.png' and then shown.

    Args:
        y_test: Actual class labels.
        y_predicted: Predicted class labels, one per entry of ``y_test``.
        classes: Ordered class labels. They define the rows/columns of the
            matrix as well as the tick labels on both axes.
    """
    # Pin the matrix to `classes` so its rows/columns always line up with the
    # tick labels, even when some class never occurs in the data passed in.
    cm = confusion_matrix(y_test, y_predicted, labels=classes)

    # Scale each row to percentages of that row's total. Rows without any
    # sample stay at 0 instead of turning into NaN through division by zero.
    cm = cm.astype('float')
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    cm = np.divide(cm * 100, row_totals, out=np.zeros_like(cm), where=row_totals != 0)
    np.set_printoptions(suppress=True)
    mpl.rc("figure", figsize=(7,7)) #subplot size

    hm = sns.heatmap(cm,
                cbar=False,
                annot=True,
                square=True,
                yticklabels=classes,
                xticklabels=classes,
                cmap='Blues',
                linewidths =.5,
                annot_kws={'size':14} #text size
                )
    plt.title('Prediction matrix')
    plt.ylabel('Actual class')
    plt.xlabel('Predicted class')
    plt.tight_layout()
    plt.savefig('confusion_matrix.png', dpi=100)
    plt.show()

# Confusion matrix of the validation predictions over classes -5..5
confusion_plot(validation_y_actual, validation_y_pred, list(range(-5, 6)))


In [None]:
# Populate predicted classes and their probabilities.
# Class probabilities are computed once and reused; evaluating the forest
# twice on the same rows only doubles the run time.
validation_proba = exported_pipeline.predict_proba(validation_testing_features)
# predict_proba columns follow exported_pipeline.classes_, so map the arg-max
# column through it instead of assuming the labels are exactly -5..5 with no
# class missing from the training data.
predicted = exported_pipeline.classes_[np.argmax(validation_proba, axis=1)]
# Probability of the most likely class for each row
prob = np.max(validation_proba, axis=1)

In [None]:
# Check how the predictions fared against the actual returns
cutoff_prob = 0.4  # rows are kept only when prob >= this value
cutoff_returns = 2  # rows are kept only when |predicted| >= this value

def _color_red_or_green(val):
    color = 'red' if val < 0.5 else 'green'
    return 'color: %s' % color

# One row per validation sample: predicted class, actual class and the
# probability attached to the prediction
df_result = pd.DataFrame({
    'predicted': validation_y_pred,
    'actual_returns': validation_y_actual,
    'prob': prob
})

# Keep only rows that pass both cut-offs (size of the predicted move and
# prediction probability).
# .copy() turns the selection into an independent frame, so columns can be
# added to it afterwards without writing to a slice of df_result.
tradeable_mask = ((np.absolute(df_result.predicted) >= cutoff_returns)
                  & (df_result.prob >= cutoff_prob))
df_result_final = df_result[tradeable_mask].copy()

# Fraction of validation rows that pass both cut-offs
print(len(df_result_final)/len(df_result))
df_result_final.style.applymap(_color_red_or_green, subset=['prob'])

In [None]:
# Simulate trading on the selected predictions
trade_amount = 1000  # amount per trade; profit below is a percentage of it
# Transaction-charge and slippage factors, currently commented out
# (no visible cell uses them):
#transaction_charge = 0.25/100*2
#crypto_slippage = 0.97**2
#currency_slippage = 0.995**2

def returns(pred, true_, threshold=2):
    """Return the realised return of a trade taken on a predicted return.

    A prediction of at least ``threshold`` is traded long, one of at most
    ``-threshold`` is traded short; anything in between is not traded.

    Args:
        pred: Predicted return class.
        true_: Actual return class.
        threshold: Smallest absolute prediction that triggers a trade.
            Defaults to 2, the value that used to be hard-coded.

    Returns:
        Long:  ``pred`` when the actual return reached it, else ``true_``.
        Short: ``-pred`` when the actual return fell at least as far as
               predicted, else ``-true_``.
        No trade: 0 (previously an implicit ``None``, which cannot be used in
               the profit arithmetic that consumes this value).
    """
    if pred >= threshold:
        # Long: the gain is capped at the predicted target
        return pred if true_ >= pred else true_
    if pred <= -threshold:
        # Short: signs flip, a falling price is a gain
        return -pred if true_ <= pred else -true_
    # Prediction too small to act on: no position, no return
    return 0

# Take an explicit copy before adding columns: df_result_final comes from a
# boolean filter, and writing to such a selection is what pandas'
# SettingWithCopyWarning (hidden by the global warnings filter) is about.
df_result_final = df_result_final.copy()

# Positive prediction -> long position, anything else -> short
df_result_final['Action'] = np.where(df_result_final['predicted'] > 0, 'Long', 'Short')

# Realised return per trade. A plain loop over the two columns also works
# when the selection is empty, where a row-wise DataFrame.apply does not
# return a Series.
df_result_final['realised_returns'] = [
    returns(pred, actual)
    for pred, actual in zip(df_result_final['predicted'], df_result_final['actual_returns'])
]

# Profit per trade: realised % return on trade_amount, weighted by the
# prediction probability and by sqrt(|predicted| / 2). Column arithmetic in
# the same left-to-right order as the row-wise formula.
df_result_final['profit'] = (df_result_final['prob']
                             * df_result_final['realised_returns']
                             * np.sqrt(np.absolute(df_result_final['predicted']) / 2)
                             * 0.01
                             * trade_amount)
print('Net profit:', df_result_final['profit'].sum())
# Display the first 10 trades
df_result_final.head(n=10)


In [None]:
# Plot actual against predicted returns for the selected trades only
class_labels = list(range(-5, 6))
confusion_plot(df_result_final.actual_returns, df_result_final.predicted, class_labels)

In [None]:
# Trades whose predicted class differs from the actual one (first 30)
print('Wrong predictions')
mismatch_mask = df_result_final['actual_returns'].ne(df_result_final['predicted'])
df_result_final.loc[mismatch_mask].head(30)

In [None]:
# Trades that ended with a negative realised return (first 30)
print('Lost money')
loss_mask = df_result_final['realised_returns'].lt(0)
df_result_final.loc[loss_mask].head(30)

In [None]:
# Total profit over all selected trades
net_profit = df_result_final['profit'].sum()
print('Net profit:', net_profit)

In [None]:
# Length of the evaluated period
# NOTE(review): the divisors 96 and 4 are not explained anywhere in this
# notebook — presumably they convert a sample count into days; confirm
n_predictions = validation_y_pred.shape[0]
print('Number of days', n_predictions/96/4)