In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold, SelectKBest

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

In [3]:
# Read the stock dataset from CSV into a DataFrame, then show (rows, columns)
data = pd.read_csv(filepath_or_buffer='stock_data.csv')
data.shape

(3000, 101)

In [4]:
# Print a structural summary of the frame: index range, column count, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Columns: 101 entries, X1 to Y
dtypes: float64(100), int64(1)
memory usage: 2.3 MB


In [5]:
# Separate the target column 'Y' from the features and hold out 30% of the rows
# as a test set; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Y']),
    data.loc[:, 'Y'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((2100, 100), (900, 100))

In [6]:
# Snapshot the unfiltered train/test features so they can be used as the baseline later
X_train_orig, X_test_orig = X_train.copy(), X_test.copy()

In [7]:
# Removing constant features: columns whose standard deviation on the
# training split is exactly zero carry no information.
constant_features = []
for name, column in X_train.items():
    if column.std() == 0:
        constant_features.append(name)

# The list is derived from the training split only and applied to both splits
X_train = X_train.drop(columns=constant_features)
X_test = X_test.drop(columns=constant_features)

X_train.shape, X_test.shape

((2100, 100), (900, 100))

In [8]:
# Removing quasi-constant features.

# Low-variance filter with a 0.01 threshold, fitted on the training split only
# and then applied unchanged to the test split.
sel = VarianceThreshold(threshold=0.01)
sel.fit(X_train)

# Names of the columns the selector keeps, in their original order
features = X_train.columns[sel.get_support()]

# sel.transform() hands back a bare array without labels. Wrap it straight back
# into a DataFrame so the column names and the row index (the same index that
# y_train / y_test carry from train_test_split) are not lost.
X_train = pd.DataFrame(sel.transform(X_train), columns=features, index=X_train.index)
X_test = pd.DataFrame(sel.transform(X_test), columns=features, index=X_test.index)

In [11]:
# Make sure both splits are DataFrames labelled with the retained feature names
X_train = pd.DataFrame(X_train, columns=features)
X_test = pd.DataFrame(X_test, columns=features)

In [12]:
# Removing duplicated features

# Compare every column with each column to its right; whenever two columns hold
# identical values, the right-hand one is recorded as the duplicate to drop.
column_names = list(X_train.columns)
duplFeatures = [
    later
    for position, earlier in enumerate(column_names)
    for later in column_names[position + 1:]
    if X_train[earlier].equals(X_train[later])
]

# Duplicates are detected on the training split and removed from both splits
X_train = X_train.drop(columns=duplFeatures)
X_test = X_test.drop(columns=duplFeatures)

In [13]:
# Snapshot the splits after the constant / quasi-constant / duplicate filters
X_train_basic_filter, X_test_basic_filter = X_train.copy(), X_test.copy()

In [16]:
# Removing correlated features

# Absolute correlation above which the later of two features is treated as redundant
CORR_THRESHOLD = 0.8

# Correlations are estimated on the training features only. Using the full `data`
# frame would let test rows influence feature selection, would include the target
# 'Y' as a candidate for removal, and could flag columns that the earlier filters
# already removed (which makes drop() raise KeyError).
corr_matrix = X_train.corr().abs()

# Keep only the strict upper triangle, i.e. pairs (earlier column, later column).
# A column is flagged when it is highly correlated with any column to its left.
is_earlier_pair = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(is_earlier_pair)

col_corr = {name for name in upper.columns if (upper[name] > CORR_THRESHOLD).any()}

# The set is derived from the training split and applied to both splits
X_train = X_train.drop(columns=list(col_corr))
X_test = X_test.drop(columns=list(col_corr))

In [17]:
# Snapshot the splits after the correlated features have been removed
X_train_corr, X_test_corr = X_train.copy(), X_test.copy()

In [18]:
# Compare the performances

def run_randomForests(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split and print ROC-AUC for both splits.

    Parameters
    ----------
    X_train, X_test : feature matrices for the training and test splits.
    y_train, y_test : target values matching the rows of X_train / X_test.

    Returns
    -------
    None. The scores are printed, not returned.
    """
    # Fixed hyper-parameters and seed so runs on different feature sets are comparable
    forest = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=39)
    forest.fit(X_train, y_train)

    evaluation_sets = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for heading, inputs, target in evaluation_sets:
        print(heading)
        # Column 1 of predict_proba is the score passed to roc_auc_score
        scores = forest.predict_proba(inputs)[:, 1]
        print('Random Forests roc-auc: {}'.format(roc_auc_score(target, scores)))

In [19]:
# Baseline: performance on the original, unfiltered feature set
run_randomForests(
    X_train=X_train_orig,
    X_test=X_test_orig,
    y_train=y_train,
    y_test=y_test,
)

Train set
Random Forests roc-auc: 0.7404931196717252
Test set
Random Forests roc-auc: 0.5411431027922857


In [20]:
# Performance after additionally removing the correlated features
run_randomForests(
    X_train=X_train_corr,
    X_test=X_test_corr,
    y_train=y_train,
    y_test=y_test,
)

Train set
Random Forests roc-auc: 0.7278862140371214
Test set
Random Forests roc-auc: 0.5390875535505166


In [21]:
# Performance after removing only constant, quasi-constant and duplicated features
run_randomForests(
    X_train=X_train_basic_filter,
    X_test=X_test_basic_filter,
    y_train=y_train,
    y_test=y_test,
)

Train set
Random Forests roc-auc: 0.7404931196717252
Test set
Random Forests roc-auc: 0.5411431027922857
