In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load in data
df = pd.read_csv('Resources/Data_ETFs_to_use.csv')

In [3]:
df = df.dropna()

In [4]:
df = df.set_index("Unnamed: 0")

In [5]:
df.head()


Unnamed: 0_level_0,LQD,LQD.1,LQD.2,LQD.3,LQD.4,PZA,PZA.1,PZA.2,PZA.3,PZA.4,...,VMBS,VMBS.1,VMBS.2,VMBS.3,VMBS.4,VNQ,VNQ.1,VNQ.2,VNQ.3,VNQ.4
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-05-02 00:00:00-04:00,110.05,110.43,109.96,110.31,1374460.0,22.77,22.81,22.74,22.75,208419,...,50.7348,50.79,50.73,50.77,9903.0,62.13,62.36,61.4,61.9,2324829
2011-05-03 00:00:00-04:00,110.33,110.4,110.09,110.25,648982.0,22.76,22.85,22.76,22.82,83321,...,50.81,50.87,50.738,50.859,3887.0,61.74,62.19,60.69,61.39,1828226
2011-05-04 00:00:00-04:00,110.46,110.59,110.31,110.4,656364.0,22.86,22.99,22.8115,22.94,140827,...,50.83,50.91,50.76,50.8176,3146.0,61.29,61.49,60.75,61.05,1577434
2011-05-05 00:00:00-04:00,110.51,110.88,110.51,110.79,761126.0,22.99,23.05,22.948,23.03,256741,...,50.92,51.02,50.92,50.988,1731.0,60.71,61.45,60.52,60.97,1846927
2011-05-06 00:00:00-04:00,110.56,111.03,110.42,110.77,830286.0,23.05,23.07,23.0,23.0,109594,...,51.01,51.12,50.988,51.04,11474.0,61.5,61.55,60.29,60.45,2121920


In [6]:
# Define features set
X = df.copy()
X.drop("LQD", axis=1, inplace=True)
X.head()

Unnamed: 0_level_0,LQD.1,LQD.2,LQD.3,LQD.4,PZA,PZA.1,PZA.2,PZA.3,PZA.4,QQQ,...,VMBS,VMBS.1,VMBS.2,VMBS.3,VMBS.4,VNQ,VNQ.1,VNQ.2,VNQ.3,VNQ.4
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-05-02 00:00:00-04:00,110.43,109.96,110.31,1374460.0,22.77,22.81,22.74,22.75,208419,59.19,...,50.7348,50.79,50.73,50.77,9903.0,62.13,62.36,61.4,61.9,2324829
2011-05-03 00:00:00-04:00,110.4,110.09,110.25,648982.0,22.76,22.85,22.76,22.82,83321,58.95,...,50.81,50.87,50.738,50.859,3887.0,61.74,62.19,60.69,61.39,1828226
2011-05-04 00:00:00-04:00,110.59,110.31,110.4,656364.0,22.86,22.99,22.8115,22.94,140827,58.72,...,50.83,50.91,50.76,50.8176,3146.0,61.29,61.49,60.75,61.05,1577434
2011-05-05 00:00:00-04:00,110.88,110.51,110.79,761126.0,22.99,23.05,22.948,23.03,256741,58.31,...,50.92,51.02,50.92,50.988,1731.0,60.71,61.45,60.52,60.97,1846927
2011-05-06 00:00:00-04:00,111.03,110.42,110.77,830286.0,23.05,23.07,23.0,23.0,109594,58.86,...,51.01,51.12,50.988,51.04,11474.0,61.5,61.55,60.29,60.45,2121920


In [7]:
# Define target vector
y = df["LQD"].values.reshape(-1, 1)
y[:5]


array([['110.05'],
       ['110.33'],
       ['110.46'],
       ['110.51'],
       ['110.56']], dtype=object)

In [8]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [9]:
# Create the StandardScaler instance
scaler = StandardScaler()




In [10]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)


In [11]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)


In [13]:
# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  rf_model = rf_model.fit(X_train_scaled, y_train)


In [14]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [15]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)



In [16]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


pandas.core.frame.DataFrame

Accuracy Score : 0.02127659574468085
Classification Report
              precision    recall  f1-score   support

      108.87       0.00      0.00      0.00         1
      109.08       0.00      0.00      0.00         0
      109.72       0.00      0.00      0.00         1
      109.95       0.00      0.00      0.00         0
      110.07       0.00      0.00      0.00         1
      110.18       0.00      0.00      0.00         0
      110.45       0.00      0.00      0.00         0
      110.53       1.00      1.00      1.00         1
      110.59       0.00      0.00      0.00         0
      110.64       0.00      0.00      0.00         0
      110.73       0.00      0.00      0.00         1
      110.78       0.00      0.00      0.00         0
       110.8       0.00      0.00      0.00         1
      110.85       0.00      0.00      0.00         1
      110.91       0.00      0.00      0.00         0
      110.95       0.00      0.00      0.00         1
       111.0       0.0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Get the feature importance array
importances = rf_model.feature_importances_


In [22]:
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.04513471654942547, 'LQD.2'),
 (0.04491010646933096, 'LQD.1'),
 (0.0390070538253574, 'LQD.3'),
 (0.03601999768892744, 'PZA.4'),
 (0.03378067108926635, 'TLT.4'),
 (0.033744758981657935, 'VNQ.4'),
 (0.03338170437810273, 'QQQ.4'),
 (0.03267675881546222, 'LQD.4'),
 (0.03226099675557, 'VMBS.4'),
 (0.03220994067657144, 'USO.4')]

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

RSEED = 50

# Extract the labels
labels = np.array(df.pop('LQD.1'))

# 30% examples in test data
train, test, train_labels, test_labels = train_test_split(df,
                                         labels, 
                                         test_size = 0.3, 
                                         random_state = RSEED)

In [25]:
# Imputation of missing values
train = train.fillna(train.mean())
test = test.fillna(test.mean())

# Features for feature importances
features = list(train.columns)

# Create the model with 100 trees
model = RandomForestClassifier(n_estimators=100, 
                               random_state=RSEED, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 1)

In [26]:
# Fit on training data
model.fit(train, train_labels)


n_nodes = []
max_depths = []

# Stats about the trees in random forest
for ind_tree in model.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.3s finished


Average number of nodes 2039
Average maximum depth 78


In [27]:
# Training predictions (to demonstrate overfitting)
train_rf_predictions = model.predict(train)
train_rf_probs = model.predict_proba(train)[:, 1]

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.4s finished


In [28]:
# Testing predictions (to determine performance)
rf_predictions = model.predict(test)
rf_probs = model.predict_proba(test)[:, 1]

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.5s finished


In [29]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Plot formatting
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 18

def evaluate_model(predictions, probs, train_predictions, train_probs):
    """Compare machine learning model to baseline performance.
    Computes statistics and shows ROC curve."""
    
    baseline = {}
    
    baseline['recall'] = recall_score(test_labels, 
                                     [1 for _ in range(len(test_labels))])
    baseline['precision'] = precision_score(test_labels, 
                                      [1 for _ in range(len(test_labels))])
    baseline['roc'] = 0.5
    
    results = {}
    
    results['recall'] = recall_score(test_labels, predictions)
    results['precision'] = precision_score(test_labels, predictions)
    results['roc'] = roc_auc_score(test_labels, probs)
    
    train_results = {}
    train_results['recall'] = recall_score(train_labels, train_predictions)
    train_results['precision'] = precision_score(train_labels, train_predictions)
    train_results['roc'] = roc_auc_score(train_labels, train_probs)
    
    for metric in ['recall', 'precision', 'roc']:
        print(f'{metric.capitalize()} Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}')
    
    # Calculate false positive rates and true positive rates
    base_fpr, base_tpr, _ = roc_curve(test_labels, [1 for _ in range(len(test_labels))])
    model_fpr, model_tpr, _ = roc_curve(test_labels, probs)

    plt.figure(figsize = (8, 6))
    plt.rcParams['font.size'] = 16
    
    # Plot both curves
    plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
    plt.plot(model_fpr, model_tpr, 'r', label = 'model')
    plt.legend();
    plt.xlabel('False Positive Rate'); 
    plt.ylabel('True Positive Rate'); plt.title('ROC Curves');
    plt.show()
    
evaluate_model(rf_predictions, rf_probs, train_rf_predictions, train_rf_probs)

ValueError: Mix of label input types (string and number)

In [None]:
from sklearn.metrics import confusion_matrix
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Oranges):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Plot the confusion matrix
    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    
    # Labeling the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), fontsize = 20,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
        
    plt.grid(None)
    plt.tight_layout()
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)

# Confusion matrix
cm = confusion_matrix(test_labels, rf_predictions)
plot_confusion_matrix(cm, classes = ['Poor Health', 'Good Health'],
                      title = 'Health Confusion Matrix')

plt.savefig('cm.png')

Confusion matrix, without normalization
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]]
