In [1]:
# Set up Notebook
% matplotlib inline

# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# We do this to ignore several specific Pandas warnings
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

In [5]:
def make_lr_plot(ind_train, dep_train, ind_test, dep_test, results):

    # Set up plot area
    sns.set(style="white")
    fig, axs = plt.subplots(figsize=(10, 8), nrows=2, ncols=1, 
                            sharex=True)

    # Add space between plots
    fig.subplots_adjust(hspace=0.5)

    # Plot the training and testing data
    axs[0].scatter(ind_train, dep_train, label='Training Data',
               alpha = .5, cmap=cm.coolwarm)
    axs[0].scatter(ind_test, dep_test, label='Testing Data',
               alpha = .5, cmap=cm.coolwarm)

    # Plot model prediction
    axs[0].plot(ind_test, results, label='Model', c='r', alpha = .25)
    
    # Ensure equal axis
    axs[0].set_aspect('equal')

    # Decorate final plot
    axs[0].set_xlabel("Total Bill", fontsize=14)
    axs[0].set_ylabel("Tip", fontsize=14)
    axs[0].set_title("Regression Plot", fontsize=18)
    axs[0].set_xlim(-2, 52)
    axs[0].set_ylim(-1, 10)
    axs[0].legend()    
    sns.despine(ax=axs[0], trim=True)
    
    # Plot model residuals
    axs[1].scatter(ind_test, dep_test - results, label='Testing Data',
               alpha = .5, cmap=cm.coolwarm)

    # Show zero residual line
    axs[1].hlines(0, 0, 50, color='r', linestyle='--', alpha=0.25)

    # Decorate final plot
    axs[1].set_xlabel("Total Bill", fontsize=14)
    axs[1].set_ylabel("Residual", fontsize=14)
    axs[1].set_title("Regression Plot (model residuals)", fontsize=18)
    axs[1].set_ylim(-5, 5)
    sns.despine(ax=axs[1], trim=True)

In [9]:
logFCS = pd.read_csv('logFCS_predict_CLUST.csv')
pred = logFCS['clust_logFCS_predict']
actual = logFCS['clust_logFCS']
r2 = stats.pearsonr(actual, pred)[0] ** 2
r2

0.54028478011979253

In [10]:
logFCS2 = pd.read_csv('logFCS_predict_CLUST_dum.csv')
pred = logFCS2['clust_logFCS_predict']
actual = logFCS2['clust_logFCS']
r2 = stats.pearsonr(actual, pred)[0] ** 2
r2

0.4316849862464045

In [4]:
HDDS = pd.read_csv('HDDS_predict_CLUST.csv')
HDDS = HDDS.dropna(axis=0)
pred = HDDS['clust_HDDS_predict']
actual = HDDS['clust_HDDS']
r2 = stats.pearsonr(actual, pred)[0] ** 2
r2

0.62353906214700816

In [6]:
HDDS2 = pd.read_csv('HDDS_predict_CLUST_dum.csv')
HDDS2 = HDDS2.dropna(axis=0)
pred = HDDS2['clust_HDDS_predict']
actual = HDDS2['clust_HDDS']
r2 = stats.pearsonr(actual, pred)[0] ** 2
r2

0.50256326879710767

In [7]:
RCSI = pd.read_csv('RCSI_predict_CLUST.csv')
RCSI = RCSI.dropna(axis=0)
pred = RCSI['clust_RCSI_predict']
actual = RCSI['clust_RCSI']
r2 = stats.pearsonr(actual, pred)[0] ** 2
r2

0.16869497224541516

In [8]:
RCSI = pd.read_csv('RCSI_predict_CLUST_dum.csv')
RCSI = RCSI.dropna(axis=0)
pred = RCSI['clust_RCSI_predict']
actual = RCSI['clust_RCSI']
r2 = stats.pearsonr(actual, pred)[0] ** 2
r2

0.07697321841426441

In [33]:
RCSI_tobit = pd.read_csv('cluster_predict/RCSI_tobit_CLUST.csv')

RCSI2_tobit = RCSI_tobit.dropna(axis=0)
RCSI2_tobit.head()
 
pred = RCSI2_tobit['RCSI_predict']
actual = RCSI2_tobit['RCSI']
r2 = stats.pearsonr(actual, pred)[0] ** 2
r2

0.021030980463290768

In [45]:
RCSI_after = pd.read_csv('RCSI_predict_CLUST_after.csv')

RCSI_after2 = RCSI_after.dropna(axis=0)
RCSI_after2.head()
print(len(RCSI_after))
print(len(RCSI_after2))
pred = RCSI_before2['clust_RCSI_predict']
actual = RCSI_before2['clust_RCSI']
# Copute performance metrics
mae = mean_absolute_error(actual, pred)
mse = mean_squared_error(actual, pred)
mbe = median_absolute_error(actual, pred)
mr2 = r2_score(actual, pred)

from sklearn import metrics

# Classify test data and display score and report
 
ev_score = explained_variance_score(actual, pred)

# Display metrics
print(f'Mean Absolute Error   = {mae:4.5f}')
print(f'Mean Squared Error    = {mse:4.5f}')
print(f'Median Absolute Error = {mbe:4.5f}')
print(f'R^2 Score             = {mr2:5.3f}')
print(f'Explained Variance    = {ev_score:5.3f}')

204
156
Mean Absolute Error   = 1.62429
Mean Squared Error    = 4.09800
Median Absolute Error = 1.32511
R^2 Score             = 0.417
Explained Variance    = 0.455
