In [1]:
import numpy as np
import scipy
from scipy import stats
import pandas as pd

import math
import torch
import gpytorch

import dash
import plotly.graph_objs as go
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression as LR
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RBF
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import OneHotEncoder
import pickle

from imblearn.over_sampling import SMOTE
from collections import Counter
import matplotlib.pyplot as plt

# Number of Adam optimisation steps run by ExactGPModel_train.
TRAINING_ITER = 20
# Number of trees in each random forest (passed as n_estimators in rfc_train).
NUM_TREE = 10000
# Maximum depth of every tree in the random forests built by rfc_train.
TREE_MAX_DEPTH = 12

%matplotlib inline
%load_ext autoreload
%autoreload 2


The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).





In [2]:
def read_csv(FilePath, delimiter=','):
    '''Load a delimited text file into a pandas DataFrame.

    FilePath  -- location (or file-like object) handed to pandas.read_csv
    delimiter -- column separator, ',' unless overridden
    '''
    return pd.read_csv(FilePath, delimiter=delimiter)

In [3]:
def data_cleaning(presidential):
    '''Select the survey columns, drop unusable rows and split features from labels.

    Democratic party labels come from column q6, republican party labels
    from column q7.  Rows whose q6 or q7 equals 5.0 are discarded, as are rows
    with a missing value in any feature or label column.  The remaining q6/q7
    answers are reversed (1<->4, 2<->3) so that increasing values represent
    higher favorability.

    The input frame is never modified.

    Returns
        (1) X_features - DataFrame with the nine feature columns
        (2) y_demo     - single-column DataFrame ('q6')
        (3) y_repu     - single-column DataFrame ('q7')
        (4) dict mapping feature position -> feature (column) name
    '''
    survey_columns = ['greek', 'athlete', 'financialAid',
                      'gender', 'geography', 'highschool', 'legacy', 'major',
                      'orientation', 'race', 'year', 'school', 'q5', 'q3', 'q1',
                      'q7', 'q4', 'q2', 'q6', 'q8']
    feature_columns = ['major', 'gender', 'orientation', 'geography',
                       'q2', 'q3', 'q4', 'q5', 'q8']
    label_columns = ['q6', 'q7']
    reversed_scale = {1.0: 4.0, 2.0: 3.0, 3.0: 2.0, 4.0: 1.0}

    df = presidential[survey_columns]
    df = df[(df['q6'] != 5.0) & (df['q7'] != 5.0)]
    # Both label columns are required: a NaN never equals 5.0, so it passes the
    # filter above and would otherwise reach the samplers/models as a label.
    # The explicit copy lets us assign columns without touching the caller's frame.
    df = df.dropna(subset=feature_columns + label_columns).copy()

    ### Increasing values represent higher favorability
    # Assign the remapped column back instead of calling replace(inplace=True)
    # on a column selection, which does not reliably write through to df.
    for label in label_columns:
        df[label] = df[label].map(lambda answer: reversed_scale.get(answer, answer))

    y_demo = df[['q6']]
    y_repu = df[['q7']]
    X_features = df[feature_columns]

    feature_names = dict(enumerate(X_features.columns.tolist()))

    return X_features, y_demo, y_repu, feature_names

In [4]:
def feature_dict(X_features):
    ''' Summarise the distinct values found in every feature column.

        Return
        (1) total number of distinct values summed over all columns
        (2) dict: column name -> number of distinct values in that column
        (3) dict: column name -> {value (cast to int) -> position}
        (4) dict: column name -> {position -> value (cast to int)}

        Positions follow the order in which values first appear in the column.
    '''
    total_levels = 0
    level_counts = {}
    value_to_position = {}
    position_to_value = {}

    for name in X_features.columns.tolist():
        levels = X_features[name].unique()
        n_levels = len(levels)

        total_levels += n_levels
        level_counts[name] = n_levels

        int_levels = list(levels.astype(int))
        value_to_position[name] = {value: pos for pos, value in enumerate(int_levels)}
        position_to_value[name] = dict(enumerate(int_levels))

    return total_levels, level_counts, value_to_position, position_to_value

In [5]:
def onehot_encode(X_features):
    ''' One Hot encode features to prepare for the SMOTE oversampling.

        Every column of X_features becomes a block of indicator columns, one
        per distinct value, laid out in the column order of X_features.  Inside
        a block, indicator column p corresponds to position p of the
        value -> position mapping returned by feature_dict, so that taking the
        argmax of a block and looking it up in the position -> value mapping
        recovers the original value.

        Returns an integer array of shape (rows, total number of distinct values).
    '''
    _, feature_number_dict, feature_to_order_dict, _ = feature_dict(X_features)
    n_rows = len(X_features)
    row_index = np.arange(n_rows)

    blocks = []
    for column in X_features.columns.tolist():
        values = X_features[column].astype(int)
        lookup = feature_to_order_dict[column]
        # Positions are already 0-based; subtracting one would wrap position 0
        # round to the last indicator column and scramble the decoding.
        positions = np.asarray([lookup[value] for value in values], dtype=int)

        block = np.zeros((n_rows, feature_number_dict[column]), dtype=int)
        block[row_index, positions] = 1
        blocks.append(block)

    if not blocks:
        return np.empty((n_rows, 0), dtype=int)
    return np.concatenate(blocks, axis=1)

In [6]:
def smote(onehot, ylabel, random_state=None):
    ''' SMOTE oversampling.

        onehot       -- feature matrix handed to the sampler
        ylabel       -- class labels, one per row of onehot
        random_state -- optional seed forwarded to SMOTE so a run can be
                        reproduced; None keeps the sampler's default behaviour

        Returns the resampled (features, labels) pair.
    '''
    sm = SMOTE(random_state=random_state)
    # Newer imbalanced-learn releases only provide fit_resample; fall back to
    # the older fit_sample name when running against an old installation.
    resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
    X_smote, y_smote = resample(onehot, ylabel)
    return X_smote, y_smote

In [7]:
def onehot_decode(X_smote, order_to_feature_dict, feature_number_dict):
    ''' Turn a (possibly SMOTE-interpolated) one-hot matrix back into one value per feature.

        X_smote               -- 2-D array whose columns are consecutive
                                 per-feature blocks
        order_to_feature_dict -- feature name -> {position -> original value}
        feature_number_dict   -- feature name -> block width; its iteration
                                 order defines the block order in X_smote

        For each block the column holding the largest entry wins and is
        translated through order_to_feature_dict.  Returns an object array
        with one column per feature.
    '''
    n_rows = len(X_smote)
    # Zero-width object column keeps the result dtype at object.
    decoded_columns = [np.empty((n_rows, 0), dtype=object)]

    start = 0
    for name, width in feature_number_dict.items():
        block = X_smote[:, start:start + width]
        start += width

        winners = np.argmax(block, axis=1)
        lookup = order_to_feature_dict[name]
        decoded_columns.append(np.array([[lookup[pos] for pos in winners]]).T)

    return np.concatenate(decoded_columns, axis=1)

In [8]:
def smote_de(X_smote, feature_number_dict):
    ''' Snap a SMOTE-interpolated one-hot matrix back to hard 0/1 indicators.

        X_smote             -- 2-D array whose columns are consecutive
                               per-feature blocks
        feature_number_dict -- feature name -> block width; its iteration
                               order defines the block order in X_smote

        Within every feature block the column holding the largest entry is set
        to 1 and the others stay 0.  Returns a list of row lists with the same
        shape as X_smote.
    '''
    X_smote = np.asarray(X_smote)
    n_rows, n_cols = X_smote.shape
    smote_res = [[0] * n_cols for _ in range(n_rows)]

    offset = 0
    for width in feature_number_dict.values():
        block = X_smote[:, offset:offset + width]

        # argmax is relative to the block, so shift it by the block's first
        # column before writing into the full-width row.
        winners = np.argmax(block, axis=1)
        for row, winner in enumerate(winners):
            smote_res[row][offset + int(winner)] = 1

        offset += width

    return smote_res

Training / validation split

In [9]:
# Carve a validation set out of the supplied features and labels.
def train_val_split(X_features, y_demo, y_repu):
    ''' Split features and both label sets into training and validation parts.

        The split (15% validation, fixed random_state) is drawn on the features
        together with the democratic labels; the republican labels are then
        aligned to the same rows through the index.  Everything is cast to int.

        Return
        (1) features - X_train, X_val
        (2) labels - y_train_demo, y_val_demo (labels - democratic party favorability)
                     y_train_repu, y_val_repu (labels - republican party favorability)
    '''
    X_train, X_val, y_train_demo, y_val_demo = train_test_split(
        X_features, y_demo, test_size=0.15, random_state=24)

    y_train_repu = y_repu.loc[y_train_demo.index]
    y_val_repu = y_repu.loc[y_val_demo.index]

    parts = (X_train, X_val, y_train_demo, y_val_demo, y_train_repu, y_val_repu)
    return tuple(part.astype(int) for part in parts)

Random Forest Classifier for feature importance

In [10]:
# Random Forest Classifier
def rfc_train(X_train, y_train):
    ''' Fit an entropy-criterion random forest on the training data.

        Forest size and tree depth come from the module constants NUM_TREE and
        TREE_MAX_DEPTH; the random_state is fixed.  Returns the fitted classifier.
    '''
    forest_options = dict(
        n_estimators=NUM_TREE,
        max_depth=TREE_MAX_DEPTH,
        random_state=33,
        criterion='entropy',
    )
    forest = RandomForestClassifier(**forest_options)
    forest.fit(X_train, y_train)
    return forest

In [11]:
def rfc_pred(X_train, y_train, X_val, y_val, classifier):
    ''' Predict on the training and validation sets and score both.

        Return, in this order,
        (1) training predictions, training confusion matrix, training accuracy
        (2) validation predictions, validation confusion matrix, validation accuracy
    '''
    def _evaluate(X, y):
        # predictions plus the two metrics for one data set
        y_hat = classifier.predict(X)
        return y_hat, confusion_matrix(y, y_hat), accuracy_score(y, y_hat)

    train_results = _evaluate(X_train, y_train)
    val_results = _evaluate(X_val, y_val)
    return train_results + val_results

In [12]:
def plot_confusion_matrix(y_true, y_pred,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    '''
    Print the confusion matrix of y_true vs. y_pred and draw it as a heat map.

    With `normalize=True` every row is divided by its row sum before printing
    and plotting.  When no title is given a default describing the mode is
    used.  Returns the matplotlib Axes holding the plot.
    '''
    banner = ('Normalized confusion matrix' if normalize
              else 'Confusion matrix, without normalization')
    if not title:
        title = banner

    cm = confusion_matrix(y_true, y_pred)
    # Tick labels: only the classes that actually occur in the data.
    classes = unique_labels(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    print(banner)
    print(cm)

    fig, ax = plt.subplots()
    heat = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(heat, ax=ax)
    # One tick per matrix row/column, labelled with the class values.
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Slant the x tick labels so that they do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each cell's value on top of the heat map; light text on dark cells.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for row, col in np.ndindex(*cm.shape):
        cell = cm[row, col]
        ax.text(col, row, format(cell, fmt),
                ha="center", va="center",
                color="white" if cell > thresh else "black")

    fig.tight_layout()
    return ax

In [13]:
def plot_feature_importance(classifier, X_features, feature_names):
    ''' Print and plot the feature importances of a fitted tree ensemble.

        classifier    -- fitted ensemble exposing feature_importances_ and estimators_
        X_features    -- 2-D feature matrix; only its column count is used
        feature_names -- mapping from feature position to display name

        Bars are sorted from most to least important.  The error bar of each
        feature is the standard error of its importance across the trees of
        this classifier.
    '''
    importances = classifier.feature_importances_
    per_tree = [tree.feature_importances_ for tree in classifier.estimators_]
    # Use the ensemble's own size rather than a global constant, so the error
    # bars stay correct for a classifier built with a different tree count.
    n_trees = len(per_tree)
    std = np.std(per_tree, axis=0)

    indices = np.argsort(importances)[::-1]
    features = [feature_names[index] for index in indices]

    # Print the feature ranking
    print("Feature ranking:")
    for rank, (index, name) in enumerate(zip(indices, features), start=1):
        print("%d. %s (%f)" % (rank, name, importances[index]))

    n_features = X_features.shape[1]
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(n_features), importances[indices],
            color="r", yerr=std[indices] / np.sqrt(n_trees), align="center")
    plt.xticks(range(n_features), features, rotation=30)
    plt.xlim([-1, n_features])
    plt.show()

In [14]:
def rfc_use(X_train_demo, y_train_demo, X_train_repu, y_train_repu, feature_names):
    ''' Random forest training, pickle the models.

        Fits one forest per party, writes them to 'rfc_demo_model.sav' and
        'rfc_repu_model.sav' in the working directory, and plots the feature
        importances of both.
    '''
    rfc_demo = rfc_train(X_train_demo, y_train_demo)
    rfc_repu = rfc_train(X_train_repu, y_train_repu)

    # Context managers guarantee the files are flushed and closed before
    # anything tries to read the models back.
    filename_demo = 'rfc_demo_model.sav'
    with open(filename_demo, 'wb') as model_file:
        pickle.dump(rfc_demo, model_file)
    filename_repu = 'rfc_repu_model.sav'
    with open(filename_repu, 'wb') as model_file:
        pickle.dump(rfc_repu, model_file)

    plot_feature_importance(rfc_demo, X_train_demo, feature_names)
    plot_feature_importance(rfc_repu, X_train_repu, feature_names)

In [15]:
def rfc_pred_2(X_train_demo, y_train_demo, X_train_repu, y_train_repu, X_val_demo, y_val_demo, X_val_repu, y_val_repu, classifier_demo, classifier_repu):
    ''' Random forest predicting: score both party models and plot their
        validation confusion matrices (non-normalized). '''

    demo_results = rfc_pred(X_train_demo, y_train_demo, X_val_demo, y_val_demo, classifier_demo)
    repu_results = rfc_pred(X_train_repu, y_train_repu, X_val_repu, y_val_repu, classifier_repu)

    # rfc_pred returns the validation predictions as its fourth element
    y_val_pred = demo_results[3]
    y_val_pred2 = repu_results[3]

    # Two decimals are enough for the matrices printed by plot_confusion_matrix
    np.set_printoptions(precision=2)

    panels = (
        (y_val_demo, y_val_pred, 'Confusion matrix, without normalization (democrats)'),
        (y_val_repu, y_val_pred2, 'Confusion matrix, without normalization (republicans)'),
    )
    for truth, predicted, heading in panels:
        plot_confusion_matrix(truth, predicted, title=heading)

    plt.show()
    

def rfc(presidential):
    ''' Random forest pipeline: clean, hold out a test set, oversample, train, evaluate.

        The survey frame is cleaned, 15% of the rows are held out, and the
        remaining rows are one-hot encoded, oversampled with SMOTE per party and
        decoded back to one value per feature.  A forest per party is trained on
        82% of the oversampled rows, pickled, reloaded and evaluated on the
        other 18%.
    '''
    X_features, y_demo, y_repu, feature_names = data_cleaning(presidential)

    # Hold out a test set that is never seen by SMOTE or the forests.  The
    # oversampling below works on the train/validation portion only, in the
    # same way as gpr() does.
    X_train_val, _, y_train_val_demo, _ = train_test_split(
        X_features, y_demo, test_size = 0.15, random_state = 25)
    y_train_val_repu = y_repu.loc[y_train_val_demo.index]

    # Encode and build the decoding tables from the same frame so that the
    # block layout and the position -> value mapping agree.
    onehot_enc = onehot_encode(X_train_val)
    X_smote_demo, y_smote_demo = smote(onehot_enc, np.ravel(y_train_val_demo))
    X_smote_repu, y_smote_repu = smote(onehot_enc, np.ravel(y_train_val_repu))
    _, feature_number_dict, _, order_to_feature_dict = feature_dict(X_train_val)

    # Train and test random forest model
    X_demo = onehot_decode(X_smote_demo, order_to_feature_dict, feature_number_dict)
    X_repu = onehot_decode(X_smote_repu, order_to_feature_dict, feature_number_dict)

    # NOTE(review): this split happens after oversampling, so synthetic rows in
    # the validation part may be interpolated from training rows.
    X_train_demo, X_val_demo, y_train_demo, y_val_demo \
    = train_test_split(X_demo, y_smote_demo, test_size=0.18, random_state = 41)
    X_train_repu, X_val_repu, y_train_repu, y_val_repu \
    = train_test_split(X_repu, y_smote_repu, test_size=0.18, random_state = 42)

    rfc_use(X_train_demo, y_train_demo, X_train_repu, y_train_repu, feature_names)

    # load the models from drive (files written just above by rfc_use; do not
    # point pickle.load at files from an untrusted source)
    filename_demo = 'rfc_demo_model.sav'
    with open(filename_demo, 'rb') as model_file:
        rfc_demo = pickle.load(model_file)
    filename_repu = 'rfc_repu_model.sav'
    with open(filename_repu, 'rb') as model_file:
        rfc_repu = pickle.load(model_file)

    # perform random forest prediction using the pickled models
    rfc_pred_2(X_train_demo, y_train_demo, X_train_repu, y_train_repu, X_val_demo, y_val_demo, X_val_repu, y_val_repu, rfc_demo, rfc_repu)

Gaussian Process Regression (GPyTorch exact GP)

In [16]:
class ExactGPModel(gpytorch.models.ExactGP):
    '''Exact GP regression model with a constant mean and a scaled RBF kernel.'''

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        rbf_kernel = gpytorch.kernels.RBFKernel()
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)

    def forward(self, x):
        '''Return the multivariate normal built from the mean and covariance at x.'''
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )

In [17]:
def ExactGPModel_train(X_train, y_train):
    '''Fit an ExactGPModel with a Gaussian likelihood using Adam.

    Inputs are cast to int and wrapped in torch tensors.  The optimiser runs
    TRAINING_ITER steps on the negative exact marginal log likelihood and
    prints loss, kernel lengthscale and likelihood noise at every step.

    Returns (likelihood, model), both still in training mode.
    '''
    train_x = torch.Tensor(X_train.astype(int))
    train_y = torch.Tensor(y_train.astype(int))

    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = ExactGPModel(train_x, train_y, likelihood)

    # Training mode for hyperparameter optimisation
    model.train()
    likelihood.train()

    # model.parameters() already contains the likelihood's parameters
    optimizer = torch.optim.Adam([{'params': model.parameters()}], lr=0.1)

    # Objective: the exact marginal log likelihood (negated below to get a loss)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    n_steps = TRAINING_ITER
    progress_line = 'Iter %d/%d - Loss: %.3f   lengthscale: %.3f   noise: %.3f'
    for step in range(1, n_steps + 1):
        optimizer.zero_grad()
        loss = -mll(model(train_x), train_y)
        loss.backward()
        # Report the values that produced this loss, i.e. before the update
        print(progress_line % (
            step, n_steps, loss.item(),
            model.covar_module.base_kernel.lengthscale.item(),
            model.likelihood.noise.item()
        ))
        optimizer.step()

    return likelihood, model

In [18]:
def ExactGPModel_test(likelihood, model, X_val):
    '''Switch model and likelihood to eval mode and predict on X_val.

    X_val is cast to int and wrapped in a torch tensor.  Returns the
    distribution obtained by passing the model output through the likelihood,
    computed without gradients and with fast predictive variances enabled.
    '''
    model.eval()
    likelihood.eval()

    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        test_x = torch.Tensor(X_val.astype(int))
        return likelihood(model(test_x))

In [19]:
def plot_gpr_prediction(y_true, observed_pred, toview):
    '''Plot true labels, predictive mean and confidence region for selected rows.

    y_true        -- array of true labels, indexable by toview
    observed_pred -- predictive distribution exposing mean and confidence_region()
    toview        -- row positions to draw (also used as x coordinates)

    Also prints the mean squared error between labels and predictive mean on
    the selected rows.
    '''
    with torch.no_grad():
        _, ax = plt.subplots(1, 1, figsize=(4, 3))

        lower, upper = observed_pred.confidence_region()
        observed = y_true[toview]
        predicted_mean = observed_pred.mean.numpy()[toview]

        # True labels as a black line, predictive mean as a blue line
        ax.plot(toview, observed, 'k')
        ax.plot(toview, predicted_mean, 'b')
        # Shaded band between the confidence bounds
        ax.fill_between(toview, lower.numpy()[toview], upper.numpy()[toview], alpha=0.5)

        ax.set_ylim([-1, 6])
        ax.legend(['Observed Data', 'Mean of Prediction', 'Confidence of Prediction'])
        ax.set_xlabel('Student ID')
        ax.set_ylabel('Party Favorability')

        mse = mse_metric(observed, predicted_mean)
        print('Mean squared error is:', mse)

In [20]:
def mse_metric(y_true, y_pred):
    '''Return the mean of the squared differences between y_true and y_pred.'''
    residuals = y_true - y_pred
    return np.mean(residuals ** 2)

In [21]:
def plot_gpr_histogram(y_true, y_true2, observed_pred, observed_pred2, toview):
    '''Draw, per selected row, the two predictive normal densities and true labels.

    y_true, y_true2               -- true labels of the first (blue) and second
                                     (red) model, indexable by row position
    observed_pred, observed_pred2 -- predictive distributions exposing mean and
                                     confidence_region()
    toview                        -- row positions; one subplot is drawn per entry

    The solid curves are normal densities with the predictive mean and standard
    deviation; the dashed vertical lines mark the true labels.
    '''
    toview_range = list(toview)
    # squeeze=False keeps a 2-D axes array even for a single subplot.
    fig, axes_grid = plt.subplots(1, len(toview_range), figsize = (10,2.5), squeeze=False)
    axs = axes_grid[0]

    def _mean_and_std(pred):
        # GPyTorch documents confidence_region() as two standard deviations
        # below and above the mean, so the std is a quarter of its width.
        lower, upper = pred.confidence_region()
        return pred.mean.detach().numpy(), ((upper - lower) / 4).detach().numpy()

    with torch.no_grad():
        mean_d, std_d = _mean_and_std(observed_pred)
        mean_r, std_r = _mean_and_std(observed_pred2)

    x = np.linspace(0.5, 4.5, 100)
    for ax, i in zip(axs, toview_range):
        ax.plot(x, scipy.stats.norm.pdf(x, mean_d[i], std_d[i]), color = 'blue')
        ax.plot(x, scipy.stats.norm.pdf(x, mean_r[i], std_r[i]), color = 'red')
        ax.set_xticks([1,2,3,4])
        ax.axvline(x = y_true[i], color = 'blue', dashes = (3,3,3,3))
        ax.axvline(x = y_true2[i], color = 'red', dashes = (3,3,3,3))
        ax.set_ylim([0, 0.8])

Gaussian Process Regression (scikit-learn)

In [22]:
def gpr_train(X_train, y_train, modelname):
    '''Fit a scikit-learn GP regressor on the given data and pickle it.

    X_train, y_train -- training features and targets
    modelname        -- text inserted into the output file name, which is
                        'rfc_' + modelname + 'demo_gpr.sav'

    Returns the fitted regressor.

    NOTE(review): a later cell defines a two-argument function with this same
    name, which replaces this one as soon as that cell runs.  The model is
    therefore fitted here directly from the arguments instead of through a
    call to `gpr_train`.
    '''
    kernel = DotProduct() + WhiteKernel()
    gpr_demo = GaussianProcessRegressor(kernel=kernel, random_state=20).fit(X_train, y_train)

    filename_demo = 'rfc_' + modelname + 'demo_gpr.sav'
    with open(filename_demo, 'wb') as model_file:
        pickle.dump(gpr_demo, model_file)

    return gpr_demo

In [23]:
def gpr_pred(X_val, y_val, model):
    '''Predict mean and standard deviation on X_val and plot the first 100 rows
    against y_val with plot_gpr.  Returns nothing.'''
    predicted_mean, predicted_std = model.predict(X_val, return_std = True)
    first_hundred = range(100)
    plot_gpr(y_val, predicted_mean, predicted_std, first_hundred)

In [24]:
def gpr_train(X_train, y_train):
    '''Fit and return a GaussianProcessRegressor with a DotProduct + WhiteKernel
    kernel and a fixed random_state.'''
    regressor = GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel(),
                                         random_state=20)
    return regressor.fit(X_train, y_train)

In [25]:
def gpr_pred(X_val, gpr):
    '''Return (predictions, standard deviations) of the fitted regressor on X_val.'''
    predicted_mean, predicted_std = gpr.predict(X_val, return_std=True)
    return predicted_mean, predicted_std

In [26]:
def plot_gpr(y_true, gpr_pred, sigma_pred, toview):
    '''Plot predictions, true labels and a 95% confidence band for selected rows.

    y_true     -- array of true labels
    gpr_pred   -- array of predicted means
    sigma_pred -- array of predictive standard deviations
    toview     -- row positions to draw (also used as x coordinates); must
                  support reversal with [::-1]
    '''
    # Half-width of a two-sided 95% interval of a normal distribution, in
    # standard deviations, so that the band matches its legend label.
    z_95 = 1.96

    mean = gpr_pred[toview]
    half_width = z_95 * sigma_pred[toview]

    plt.figure()
    plt.plot(toview, mean, c = 'k', label='Prediction')
    plt.plot(toview, y_true[toview], c = 'r', label='True Labels')

    # Closed polygon: lower bound left-to-right, then upper bound right-to-left
    plt.fill(np.concatenate([toview, toview[::-1]]),
             np.concatenate([mean - half_width,
                            (mean + half_width)[::-1]]),
             alpha=.5, fc='g', ec='None', label='95% confidence interval')
    plt.xlabel('Student ID')
    plt.ylabel('Party Favorability')
    plt.ylim(-1, 7)
    plt.legend(loc='upper left')

In [37]:
def gpr(presidential):
    ''' Gaussian process pipeline: clean, hold out a test set, oversample, train, plot.

        The survey frame is cleaned, 15% of the rows are held out, and the
        remaining rows are one-hot encoded, oversampled with SMOTE per party and
        snapped back to hard indicators.  On an 82/18 split of the oversampled
        rows the GPyTorch models are trained and plotted via gpr_use, then a
        scikit-learn GP per party is trained, pickled to 'demo_gpr.sav' /
        'repu_gpr.sav', reloaded and plotted on the validation rows.
    '''
    X_features, y_demo, y_repu, feature_names = data_cleaning(presidential)
    # The held-out 15% is not used below; only the train/validation part is.
    X_train_val, _, y_train_val_demo, _ = train_test_split(
        X_features, y_demo, test_size = 0.15, random_state = 25)
    y_train_val_repu = y_repu.loc[y_train_val_demo.index]

    onehot_enc = onehot_encode(X_train_val)
    X_smote_demo, y_smote_demo = smote(onehot_enc, np.ravel(y_train_val_demo))
    X_smote_repu, y_smote_repu = smote(onehot_enc, np.ravel(y_train_val_repu))
    _, feature_number_dict, _, _ = feature_dict(X_train_val)
    X_smote_demo = np.array(smote_de(X_smote_demo, feature_number_dict))
    X_smote_repu = np.array(smote_de(X_smote_repu, feature_number_dict))

    X_train_demo, X_val_demo, y_train_demo, y_val_demo \
    = train_test_split(X_smote_demo, y_smote_demo, test_size=0.18, random_state = 41)
    X_train_repu, X_val_repu, y_train_repu, y_val_repu \
    = train_test_split(X_smote_repu, y_smote_repu, test_size=0.18, random_state = 42)

    # Train and test gaussian process regression model
    gpr_use(X_train_demo, y_train_demo, X_val_demo, y_val_demo, X_train_repu, y_train_repu, X_val_repu, y_val_repu)

    # Fit and persist one scikit-learn GP per party; the context managers make
    # sure each file is closed before it is read back.
    filename_demo = 'demo_gpr.sav'
    with open(filename_demo, 'wb') as model_file:
        pickle.dump(gpr_train(X_train_demo, y_train_demo), model_file)

    filename_repu = 'repu_gpr.sav'
    with open(filename_repu, 'wb') as model_file:
        pickle.dump(gpr_train(X_train_repu, y_train_repu), model_file)

    # Reload the files written just above (do not point pickle.load at files
    # from an untrusted source).
    with open(filename_demo, 'rb') as model_file:
        gpr_demo = pickle.load(model_file)
    gpr_pred_demo, sigma_pred_demo = gpr_pred(X_val_demo, gpr_demo)

    with open(filename_repu, 'rb') as model_file:
        gpr_repu = pickle.load(model_file)
    gpr_pred_repu, sigma_pred_repu = gpr_pred(X_val_repu, gpr_repu)

    toview = range(100)
    plot_gpr(y_val_demo, gpr_pred_demo, sigma_pred_demo, toview)
    # Each party is plotted with its own predictive standard deviation.
    plot_gpr(y_val_repu, gpr_pred_repu, sigma_pred_repu, toview)

In [28]:
def gpr_use(X_train_demo, y_train_demo, X_val_demo, y_val_demo, X_train_repu, y_train_repu, X_val_repu, y_val_repu):
    ''' Train one exact GP per party, predict on its validation set, plot the
        first 100 predictions and save the model's state dict
        ('model_state.pth' for the first model, 'model2_state.pth' for the
        second).  Finally compare both predictive densities for rows 20 and 24. '''

    runs = (
        (X_train_demo, y_train_demo, X_val_demo, y_val_demo, 'model_state.pth'),
        (X_train_repu, y_train_repu, X_val_repu, y_val_repu, 'model2_state.pth'),
    )

    predictions = []
    for X_fit, y_fit, X_check, y_check, state_path in runs:
        likelihood, model = ExactGPModel_train(X_fit, y_fit)
        observed = ExactGPModel_test(likelihood, model, X_check)
        plot_gpr_prediction(y_check, observed, range(100))
        torch.save(model.state_dict(), state_path)
        predictions.append(observed)

    observed_pred, observed_pred2 = predictions
    plot_gpr_histogram(y_val_demo, y_val_repu, observed_pred, observed_pred2, [20,24])

In [39]:
if __name__ == '__main__':
    # Load the survey export once, then run both pipelines on it:
    # Gaussian process regression first, random forests second.
    presidential = read_csv("2020_presidential_tracker.csv", delimiter = ',')
    gpr(presidential)
    rfc(presidential)