## READMEClassifier Model Replication

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Input CSV paths: feature matrix and ground-truth label matrix.
FULL_X = "/FileStore/tables/FULL_X.csv"
FULL_Y = "/FileStore/tables/TARGET_MATRIX_YTRUE_FULL.csv"

# Reader settings shared by both files.
file_type = "csv"
first_row_is_header = "true"
delimiter = ","

# Features: header row supplies column names, column types are inferred.
X = (
    spark.read.format(file_type)
    .options(inferSchema=True, header=first_row_is_header, sep=delimiter)
    .load(FULL_X)
)

# Labels: read with exactly the same settings as the features.
Y = (
    spark.read.format(file_type)
    .options(inferSchema=True, header=first_row_is_header, sep=delimiter)
    .load(FULL_Y)
)

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator #10-fold cross validation

In [0]:
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from joblib import Parallel
from joblib import delayed
from sklearn.utils import resample
from sklearn.utils.validation import check_is_fitted
from sklearn.base import BaseEstimator, clone
import warnings
import configparser
import logging
import pandas
from pandas import DataFrame
import numpy as np
import sqlite3
from sqlite3 import Error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
import time
import operator
import pandas as pd

In [0]:
def fit(self, X, y):
    """Remember ``y`` on ``self`` as ``y_`` and hand ``self`` back.

    ``X`` is accepted for signature compatibility but is not used.

    NOTE(review): this is defined at module level although it takes
    ``self`` -- presumably it was meant to be a method of an estimator
    class; confirm before relying on it.
    """
    self.y_ = y
    return self
    
try:
    # scikit-learn ships the constant-label stand-in as a private helper of
    # its multiclass module; use it when it can be imported.
    from sklearn.multiclass import _ConstantPredictor
except ImportError:

    class _ConstantPredictor:
        """Minimal stand-in that always predicts the label it was fitted on.

        Only used when scikit-learn's own helper cannot be imported.
        """

        def fit(self, X, y):
            """Store the constant label array ``y``; ``X`` is ignored."""
            self.y_ = y
            return self

        def predict(self, X):
            """Return the stored label once per row of ``X``."""
            return np.repeat(self.y_, np.shape(X)[0])

        def decision_function(self, X):
            """Return the stored label as the score for every row of ``X``."""
            return np.repeat(self.y_, np.shape(X)[0])

        def predict_proba(self, X):
            """Return ``[1 - y_, y_]`` as the probabilities for every row."""
            positive = np.asarray(self.y_, dtype=np.float64)
            row = np.hstack([1 - positive, positive])
            return np.repeat([row], np.shape(X)[0], axis=0)


def _fit_binary(estimator, X, y, classes=None):
    """Fit a single binary estimator.

    Parameters
    ----------
    estimator : estimator to clone and fit when ``y`` holds two labels.
    X : training samples.
    y : binary target for one label.
    classes : optional two-item sequence ``[negative_name, positive_name]``
        used only for the warning text.

    Returns
    -------
    A fitted clone of ``estimator``, or a constant predictor when ``y``
    contains a single distinct value (a warning is emitted if ``classes``
    is given).
    """
    unique_y = np.unique(y)
    if unique_y.size != 1:
        fitted = clone(estimator)
        fitted.fit(X, y)
        return fitted

    if classes is not None:
        only_value = unique_y[0]
        # -1 and 0 both denote the negative class; the cast keeps float
        # labels (e.g. 1.0) usable as a list index.
        c = 0 if only_value == -1 else int(only_value)
        warnings.warn("Label %s is present in all training examples." %
                      str(classes[c]))
    return _ConstantPredictor().fit(X, unique_y)

class OneVsRestClassifierBalance(OneVsRestClassifier):
    """One-vs-rest classifier that oversamples each label's positive rows.

    For every label, the training set handed to the binary estimator is the
    full original data plus extra copies of the rows where that label is
    present, so that the positive rows number ``n_samples`` in total.
    """

    def fit(self, X, y):
        """Fit one binary estimator per label on an oversampled training set.

        Parameters
        ----------
        X : 2-D array that can be indexed with an integer row array
            (e.g. a ``numpy.ndarray``).
        y : label indicator matrix of shape ``(n_samples, n_labels)`` or a
            1-D array of class labels; it is binarized with
            ``LabelBinarizer`` and each resulting column is one binary task.

        Returns
        -------
        self
        """
        self.label_binarizer_ = LabelBinarizer(sparse_output=True)
        # CSC layout makes pulling out a single label column cheap.
        Y = self.label_binarizer_.fit_transform(y).tocsc()
        self.classes_ = self.label_binarizer_.classes_
        class_names = self.label_binarizer_.classes_
        n_samples = Y.shape[0]
        every_row = np.arange(n_samples)

        def balanced_training_set(i):
            """Return ``(X_balanced, y_balanced)`` for label column ``i``."""
            column = Y[:, i].toarray().ravel()
            positive_rows = np.flatnonzero(column)
            rows = every_row
            # A label with no positives (or only positives) cannot be
            # balanced; leave the data untouched so `_fit_binary` deals with
            # the constant target instead of dividing by zero here.
            if 0 < positive_rows.size < n_samples:
                n_copies = n_samples // positive_rows.size - 1
                n_extra = n_samples % positive_rows.size
                parts = [every_row, np.tile(positive_rows, n_copies)]
                if n_extra:
                    # Resampling the row indices with the fixed seed picks
                    # the same rows as resampling the rows themselves.
                    parts.append(resample(positive_rows, n_samples=n_extra,
                                          random_state=0))
                rows = np.concatenate(parts)
            return X[rows], column[rows]

        # The generator builds each balanced copy only when its task is
        # dispatched, instead of holding one enlarged copy of X per label.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_binary)(
                self.estimator,
                *balanced_training_set(i),
                classes=["not %s" % class_names[i], class_names[i]],
            )
            for i in range(Y.shape[1])
        )
        return self

In [0]:
# Convert both Spark DataFrames to pandas DataFrames for use with scikit-learn.
X_pd = X.toPandas()
Y_pd = Y.toPandas()

In [0]:
# Sanity check: feature and label frames should have the same number of rows.
print(X_pd.shape, Y_pd.shape, sep="\n")

(4331, 13350)
(4331, 8)


In [0]:
# Base binary estimator; can be replaced with random forest or logistic
# regression models.
svc = SVC()
# Wrap it so one balanced binary problem is trained per label.
classifier = OneVsRestClassifierBalance(estimator=svc)

In [0]:
print('Running SVC (SKLEARN) Experiment')
# Out-of-fold predictions for every row, using 10-fold cross validation.
y_pred = cross_val_predict(
    estimator=classifier,
    X=X_pd.values,
    y=Y_pd.values,
    cv=10,
)
print('SUCCESS!')

Running SVC (SKLEARN) Experiment


In [0]:
# Wrap the cross-validated predictions in a DataFrame for inspection.
y_pred_df = pd.DataFrame(y_pred)

In [0]:
# Render the predictions table.
# NOTE(review): `display` is not defined in this file -- presumably the
# notebook environment provides it; confirm.
display(y_pred_df)

In [0]:
# Per-label precision / recall / F1 of the cross-validated predictions.
print(classification_report(y_true=Y_pd.values, y_pred=y_pred, digits=3))

In [0]:
print('Computing weighted f1 score.')
# Mean of the per-fold weighted F1 over a fresh 10-fold cross validation.
fold_scores = cross_val_score(
    estimator=classifier,
    X=X_pd.values,
    y=Y_pd.values,
    cv=10,
    scoring='f1_weighted',
)
scores_f1 = fold_scores.mean()
print(f'f1_weighted : {scores_f1}')