In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName('Back-testing').getOrCreate()

In [3]:
df = spark.read.csv('./Data/glass.csv', inferSchema=True, header=True)

In [4]:
df.show()

+-----+-------+-----+----+----+-----+----+----+---+----+----+
|Index|     ri|   na|  mg|  al|   si|   k|  ca| ba|  fe|type|
+-----+-------+-----+----+----+-----+----+----+---+----+----+
|    1|1.52101|13.64|4.49| 1.1|71.78|0.06|8.75|0.0| 0.0|   1|
|    2|1.51761|13.89| 3.6|1.36|72.73|0.48|7.83|0.0| 0.0|   1|
|    3|1.51618|13.53|3.55|1.54|72.99|0.39|7.78|0.0| 0.0|   1|
|    4|1.51766|13.21|3.69|1.29|72.61|0.57|8.22|0.0| 0.0|   1|
|    5|1.51742|13.27|3.62|1.24|73.08|0.55|8.07|0.0| 0.0|   1|
|    6|1.51596|12.79|3.61|1.62|72.97|0.64|8.07|0.0|0.26|   1|
|    7|1.51743| 13.3| 3.6|1.14|73.09|0.58|8.17|0.0| 0.0|   1|
|    8|1.51756|13.15|3.61|1.05|73.24|0.57|8.24|0.0| 0.0|   1|
|    9|1.51918|14.04|3.58|1.37|72.08|0.56| 8.3|0.0| 0.0|   1|
|   10|1.51755| 13.0| 3.6|1.36|72.99|0.57| 8.4|0.0|0.11|   1|
|   11|1.51571|12.72|3.46|1.56| 73.2|0.67|8.09|0.0|0.24|   1|
|   12|1.51763| 12.8|3.66|1.27|73.01| 0.6|8.56|0.0| 0.0|   1|
|   13|1.51589|12.88|3.43| 1.4|73.28|0.69|8.05|0.0|0.24|   1|
|   14|1

In [5]:
df.columns

['Index', 'ri', 'na', 'mg', 'al', 'si', 'k', 'ca', 'ba', 'fe', 'type']

In [6]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, LinearSVC
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
import plotly.graph_objects as go


import pandas as pd
import numpy as np
# from pysparkling.ml import H2ODRF, H2OGLM, H2OXGBoost, H2ODeepLearning

In [7]:
assembler = VectorAssembler(inputCols=['ri', 'na', 'mg', 'al', 'si', 'k', 'ca', 'ba', 'fe'], outputCol='features')
df2 = assembler.transform(df)
final_data = df2.select('features', "type")

In [9]:
model = LogisticRegression(featuresCol='features', labelCol="type")

In [10]:
train_data, test_data = final_data.randomSplit([0.7,0.3])

In [11]:
model = model.fit(train_data)

In [12]:
predict = model.transform(test_data)

In [13]:
predict.show()

+--------------------+----+--------------------+--------------------+----------+
|            features|type|       rawPrediction|         probability|prediction|
+--------------------+----+--------------------+--------------------+----------+
|[1.51571,12.72,3....|   1|[1.36533224363129...|[0.79662496039350...|       0.0|
|[1.51574,14.86,3....|   0|[2.15969526345010...|[0.89657129336781...|       0.0|
|[1.51589,12.88,3....|   1|[0.72398013404823...|[0.67348286685616...|       0.0|
|[1.5159,12.82,3.5...|   0|[2.09450057207828...|[0.89036751333745...|       0.0|
|[1.5159,13.24,3.3...|   0|[0.41924282696132...|[0.60330205081767...|       0.0|
|[1.51593,13.09,3....|   0|[0.31484918918539...|[0.57806844975788...|       0.0|
|[1.51645,13.4,3.4...|   0|[1.02896917071868...|[0.73671599927488...|       0.0|
|[1.51646,13.41,3....|   0|[-0.5721530761172...|[0.36074016235209...|       1.0|
|[1.51652,13.56,3....|   0|[0.52401257713743...|[0.62808556133854...|       0.0|
|[1.5166,12.99,3.1...|   0|[

In [13]:
def generateScore(prob):
    pdo = 20
    odds = None
    # if 1 not in prob:
    odds = prob/(1-prob)
    factor = pdo/np.log(2)
    offset = 500 - factor*np.log(pdo)

    score = offset + factor*np.log(odds)
    return score

def rank_order_df(developmentProb, monitoringProb, development_y, monitoring_y):
    developmentScore = generateScore(developmentProb)
    monitoringScore = generateScore(monitoringProb)
    temp1 = {'score': developmentScore.reshape(-1), 'y': development_y.reshape(-1)}
    development = pd.DataFrame(temp1)
    temp2 = {'score': monitoringScore.reshape(-1), 'y': monitoring_y.reshape(-1)}
    monitoring = pd.DataFrame(temp2)
    
    size = 11
    mini = np.min(developmentScore)
    maxi = np.max(monitoringScore)
    add = (maxi-mini)/size
    interval = [mini if (i == 0) else (mini := mini + add) for i in range(size)]    
    development_dr=[]
    rank_order_interval=[]
    rank_order_devlopment_dr=[]
    rank_order_monitoring_dr=[]
    for i in range(len(interval)-1):
        x= "["+str(round(interval[i]))+"-"+str(round(interval[i+1]))+")"
        rank_order_interval.append(x)
        # print(development["score"]>=interval[i])
        development_alert=development[(development["score"]>=interval[i]) & (development["score"]<interval[i+1])]["y"].values 
        if len(development_alert)==0:
            rank_order_devlopment_dr.append(0)
        else:
            rank_order_devlopment_dr.append(sum(development_alert)/len(development_alert))
        
        monitoring_alert=monitoring[(monitoring["score"]>=interval[i]) & (monitoring["score"]<interval[i+1])]["y"].values
        if len(monitoring_alert)==0:
            rank_order_monitoring_dr.append(0)
        else:
            rank_order_monitoring_dr.append(sum(monitoring_alert)/len(monitoring_alert))
    
    
    df=pd.DataFrame(list(zip(rank_order_interval,  rank_order_devlopment_dr, rank_order_monitoring_dr)),
              columns=["Interval","Development Default Rate","Monitoring Default Rate"])
    df["Interval"][0]="< "+(df.Interval[0])
    df["Interval"][size-2]=">="+df["Interval"][size-2].split("-")[0][1:]
    df= df.round(2)
    
    return df

def rank_order(development,monitoring,model,target):
  prob = udf(lambda v: float(v[1]), FloatType())
  monitoring_model = model.transform(monitoring)
  monitoringProb = monitoring_model.select(prob('probability'))
  development_model = model.transform(development)
  developmentProb = development_model.select(prob('probability'))
  development_y = np.array(development.select(target).collect())
  monitoring_y = np.array(monitoring.select(target).collect())
  df=rank_order_df(developmentProb.toPandas().values, monitoringProb.toPandas().values, development_y, monitoring_y)
  return df

In [14]:
rank_order(train_data, test_data,model,"type")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Interval"][0]="< "+(df.Interval[0])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Interval"][size-2]=">="+df["Interval"][size-2].split("-")[0][1:]


Unnamed: 0,Interval,Development Default Rate,Monitoring Default Rate
0,< [152-182),0.0,0.0
1,[182-212),0.0,0.0
2,[212-242),0.0,0.0
3,[242-271),0.0,0.0
4,[271-301),0.0,0.0
5,[301-331),0.0,0.0
6,[331-361),0.0,0.0
7,[361-390),0.25,0.1
8,[390-420),0.52,0.31
9,>=420,0.62,0.7


In [15]:
rank_order(train_data, test_data,model,"type")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Interval"][0]="< "+(df.Interval[0])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Interval"][size-2]=">="+df["Interval"][size-2].split("-")[0][1:]


Unnamed: 0,Interval,Development Default Rate,Monitoring Default Rate
0,< [152-182),0.0,0.0
1,[182-212),0.0,0.0
2,[212-242),0.0,0.0
3,[242-271),0.0,0.0
4,[271-301),0.0,0.0
5,[301-331),0.0,0.0
6,[331-361),0.0,0.0
7,[361-390),0.25,0.1
8,[390-420),0.52,0.31
9,>=420,0.62,0.7


In [18]:
def get_graph(df, user):
#     helper = Model_helper(username=user)
#     colors = helper.preference_maker(2)
    months =df['Interval']

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=months,
        y=df["Development Default Rate"],
        name='development',
        marker_color='rgba(0,0,0,255)'
    ))
    fig.add_trace(go.Bar(
        x=months,
        y=df["Monitoring Default Rate"],
        name='monitoring',
        marker_color='rgba(0,255,0,0)'
    ))
    fig.update_layout({'plot_bgcolor': 'rgba(0,0,0,0)', 'paper_bgcolor': 'rgba(0,0,0,0)'}, title={
            'text': '<b>'+"RANK-ORDER PLOT"+'<b>',
            'y':0.9,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'
            }
        )
    fig.update_layout(
      
      )

    fig.update_layout(yaxis_range=[0,1])
    fig.update_layout(barmode='group', xaxis_tickangle=-45)
    fig.update_xaxes(title_text='<b>' + 'Scores range' + '<b>',showline=True, linewidth=1, linecolor='black')
    fig.update_yaxes(title_text='<b> Probabilities <b>',showline=True, linewidth=1, linecolor='black')
    return fig


def rankorder(d_df,m_df,model,target,user):
    prob = udf(lambda v: float(v[1]), FloatType())
    monitoring_model = model.transform(m_df)
    monitoringProb = monitoring_model.select(prob('probability'))
    development_model = model.transform(d_df)
    developmentProb = development_model.select(prob('probability'))
    development_y = np.array(d_df.select(target).collect())
    monitoring_y = np.array(m_df.select(target).collect())
    df = rank_order_df(developmentProb.toPandas().values, monitoringProb.toPandas().values, development_y, monitoring_y)
    fig=get_graph(df,user=user)
    return df.round(3),fig,m_df,d_df


In [19]:
rankorder(train_data, test_data,model,"type","AJAY")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Interval"][0]="< "+(df.Interval[0])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Interval"][size-2]=">="+df["Interval"][size-2].split("-")[0][1:]


(      Interval  Development Default Rate  Monitoring Default Rate
 0  < [152-182)                      0.00                     0.00
 1    [182-212)                      0.00                     0.00
 2    [212-242)                      0.00                     0.00
 3    [242-271)                      0.00                     0.00
 4    [271-301)                      0.00                     0.00
 5    [301-331)                      0.00                     0.00
 6    [331-361)                      0.00                     0.00
 7    [361-390)                      0.25                     0.10
 8    [390-420)                      0.52                     0.31
 9        >=420                      0.62                     0.70,
 Figure({
     'data': [{'marker': {'color': 'rgba(0,0,0,255)'},
               'name': 'development',
               'type': 'bar',
               'x': array(['< [152-182)', '[182-212)', '[212-242)', '[242-271)', '[271-301)',
                           '[301-33

In [34]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pysparkling.ml import H2ODRFClassifier, H2OGLMClassifier, H2OXGBoostClassifier, H2ODeepLearningClassifier


In [35]:
class CurveMetrics(BinaryClassificationMetrics):
    def __init__(self, *args):
        super(CurveMetrics, self).__init__(*args)

    def _to_list(self, rdd):
        points = []
        # Note this collect could be inefficient for large datasets
        # considering there may be one probability per datapoint (at most)
        # The Scala version takes a numBins parameter,
        # but it doesn't seem possible to pass this from Python to Java
        for row in rdd.collect():
            # Results are returned as type scala.Tuple2,
            # which doesn't appear to have a py4j mapping
            points += [(float(row._1()), float(row._2()))]
        return points

    def get_curve(self, method):
        rdd = getattr(self._java_model, method)().toJavaRDD()
        return self._to_list(rdd)


In [45]:
def get_model(train_data, model_key, n_fold):
    model = None
    grid = ParamGridBuilder().build()
    evaluator = BinaryClassificationEvaluator()    
    if model_key == 1:
        model = RandomForestClassifier(labelCol='label', featuresCol="features", numTrees=10)
        crossval = CrossValidator(estimator=model, estimatorParamMaps=grid, evaluator=evaluator, parallelism=2, numFolds=n_fold)
        model = crossval.fit(train_data)
    elif model_key == 2:
        model = LogisticRegression(labelCol='label', featuresCol="features", maxIter=100, regParam=0.001,elasticNetParam=1, standardization=True)
        crossval = CrossValidator(estimator=model, estimatorParamMaps=grid, evaluator=evaluator, parallelism=2, numFolds=n_fold)
        model = crossval.fit(train_data)
    elif model_key == 3:
        model = LinearSVC(labelCol='label', featuresCol="features", maxIter=1000, regParam=0.1)
        crossval = CrossValidator(estimator=model, estimatorParamMaps=grid, evaluator=evaluator, parallelism=2, numFolds=n_fold)
        model = crossval.fit(train_data)
    elif model_key == 4:
        model = H2ODRFClassifier(labelCol='label', featuresCols=["features"], nfolds=n_fold)
        model = model.fit(train_data)
    elif model_key == 5:
        model = H2OGLMClassifier(labelCol='label', featuresCols=["features"], nfolds=n_fold)
        model = model.fit(train_data)
    elif model_key == 6:
        model = H2OXGBoostClassifier(labelCol='label', featuresCols=["features"],nfolds=n_fold)
        model = model.fit(train_data)
    elif model_key == 7:
        model = H2ODeepLearningClassifier(labelCol='label', featuresCols=["features"], nfolds=n_fold)
        model = model.fit(train_data)
    return model


def main_kfold(data, names, metrics, target, n_fold):
    # helper = Model_helper(username=user)
    # colors = helper.preference_maker(6)
    data = data.withColumnRenamed(target, "label")
    models = []
    for i in names:
        key = None
        if i.lower() == 'RandomForestClassifier'.lower():
            key = 1
        elif i.lower() == 'LogisticRegression'.lower():
            key = 2
        elif i.lower() == 'LinearSVC'.lower():
            key = 3
        elif i.lower() == 'H2ODRFClassifier'.lower():
            key = 4
        elif i.lower() == 'H2OGLMClassifier'.lower():
            key = 5
        elif i.lower() == 'H2OXGBoostClassifier'.lower():
            key = 6
        elif i.lower() == 'H2ODeepLearningClassifier'.lower():
            key = 7

        t = get_model(data, key, n_fold)
        models.append(t)


    mean_acc = []
    mean_f1 = []
    mean_roc = []
    mean_prec = []
    mean_rec = []

    figures = []
    qw = 0
#     col = colors
#     col.append(['purple', 'violet', 'black', 'pink'])
    for model in models:
        m_name = names[qw]
        model = model.transform(data)
        if 'accuracy' in metrics:
            evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                          metricName="accuracy")
            accuracy = evaluator.evaluate(model)
            mean_acc.append(round(accuracy, 2))
        if 'f1' in metrics:
            evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
            f1 = evaluator.evaluate(model)
            mean_f1.append(round(f1, 2))
        if 'roc' in metrics:
            evaluator = BinaryClassificationEvaluator(rawPredictionCol = "prediction")
            roc = evaluator.evaluate(model)
            mean_roc.append(round(roc, 2))
        if 'precision' in metrics:
            evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="weightedPrecision")
            weightedPrecision = evaluator.evaluate(model)
            mean_prec.append(round(weightedPrecision, 2))
        if 'recall' in metrics:
            evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="weightedRecall")
            weightedRecall = evaluator.evaluate(model)
            mean_rec.append(round(weightedRecall, 2))

        # Compute ROC curve and area the curve
        BCM = model.select("label", 'probability').rdd.map(lambda row: (float(row['probability'][1]), float(row['label'])))
        curveMetrics = CurveMetrics(BCM)
        roc_auc = curveMetrics.areaUnderROC
        points = curveMetrics.get_curve('roc')
        fpr = np.array([x[0] for x in points])
        tpr = np.array([x[1] for x in points])
        roc_auc = ' AUC Score= ' + str(round(roc_auc, 2))
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1],
                                 mode='lines',
                                 name='No Skill',
                                 line=dict(color='black', width=0.5, dash='dash')))
        fig.add_trace(go.Scatter(x=fpr, y=tpr,
                                 mode='lines',
                                 name=roc_auc,
                                 line=dict(color="purple", width=1)))
        tit = 'ROC Curve for ' + m_name
        # fig.update_layout(yaxis_title="<b> True Positive Rate </b>", xaxis_title="<b> False Positive Rate </b>")
        fig.update_layout({'plot_bgcolor': 'rgba(0,0,0,0)', 'paper_bgcolor': 'rgba(0,0,0,0)'})
        fig.update_xaxes(title_text='False Positive Rate', showline=True, linewidth=1, linecolor='black',
                         rangemode='nonnegative')
        fig.update_yaxes(title_text='True Positive Rate', showline=True, linewidth=1, linecolor='black',
                         rangemode='nonnegative')
        fig.update_layout(title={'text': tit,
                                 'y': 0.9,
                                 'x': 0.5,
                                 'xanchor': 'center',
                                 'yanchor': 'top'})
    
        figures.append(fig)
        qw = qw + 1
#     figure = tuple(figures)
    
    
    table = pd.DataFrame()
    table['Model Name'] = names
    if 'accuracy' in metrics:
        table['Mean Accuracy Score'] = mean_acc
    if 'f1' in metrics:
        table['Mean f1 Score'] = mean_f1
    if 'roc' in metrics:
        table['Mean AUCROC Score'] = mean_roc
    if 'precision' in metrics:
        table['Mean Precision Score'] = mean_prec
    if 'recall' in metrics:
        table['Mean Recall Score'] = mean_rec

    return table, figures


In [46]:
metrics = ["accuracy",'f1','roc','precision','recall']

In [49]:
train_data.show()

+--------------------+----+
|            features|type|
+--------------------+----+
|[1.51215,12.99,3....|   1|
|[1.51409,14.25,3....|   0|
|[1.51574,14.86,3....|   0|
|[1.51589,12.88,3....|   1|
|[1.5159,12.82,3.5...|   0|
|[1.5159,13.02,3.5...|   0|
|[1.51592,12.86,3....|   0|
|[1.51593,13.25,3....|   0|
|[1.51596,12.79,3....|   1|
|[1.51596,13.02,3....|   0|
|[1.51605,12.9,3.4...|   0|
|[1.51613,13.92,3....|   0|
|[1.51618,13.53,3....|   1|
|[1.51627,13.0,3.5...|   0|
|[1.51631,13.34,3....|   0|
|[1.51645,13.44,3....|   0|
|[1.51646,13.41,3....|   0|
|[1.51652,13.56,3....|   0|
|[1.51655,12.75,2....|   0|
|[1.5166,12.99,3.1...|   0|
+--------------------+----+
only showing top 20 rows



In [50]:
%%time
# tab,fig  = main_kfold(train_data,["RandomForestClassifier","LogisticRegression"],metrics,"type")
tab,fig = main_kfold(train_data,["H2ODeepLearningClassifier"],metrics,"type",5)

Py4JJavaError: An error occurred while calling o7683.fit.
: java.lang.RuntimeException: H2OContext needs to be created in order to train the model. Please create one as H2OContext.getOrCreate().
	at ai.h2o.sparkling.H2OContext$.$anonfun$ensure$1(H2OContext.scala:416)
	at scala.Option.getOrElse(Option.scala:189)
	at ai.h2o.sparkling.H2OContext$.ensure(H2OContext.scala:416)
	at ai.h2o.sparkling.ml.algos.H2OAlgoCommonUtils.prepareDatasetForFitting(H2OAlgoCommonUtils.scala:82)
	at ai.h2o.sparkling.ml.algos.H2OAlgoCommonUtils.prepareDatasetForFitting$(H2OAlgoCommonUtils.scala:55)
	at ai.h2o.sparkling.ml.algos.classification.H2ODeepLearningClassifier.ai$h2o$sparkling$ml$algos$classification$H2OClassifier$$super$prepareDatasetForFitting(H2ODeepLearningClassifier.scala:24)
	at ai.h2o.sparkling.ml.algos.classification.H2OClassifier.prepareDatasetForFitting(H2OClassifier.scala:35)
	at ai.h2o.sparkling.ml.algos.classification.H2OClassifier.prepareDatasetForFitting$(H2OClassifier.scala:34)
	at ai.h2o.sparkling.ml.algos.classification.H2ODeepLearningClassifier.ai$h2o$sparkling$ml$algos$classification$DistributionForClassificationCheck$$super$prepareDatasetForFitting(H2ODeepLearningClassifier.scala:24)
	at ai.h2o.sparkling.ml.algos.classification.DistributionForClassificationCheck.prepareDatasetForFitting(DistributionForClassificationCheck.scala:33)
	at ai.h2o.sparkling.ml.algos.classification.DistributionForClassificationCheck.prepareDatasetForFitting$(DistributionForClassificationCheck.scala:27)
	at ai.h2o.sparkling.ml.algos.classification.H2ODeepLearningClassifier.prepareDatasetForFitting(H2ODeepLearningClassifier.scala:24)
	at ai.h2o.sparkling.ml.algos.H2OAlgorithm.fit(H2OAlgorithm.scala:53)
	at ai.h2o.sparkling.ml.algos.H2OSupervisedAlgorithm.fit(H2OSupervisedAlgorithm.scala:57)
	at ai.h2o.sparkling.ml.algos.H2ODeepLearning.fit(H2ODeepLearning.scala:35)
	at ai.h2o.sparkling.ml.algos.H2ODeepLearning.fit(H2ODeepLearning.scala:27)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [26]:
pyo.plot(fig[0])

'temp-plot.html'

In [24]:
import numpy as np
import pandas as pd
import plotly as py
import plotly.tools as tls
import plotly.offline as pyo
import plotly.graph_objs as go

In [88]:
pyo.plot(fig[0])

'temp-plot.html'

In [80]:
tab

Unnamed: 0,Model Name,Mean Accuracy Score,Mean f1 Score,Mean AUCROC Score,Mean Precision Score,Mean Recall Score
0,RandomForestClassifier,0.97,0.97,0.97,0.97,0.97
1,LogisticRegression,0.7,0.7,0.7,0.7,0.7


In [126]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
lr = LogisticRegression()

grid = ParamGridBuilder().build()

evaluator = BinaryClassificationEvaluator()

cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,parallelism=2)

In [127]:
final_data = final_data.withColumnRenamed('type', 'label')
final_data.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.52101,13.64,4....|    1|
|[1.51761,13.89,3....|    1|
|[1.51618,13.53,3....|    1|
|[1.51766,13.21,3....|    1|
|[1.51742,13.27,3....|    1|
|[1.51596,12.79,3....|    1|
|[1.51743,13.3,3.6...|    1|
|[1.51756,13.15,3....|    1|
|[1.51918,14.04,3....|    1|
|[1.51755,13.0,3.6...|    1|
|[1.51571,12.72,3....|    1|
|[1.51763,12.8,3.6...|    1|
|[1.51589,12.88,3....|    1|
|[1.51748,12.86,3....|    1|
|[1.51763,12.61,3....|    1|
|[1.51761,12.81,3....|    1|
|[1.51784,12.68,3....|    1|
|[1.52196,14.36,3....|    1|
|[1.51911,13.9,3.7...|    1|
|[1.51735,13.02,3....|    1|
+--------------------+-----+
only showing top 20 rows



In [140]:
lr = LogisticRegression(featuresCol='features', labelCol='label')

grid = ParamGridBuilder().build()

evaluator = BinaryClassificationEvaluator()

cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,

    parallelism=2, numFolds=5)

In [141]:
model_demo = cv.fit(final_data)

In [143]:
model_demo.bestModel.summary.roc.show()


+--------------------+--------------------+
|                 FPR|                 TPR|
+--------------------+--------------------+
|                 0.0|                 0.0|
|                 0.0|0.014285714285714285|
|                 0.0| 0.04285714285714286|
|                 0.0| 0.05714285714285714|
|                 0.0| 0.07142857142857142|
|                 0.0| 0.08571428571428572|
|                 0.0|                 0.1|
|                 0.0| 0.11428571428571428|
|                 0.0| 0.12857142857142856|
|                 0.0| 0.14285714285714285|
|                 0.0| 0.15714285714285714|
|                 0.0| 0.17142857142857143|
|                 0.0| 0.18571428571428572|
|                 0.0|                 0.2|
|                 0.0| 0.21428571428571427|
|0.013157894736842105| 0.21428571428571427|
|0.013157894736842105| 0.22857142857142856|
| 0.02631578947368421| 0.22857142857142856|
| 0.02631578947368421| 0.24285714285714285|
| 0.02631578947368421|  0.257142

In [136]:
rfc = RandomForestClassifier(labelCol='label', featuresCol="features", numTrees=10)
grid = ParamGridBuilder().build()

evaluator = BinaryClassificationEvaluator()

cv = CrossValidator(estimator=rfc, estimatorParamMaps=grid, evaluator=evaluator,

    parallelism=2, numFolds=5)

In [137]:
model_demo = cv.fit(final_data)

In [139]:
model_demo.summary.roc.show()


AttributeError: 'CrossValidatorModel' object has no attribute 'summary'

In [None]:
model_demo.transform

In [39]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator  # P,R,F-1,TPR,FPR

evalute_mllib(model_demo,final_data)

AttributeError: 'float' object has no attribute 'round'

In [41]:
evaluator.evaluate(model_demo.transform(final_data))

0.99

In [38]:
def evalute_mllib(model, df):
    predictions = model.transform(df)
    
    evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    
    evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1")
    f1 = evaluator.evaluate(predictions)
    
    evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
    weightedPrecision = evaluator.evaluate(predictions)
    
    evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall")
    weightedRecall = evaluator.evaluate(predictions)
    
    return {"accuracy":accuracy.round(2),"f1-score":f1,"precision":weightedPrecision,"recall":weightedRecall}

In [36]:
evalute_mllib(model_demo,final_data)

{'accuracy': 0.9726027397260274,
 'f1-score': 0.9726027397260274,
 'precision': 0.9736842105263158,
 'recall': 0.9736842105263158}

In [None]:
rankorder