In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession 
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer,OneHotEncoder,VectorAssembler
from pyspark.ml.classification import LogisticRegression,DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator

#from user_definition import *

sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

def toDoubleSafe(v):
    try:
        return float(v)
    except:
        return str(v) #if it is not a float type return as a string.

# def strip_time(x):
#     x = x.strip("\"")
#     try:
        
#         return datetime.strptime(x,'%Y-%m-%d %H:%M:%S')
#     except:
#         return None

22/03/06 16:55:57 WARN Utils: Your hostname, Fans-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.0.0.115 instead (on interface en0)
22/03/06 16:55:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/06 16:55:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Create dataframe

In [2]:
#Create a DataFrame
from pyspark.sql.types import *

penschema = StructType([
    StructField("pix1",DoubleType(),True),
    StructField("pix2",DoubleType(),True),
    StructField("pix3",DoubleType(),True),
    StructField("pix4",DoubleType(),True),
    StructField("pix5",DoubleType(),True),
    StructField("pix6",DoubleType(),True),
    StructField("pix7",DoubleType(),True),
    StructField("pix8",DoubleType(),True),
    StructField("pix9",DoubleType(),True),
    StructField("pix10",DoubleType(),True),
    StructField("pix11",DoubleType(),True),
    StructField("pix12",DoubleType(),True),
    StructField("pix13",DoubleType(),True),
    StructField("pix14",DoubleType(),True),
    StructField("pix15",DoubleType(),True),
    StructField("pix16",DoubleType(),True),
    StructField("label",DoubleType(),True)
])

dfpen = ss.read.csv("../Data/penbased.dat", samplingRatio=0.3, schema=penschema)

In [3]:
dfpen.show()

                                                                                

+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| pix1| pix2|pix3| pix4| pix5| pix6| pix7| pix8| pix9|pix10|pix11|pix12|pix13|pix14|pix15|pix16|label|
+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| 47.0|100.0|27.0| 81.0| 57.0| 37.0| 26.0|  0.0|  0.0| 23.0| 56.0| 53.0|100.0| 90.0| 40.0| 98.0|  8.0|
|  0.0| 89.0|27.0|100.0| 42.0| 75.0| 29.0| 45.0| 15.0| 15.0| 37.0|  0.0| 69.0|  2.0|100.0|  6.0|  2.0|
|  0.0| 57.0|31.0| 68.0| 72.0| 90.0|100.0|100.0| 76.0| 75.0| 50.0| 51.0| 28.0| 25.0| 16.0|  0.0|  1.0|
|  0.0|100.0| 7.0| 92.0|  5.0| 68.0| 19.0| 45.0| 86.0| 34.0|100.0| 45.0| 74.0| 23.0| 67.0|  0.0|  4.0|
|  0.0| 67.0|49.0| 83.0|100.0|100.0| 81.0| 80.0| 60.0| 60.0| 40.0| 40.0| 33.0| 20.0| 47.0|  0.0|  1.0|
|100.0|100.0|88.0| 99.0| 49.0| 74.0| 17.0| 47.0|  0.0| 16.0| 37.0|  0.0| 73.0| 16.0| 20.0| 20.0|  6.0|
|  0.0|100.0| 3.0| 72.0| 26.0| 35.0| 85.0| 35.0|100.0| 71.0| 73.0| 97.0| 

## Create dataframe with a feature vector and label

In [4]:
def Vector_Assembler(df,y_column):
    columns = df.columns
    # remove y column
    columns.remove(y_column)
    va= VectorAssembler(inputCols=columns,outputCol='features').transform(df)
    lpoints = va.select("features", y_column).withColumnRenamed(y_column, "label")
    return lpoints

In [5]:
lpoints = Vector_Assembler(dfpen,'label')

In [6]:
lpoints.show(5)

22/03/06 16:58:15 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------------------+-----+
|            features|label|
+--------------------+-----+
|[47.0,100.0,27.0,...|  8.0|
|[0.0,89.0,27.0,10...|  2.0|
|[0.0,57.0,31.0,68...|  1.0|
|[0.0,100.0,7.0,92...|  4.0|
|[0.0,67.0,49.0,83...|  1.0|
+--------------------+-----+
only showing top 5 rows



In [None]:
# # Merging the data with Vector Assembler.
# from pyspark.ml.feature import VectorAssembler
# va = VectorAssembler(outputCol="features", inputCols=dfpen.columns[0:-1]) #except the last col.

## Split dataframe into training and test sets

In [7]:
#Divide the dataset into training and vaildation sets.
splits = lpoints.randomSplit([.8,.2])
pendttrain = splits[0].cache()
pendtvalid = splits[1].cache()

In [8]:
# # Create Training and Test data.
# pendtsets = penlpoints.randomSplit([0.8, 0.2])
# pendttrain = pendtsets[0].cache()
# pendtvalid = pendtsets[1].cache()

## Create a RandomForestClassifer and build a model using training Dataset

In [13]:
# Train the model.
from pyspark.ml.classification import RandomForestClassifier
model = RandomForestClassifier(maxDepth=20)
rfmodel = rf.fit(pendttrain)

22/03/06 17:03:32 WARN DAGScheduler: Broadcasting large task binary with size 1420.2 KiB
22/03/06 17:03:32 WARN DAGScheduler: Broadcasting large task binary with size 1867.9 KiB
22/03/06 17:03:33 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
22/03/06 17:03:33 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
22/03/06 17:03:33 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
22/03/06 17:03:34 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
22/03/06 17:03:34 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
22/03/06 17:03:34 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
22/03/06 17:03:34 WARN DAGScheduler: Broadcasting large task binary with size 1955.8 KiB
22/03/06 17:03:34 WARN DAGScheduler: Broadcasting large task binary with size 1524.9 KiB


## Evaluate the model

In [14]:
rfpredicts = rfmodel.transform(pendtvalid)

In [15]:
# expects two input columns: prediction and label.

# f1|accuracy(defulat)|weightedPrecision|weightedRecall|weightedTruePositiveRate| 
# weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel| 
# falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel| 
# logLoss|hammingLoss
metric_name = "f1"

evaluator = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")\
                .setMetricName(metric_name) 

evaluator.evaluate(rfpredicts)

22/03/06 17:03:39 WARN DAGScheduler: Broadcasting large task binary with size 1989.7 KiB


0.9872581027730427

In [16]:
from pyspark.mllib.evaluation import MulticlassMetrics

#prediction and label
prediction_label = rfpredicts.select("prediction", "label").rdd

metrics = MulticlassMetrics(prediction_label)

confusionMetrics = metrics.confusionMatrix()

print("Confusion Metrics = \n%s" % confusionMetrics)

22/03/06 17:03:52 WARN DAGScheduler: Broadcasting large task binary with size 1980.6 KiB
22/03/06 17:03:53 WARN DAGScheduler: Broadcasting large task binary with size 1991.8 KiB


Confusion Metrics = 
DenseMatrix([[201.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   0.],
             [  0., 211.,  11.,   0.,   1.,   0.,   0.,   0.,   0.,   1.],
             [  0.,   1., 212.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
             [  0.,   0.,   1., 184.,   1.,   1.,   0.,   0.,   0.,   0.],
             [  1.,   0.,   0.,   0., 225.,   0.,   0.,   0.,   0.,   0.],
             [  0.,   0.,   0.,   2.,   0., 195.,   0.,   0.,   0.,   1.],
             [  0.,   0.,   0.,   0.,   0.,   0., 187.,   0.,   0.,   0.],
             [  0.,   1.,   0.,   0.,   0.,   1.,   0., 205.,   0.,   0.],
             [  0.,   0.,   0.,   0.,   0.,   1.,   0.,   1., 181.,   0.],
             [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 216.]])


In [11]:
from pyspark.mllib.evaluation import MulticlassMetrics

#prediction and label
prediction_label = rfpredicts.select("prediction", "label").rdd

metrics = MulticlassMetrics(prediction_label)

precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
confusionMetrics = metrics.confusionMatrix()

print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)
print("Confusion Metrics = \n%s" % confusionMetrics)

                                                                                

TypeError: precision() missing 1 required positional argument: 'label'