In [39]:
from pyspark import SparkContext
from pyspark.sql import SparkSession 
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer,OneHotEncoder,VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder

#from user_definition import *

sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

def toDoubleSafe(v):
    try:
        return float(v)
    except:
        return str(v) #if it is not a float type return as a string.

# def strip_time(x):
#     x = x.strip("\"")
#     try:
        
#         return datetime.strptime(x,'%Y-%m-%d %H:%M:%S')
#     except:
#         return None

## Create Data Frame

In [5]:
pen_raw = sc.textFile("../Data/penbased.dat")
pen_raw.first(),

                                                                                

('47.0, 100.0, 27.0, 81.0, 57.0, 37.0, 26.0, 0.0, 0.0, 23.0, 56.0, 53.0, 100.0, 90.0, 40.0, 98.0, 8.0',)

In [6]:
train_rdd= pen_raw.map(lambda x: x.split(','))\
              .map(lambda x: [toDoubleSafe(i) for i in x])

In [7]:
train_rdd.first()

[47.0,
 100.0,
 27.0,
 81.0,
 57.0,
 37.0,
 26.0,
 0.0,
 0.0,
 23.0,
 56.0,
 53.0,
 100.0,
 90.0,
 40.0,
 98.0,
 8.0]

In [8]:
penschema = StructType([
    StructField("pix1",DoubleType(),True),
    StructField("pix2",DoubleType(),True),
    StructField("pix3",DoubleType(),True),
    StructField("pix4",DoubleType(),True),
    StructField("pix5",DoubleType(),True),
    StructField("pix6",DoubleType(),True),
    StructField("pix7",DoubleType(),True),
    StructField("pix8",DoubleType(),True),
    StructField("pix9",DoubleType(),True),
    StructField("pix10",DoubleType(),True),
    StructField("pix11",DoubleType(),True),
    StructField("pix12",DoubleType(),True),
    StructField("pix13",DoubleType(),True),
    StructField("pix14",DoubleType(),True),
    StructField("pix15",DoubleType(),True),
    StructField("pix16",DoubleType(),True),
    StructField("label",DoubleType(),True)
])

In [9]:
dfpen = ss.createDataFrame(train_rdd, penschema)
dfpen.show(5)

+----+-----+----+-----+-----+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+
|pix1| pix2|pix3| pix4| pix5| pix6| pix7| pix8|pix9|pix10|pix11|pix12|pix13|pix14|pix15|pix16|label|
+----+-----+----+-----+-----+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+
|47.0|100.0|27.0| 81.0| 57.0| 37.0| 26.0|  0.0| 0.0| 23.0| 56.0| 53.0|100.0| 90.0| 40.0| 98.0|  8.0|
| 0.0| 89.0|27.0|100.0| 42.0| 75.0| 29.0| 45.0|15.0| 15.0| 37.0|  0.0| 69.0|  2.0|100.0|  6.0|  2.0|
| 0.0| 57.0|31.0| 68.0| 72.0| 90.0|100.0|100.0|76.0| 75.0| 50.0| 51.0| 28.0| 25.0| 16.0|  0.0|  1.0|
| 0.0|100.0| 7.0| 92.0|  5.0| 68.0| 19.0| 45.0|86.0| 34.0|100.0| 45.0| 74.0| 23.0| 67.0|  0.0|  4.0|
| 0.0| 67.0|49.0| 83.0|100.0|100.0| 81.0| 80.0|60.0| 60.0| 40.0| 40.0| 33.0| 20.0| 47.0|  0.0|  1.0|
+----+-----+----+-----+-----+-----+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+
only showing top 5 rows



In [8]:
# #Load the data and create an RDD (16 pixels and label)
# pen_raw = sc.textFile("../Data/penbased.dat", 4)

# pen_raw= pen_raw.map(lambda x: x.split(', ')).map(lambda x: [float(i) for i in x])

In [9]:
# #Create a DataFrame
# from pyspark.sql.types import *
# from pyspark.sql import Row
# penschema = StructType([
#     StructField("pix1",DoubleType(),True),
#     StructField("pix2",DoubleType(),True),
#     StructField("pix3",DoubleType(),True),
#     StructField("pix4",DoubleType(),True),
#     StructField("pix5",DoubleType(),True),
#     StructField("pix6",DoubleType(),True),
#     StructField("pix7",DoubleType(),True),
#     StructField("pix8",DoubleType(),True),
#     StructField("pix9",DoubleType(),True),
#     StructField("pix10",DoubleType(),True),
#     StructField("pix11",DoubleType(),True),
#     StructField("pix12",DoubleType(),True),
#     StructField("pix13",DoubleType(),True),
#     StructField("pix14",DoubleType(),True),
#     StructField("pix15",DoubleType(),True),
#     StructField("pix16",DoubleType(),True),
#     StructField("label",DoubleType(),True)
# ])

# dfpen = ss.createDataFrame(pen_raw,penschema)

In [10]:
dfpen.show()

+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| pix1| pix2|pix3| pix4| pix5| pix6| pix7| pix8| pix9|pix10|pix11|pix12|pix13|pix14|pix15|pix16|label|
+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| 47.0|100.0|27.0| 81.0| 57.0| 37.0| 26.0|  0.0|  0.0| 23.0| 56.0| 53.0|100.0| 90.0| 40.0| 98.0|  8.0|
|  0.0| 89.0|27.0|100.0| 42.0| 75.0| 29.0| 45.0| 15.0| 15.0| 37.0|  0.0| 69.0|  2.0|100.0|  6.0|  2.0|
|  0.0| 57.0|31.0| 68.0| 72.0| 90.0|100.0|100.0| 76.0| 75.0| 50.0| 51.0| 28.0| 25.0| 16.0|  0.0|  1.0|
|  0.0|100.0| 7.0| 92.0|  5.0| 68.0| 19.0| 45.0| 86.0| 34.0|100.0| 45.0| 74.0| 23.0| 67.0|  0.0|  4.0|
|  0.0| 67.0|49.0| 83.0|100.0|100.0| 81.0| 80.0| 60.0| 60.0| 40.0| 40.0| 33.0| 20.0| 47.0|  0.0|  1.0|
|100.0|100.0|88.0| 99.0| 49.0| 74.0| 17.0| 47.0|  0.0| 16.0| 37.0|  0.0| 73.0| 16.0| 20.0| 20.0|  6.0|
|  0.0|100.0| 3.0| 72.0| 26.0| 35.0| 85.0| 35.0|100.0| 71.0| 73.0| 97.0| 

## Create a data frame includes "feature" and "label"

In [10]:
def Vector_Assembler(df,y_column):
    columns = df.columns
    # remove y column
    columns.remove(y_column)
    va= VectorAssembler(inputCols=columns,outputCol='features').transform(df)
    lpoints = va.select("features", y_column).withColumnRenamed(y_column, "label")
    return lpoints

In [11]:
lpoints = Vector_Assembler(dfpen,'label')
lpoints.show(5)

22/03/06 14:38:03 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------------------+-----+
|            features|label|
+--------------------+-----+
|[47.0,100.0,27.0,...|  8.0|
|[0.0,89.0,27.0,10...|  2.0|
|[0.0,57.0,31.0,68...|  1.0|
|[0.0,100.0,7.0,92...|  4.0|
|[0.0,67.0,49.0,83...|  1.0|
+--------------------+-----+
only showing top 5 rows



In [11]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler

In [13]:
# va = VectorAssembler(outputCol='features')
# va.setInputCols(dfpen.columns[:-1])
# va.transform()

## Create Training and Test data.

In [14]:
#Divide the dataset into training and vaildation sets.
pendtsets = lpoints.randomSplit([.8,.2])
train = pendtsets[0].cache()
val = pendtsets[1].cache()

In [15]:
# # Create Training and Test data.
# pendtsets = penlpoints.




## Train the decision tree model

In [19]:
# Train the data.
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(maxBins=32,maxDepth=5,minInstancesPerNode=1,minInfoGain=0)
dtmodel = dt.fit(train)


In [20]:
dtmodel.featureImportances

SparseVector(16, {0: 0.0238, 1: 0.0649, 2: 0.0038, 3: 0.0882, 4: 0.1004, 7: 0.0024, 8: 0.1029, 9: 0.1181, 10: 0.1055, 11: 0.0046, 12: 0.0304, 13: 0.1288, 14: 0.1084, 15: 0.1179})

In [18]:
print(dtmodel._call_java('toDebugString'))

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3f60c1742fea, depth=5, numNodes=59, numClasses=10, numFeatures=16
  If (feature 13 <= 57.5)
   If (feature 4 <= 40.5)
    If (feature 9 <= 20.5)
     If (feature 14 <= 62.5)
      If (feature 10 <= 12.5)
       Predict: 8.0
      Else (feature 10 > 12.5)
       Predict: 6.0
     Else (feature 14 > 62.5)
      If (feature 15 <= 26.5)
       Predict: 2.0
      Else (feature 15 > 26.5)
       Predict: 8.0
    Else (feature 9 > 20.5)
     If (feature 1 <= 99.5)
      If (feature 9 <= 62.5)
       Predict: 5.0
      Else (feature 9 > 62.5)
       Predict: 9.0
     Else (feature 1 > 99.5)
      If (feature 13 <= 16.5)
       Predict: 5.0
      Else (feature 13 > 16.5)
       Predict: 4.0
   Else (feature 4 > 40.5)
    If (feature 15 <= 26.5)
     If (feature 10 <= 37.5)
      If (feature 12 <= 37.5)
       Predict: 1.0
      Else (feature 12 > 37.5)
       Predict: 2.0
     Else (feature 10 > 37.5)
      If (feature 3 <= 90.5)
     

## Test the model

In [25]:
#Test data.
dtpredicts = dtmodel.transform(val)
dtpredicts.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[0.0,0.0,27.0,24....|  1.0|[3.0,492.0,0.0,3....|[0.00373134328358...|       1.0|
|[0.0,0.0,33.0,12....|  9.0|[6.0,0.0,0.0,0.0,...|[0.375,0.0,0.0,0....|       9.0|
|[0.0,0.0,38.0,13....|  9.0|[6.0,0.0,0.0,0.0,...|[0.375,0.0,0.0,0....|       9.0|
|[0.0,0.0,44.0,16....|  9.0|[6.0,0.0,0.0,0.0,...|[0.375,0.0,0.0,0....|       9.0|
|[0.0,16.0,33.0,43...|  1.0|[3.0,492.0,0.0,3....|[0.00373134328358...|       1.0|
|[0.0,21.0,41.0,46...|  1.0|[3.0,492.0,0.0,3....|[0.00373134328358...|       1.0|
|[0.0,22.0,36.0,47...|  1.0|[3.0,492.0,0.0,3....|[0.00373134328358...|       1.0|
|[0.0,27.0,33.0,50...|  1.0|[3.0,492.0,0.0,3....|[0.00373134328358...|       1.0|
|[0.0,28.0,22.0,49...|  1.0|[3.0,492.0,0.0,3....|[0.00373134328358...|       1.0|
|[0.0,33.0,31.0,

## Evaluate the model
available metrics : https://spark.apache.org/docs/latest/mllib-evaluation-metrics.html

In [40]:
# expects two input columns: prediction and label.

# f1|accuracy(defulat)|weightedPrecision|weightedRecall|weightedTruePositiveRate| 
# weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel| 
# falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel| 
# logLoss|hammingLoss
metric_name = "f1"

metrics = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")\
                .setMetricName(metric_name) 

metrics.evaluate(dtpredicts)

0.81892688077048

## N-fold validation
cross-validation : https://spark.apache.org/docs/latest/ml-tuning.html#cross-validation


In [50]:
# build model
dtree = DecisionTreeClassifier()

In [51]:
metric_name = "f1"

evaluator = MulticlassClassificationEvaluator()\
                .setLabelCol("label")\
                .setPredictionCol("prediction")\
                .setMetricName(metric_name) 
#ParamGridBuilder() – combinations of parameters and their values.
paramGrid = ParamGridBuilder()\
            .addGrid(dtree.maxDepth,[5,10,15,20,25,30])\
            .build()

cv = CrossValidator(estimator=dtree,
                   evaluator=evaluator,
                   numFolds=5,
                   estimatorParamMaps=paramGrid)

In [52]:
cvmodel =cv.fit(train)

22/03/06 16:01:53 WARN CacheManager: Asked to cache already cached data.
22/03/06 16:01:53 WARN CacheManager: Asked to cache already cached data.


In [61]:
cv_val_pred = cvmodel.bestModel.transform(val)
print("Best Max Depth : %s" % cvmodel.bestModel.getMaxDepth())
print("Accuracy : %s" % evaluator.evaluate(cv_val_pred))

Best Max Depth : 15
Accuracy : 0.9592964824120603


In [62]:
# n-fold validation and the results.
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = 
#ParamGridBuilder() – combinations of parameters and their values.
paramGrid = ParamGridBuilder().
cv = CrossValidator()

cvmodel = cv.
print("Best Max Depth : %s" % cvmodel.bestModel._java_obj.getMaxDepth())
print("Accuracy : %s" % MulticlassClassificationEvaluator().evaluate(cvmodel.bestModel.transform(pendtvalid)))

SyntaxError: invalid syntax (922163746.py, line 6)

In [63]:
sc.stop()