# 3. Feature Engineering Pipeline Modules #
## ( reused from previous step )##

In [1]:
import pandas as pd
import numpy as np
# df = pd.read_csv('\Coursera\Advanced_DS_Capstone_02.csv', index_col=[0])
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header = None)
df.columns = ["Feature_1", "Feature_2", "Feature_3", "Feature_4", "Label"]

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20190904164903-0000
KERNEL_ID = 80ba139d-1e5b-4652-9998-2caa2619c049


In [3]:
from pyspark.ml.feature import StringIndexer
spark_df = sqlContext.createDataFrame(df)
indexer = StringIndexer(inputCol="Label", outputCol="LabelIndex")

from pyspark.ml.feature import OneHotEncoderEstimator
encoder = OneHotEncoderEstimator(inputCols=["LabelIndex"], outputCols=["LabelVec"])

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols=["Feature_1","Feature_2","Feature_3", "Feature_4"],
                                  outputCol="FeaturesVec")

from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="FeaturesVec", outputCol="FeaturesNorm", p=1.0)

# 4. Model Definition Pipelines #
## 4.1 Multilayer-Perceptron-Classifier ##

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
mlp = MultilayerPerceptronClassifier(labelCol="LabelIndex", featuresCol="FeaturesNorm", maxIter=100, layers=[4, 3, 3], blockSize=1, seed=123)
pipelineMLP = Pipeline(stages=[indexer, encoder, vectorAssembler, normalizer, mlp])

## 4.2 Decision-Tree-Classifier ##

In [5]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="LabelIndex", featuresCol="FeaturesNorm")
pipelineDT = Pipeline(stages=[indexer, encoder, vectorAssembler, normalizer, dt])

## 4.3 Random-Forest-Classifier ##

In [6]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="LabelIndex", featuresCol="FeaturesNorm", numTrees=10)
pipelineRF = Pipeline(stages=[indexer, encoder, vectorAssembler, normalizer, rf])

# 5. Model Training and Evaluation #
## 5.1 Using Splitting in Train and Test Data ##
### 5.1.1 Randomly Splitting the Data ###

In [46]:
splits = spark_df.randomSplit([0.85, 0.15])
df_train = splits[0]
df_test = splits[1]

### 5.1.2 Normal Training on df_train###

In [47]:
modelMLP = pipelineMLP.fit(df_train)
modelDT = pipelineDT.fit(df_train)
modelRF = pipelineRF.fit(df_train)

### 5.1.3 Evaluation on df_train and df_test ###

In [48]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
multiEval = MulticlassClassificationEvaluator().setMetricName("accuracy") .setPredictionCol("prediction").setLabelCol("LabelIndex")
results = pd.DataFrame(columns = ['MultiLayerPerceptron', 'DecisionTree', 'RandomForest'], index =['normal_df_train', 'normal_df_test', 'crossVal_df_train', 'crossVal_df_test']) 
prediction = modelMLP.transform(df_train)
results.iloc[0,0] = multiEval.evaluate(prediction)
prediction = modelDT.transform(df_train)
results.iloc[0,1] = multiEval.evaluate(prediction)
prediction = modelRF.transform(df_train)
results.iloc[0,2] = multiEval.evaluate(prediction)

prediction = modelMLP.transform(df_test)
results.iloc[1,0] = multiEval.evaluate(prediction)
prediction = modelDT.transform(df_test)
results.iloc[1,1] = multiEval.evaluate(prediction)
prediction = modelRF.transform(df_test)
results.iloc[1,2] = multiEval.evaluate(prediction)

Unnamed: 0,MultiLayerPerceptron,DecisionTree,RandomForest
normal_df_train,0.976744,1.0,1.0
normal_df_test,0.952381,0.904762,0.952381
crossVal_df_train,,,
crossVal_df_test,,,


## 5.2 Using CrossValidation ##
### 5.2.1 Setting up CrossValidators - using Model Pipelines ###

In [7]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
paramGrid = ParamGridBuilder().build()

crossvalMLP = CrossValidator(estimator=pipelineMLP, estimatorParamMaps=paramGrid, evaluator=multiEval, numFolds=5)
crossvalDT = CrossValidator(estimator=pipelineDT, estimatorParamMaps=paramGrid, evaluator=multiEval, numFolds=5)
crossvalRF = CrossValidator(estimator=pipelineRF, estimatorParamMaps=paramGrid, evaluator=multiEval, numFolds=5)

### 5.2.2 Training ###

In [49]:
cvModelMLP = crossvalMLP.fit(df_train)
cvModelDT = crossvalDT.fit(df_train)
cvModelRF = crossvalRF.fit(df_train)

### 5.2.3 Evaluation ###

In [50]:
prediction = cvModelMLP.transform(df_train)
results.iloc[2,0] = multiEval.evaluate(prediction)
prediction = cvModelDT.transform(df_train)
results.iloc[2,1] = multiEval.evaluate(prediction)
prediction = cvModelRF.transform(df_train)
results.iloc[2,2] = multiEval.evaluate(prediction)

prediction = cvModelMLP.transform(df_test)
results.iloc[3,0] = multiEval.evaluate(prediction)
prediction = cvModelDT.transform(df_test)
results.iloc[3,1] = multiEval.evaluate(prediction)
prediction = cvModelRF.transform(df_test)
results.iloc[3,2] = multiEval.evaluate(prediction)
results

Unnamed: 0,MultiLayerPerceptron,DecisionTree,RandomForest
normal_df_train,0.976744,1.0,1.0
normal_df_test,0.952381,0.904762,0.952381
crossVal_df_train,0.976744,1.0,1.0
crossVal_df_test,0.952381,0.904762,0.952381


# 6. Model improvement #
Lets try to improve the Multilayer Perceptron Model
## 6.1 Model improvement using Hyperparameter Tuning ##

### 6.1.1 Hyperparameter Tuning Pipeline ###

In [61]:
resultsTuned = pd.DataFrame(columns = ['TrainingData', 'TestData'], index =['Baseline', 'HyperParamTuning', 'FeatureEngineering1', 'FeatureEngineering2']) 
resultsTuned.iloc[0,0] = results.iloc[0,0]
resultsTuned.iloc[0,1] = results.iloc[1,0]
paramGrid = ParamGridBuilder() \
    .addGrid(mlp.layers, [[4, 1, 3], [4, 2, 3], [4, 3, 3], [4, 4, 3], [4, 6, 3]]) \
    .addGrid(mlp.maxIter, [50, 100, 150]) \
    .build()
#  .addGrid(normalizer.p, [1.0, 2.0, 10.0]) \
crossvalMLP = CrossValidator(estimator=pipelineMLP, estimatorParamMaps=paramGrid, evaluator=multiEval, numFolds=5)

### 6.1.2 Hyperparameter Tuning ###

In [62]:
cvModelMLP = crossvalMLP.fit(df_train)
prediction = cvModelMLP.transform(df_train)
resultsTuned.iloc[1,0] = multiEval.evaluate(prediction)

### 6.1.3 Hyperparameter Tuning Validation ###

In [63]:
prediction = cvModelMLP.transform(df_test)
resultsTuned.iloc[1,1] = multiEval.evaluate(prediction)

Unnamed: 0,TrainingData,TestData
Baseline,0.976744,0.952381
HyperParamTuning,0.976744,0.952381
FeatureEngineering1,,
FeatureEngineering2,,


## 6.2 Model improvement using PCA transformed Features and Hyperparameter Tuning ##
### 6.2.1 Alternative Feature Engineering Pipeline using PCA ###

In [64]:
from pyspark.ml.feature import PCA
pca = PCA(k=3, inputCol="FeaturesNorm", outputCol="FeaturesPCA")
mlpPCA = MultilayerPerceptronClassifier(labelCol="LabelIndex", featuresCol="FeaturesPCA", maxIter=100, layers=[3, 3, 3], blockSize=1, seed=123)
pipelinePCAMLP = Pipeline(stages=[indexer, encoder, vectorAssembler, normalizer, pca, mlpPCA])

paramGrid = ParamGridBuilder() \
    .addGrid(mlp.layers, [[3, 1, 3], [3, 2, 3], [3, 3, 3]]) \
    .addGrid(mlp.maxIter, [50, 100, 150]) \
    .build()

#  .addGrid(normalizer.p, [1.0, 2.0, 10.0]) \
crossvalPCAMLP = CrossValidator(estimator=pipelinePCAMLP, estimatorParamMaps=paramGrid, evaluator=multiEval, numFolds=5)

cvModelPCAMLP = crossvalPCAMLP.fit(df_train)
prediction = cvModelPCAMLP.transform(df_train)
resultsTuned.iloc[2,0] = multiEval.evaluate(prediction)

prediction = cvModelPCAMLP.transform(df_test)
resultsTuned.iloc[2,1] = multiEval.evaluate(prediction)

Unnamed: 0,TrainingData,TestData
Baseline,0.976744,0.952381
HyperParamTuning,0.976744,0.952381
FeatureEngineering1,0.976744,0.952381
FeatureEngineering2,,


### 6.2.2 Alternative Feature Engineering Pipeline using Scaling ###

In [65]:
pipelineScaledMLP = Pipeline(stages=[indexer, encoder, vectorAssembler, normalizer, mlp])
paramGrid = ParamGridBuilder() \
    .addGrid(mlp.maxIter, [50, 100, 150]) \
    .addGrid(normalizer.p, [1.0, 2.0, 10.0]) \
    .build()

cvScaledMLP = CrossValidator(estimator=pipelineScaledMLP, estimatorParamMaps=paramGrid, evaluator=multiEval, numFolds=5)

cvScaledMLPModell = cvScaledMLP.fit(df_train)
prediction = cvScaledMLPModell.transform(df_train)
resultsTuned.iloc[3,0] = multiEval.evaluate(prediction)

prediction = cvScaledMLPModell.transform(df_test)
resultsTuned.iloc[3,1] = multiEval.evaluate(prediction)
resultsTuned

Unnamed: 0,TrainingData,TestData
Baseline,0.976744,0.952381
HyperParamTuning,0.976744,0.952381
FeatureEngineering1,0.976744,0.952381
FeatureEngineering2,0.976744,0.952381


# 7. Summary #

In 5.2.3 Evaluation I got exactly the same results regarding the accuracy of 3 different algorithms for both: 
    (1) classic training 
    (2) crossvalidation training
As the evaluation on before unseen data shows, using both training methods resulted in overfitting. This overfitting wasnt observable by using crossvalidation and was detectable only by an evaluation on before unseen data.
Hyperparameter tuning and two feature engineering modifications did unfortunately not result an improved multilayerperceptron model. Beside of that the result are still very good.


# 8. Conclusion#
Conclusion:
Whenever possible the evaluation should not be based on crossvalidation only, instead evaluation should be done on before unseen data whenever possible.