<a href="https://colab.research.google.com/github/veroorli/ProjetProg/blob/master/TME622.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

* Master DAC, BDLE, 2022
* Author: Mohamed-Amine Baazizi
* Affiliation: LIP6 - Faculté des Sciences - Sorbonne Université
* Email: mohamed-amine.baazizi@lip6.fr

# Spark Setup (with Deequ enabled)

## vectors

## Decision tree (DT) data loading

In [None]:
# DDL-style schema: all five columns, including the label, are read as strings.
schema = 'age string, income string, student string, credit_rating string, label string'

# `tuples` (the raw rows) and `spark` are defined outside this cell.
# Distribute the rows as an RDD first, then attach the schema.
rows_rdd = spark.sparkContext.parallelize(tuples)
data = rows_rdd.toDF(schema)

data.printSchema()
data.show()

## Transformations

### String indexer

In [None]:
# Index a single string column: `age` -> `indexed_age`.
field = 'age'
age_indexer = StringIndexer(inputCol=field, outputCol='indexed_' + field)

# Fit learns the string-to-index mapping; transform appends the indexed column.
age_indexer_model = age_indexer.fit(data)
df_age_idx = age_indexer_model.transform(data)
df_age_idx.show()


In [None]:
def string_index_cols(cols, prefix):
    """Build an unfitted ``StringIndexer`` covering several columns at once.

    Args:
        cols: Names of the string columns to index. Any iterable is
            accepted; it is materialised into a list so that a one-shot
            iterable (e.g. a generator) is not exhausted before it is
            handed to the indexer.
        prefix: String prepended to each input column name to form the
            name of the corresponding output column.

    Returns:
        A ``StringIndexer`` whose ``inputCols`` are the given columns and
        whose ``outputCols`` are ``prefix + column`` for each of them.
    """
    in_cols = list(cols)
    out_cols = [prefix + name for name in in_cols]
    return StringIndexer(inputCols=in_cols, outputCols=out_cols)
  

# Example:
# si = string_index_cols(['age','income'], 'indexed_')
# si.getOutputCols()

In [None]:
# Index `age` and `income` in one pass with the multi-column helper.
prefix = 'indexed_'
fields = ['age', 'income']
age_income_indexer = string_index_cols(fields, prefix)

age_income_model = age_income_indexer.fit(data)
df_age_income_idx = age_income_model.transform(data)
df_age_income_idx.show()

### IndexToString

In [None]:
# Reverse of the string indexer: reads the column written by `age_indexer`
# and writes the recovered strings to `original_age`.
age_rev_indexer = IndexToString(
    inputCol=age_indexer.getOutputCol(),
    outputCol='original_age',
)

df_orig_age = age_rev_indexer.transform(df_age_idx)
df_orig_age.show()


### one-hot encoder

In [None]:
# One-hot encode the indexed age column into `cat_age`.
# dropLast=False keeps a vector slot for every category instead of
# omitting the last one.
age_onehotenc = OneHotEncoder(
    inputCol=age_indexer.getOutputCol(),
    outputCol='cat_age',
    dropLast=False,
)
age_onehot_model = age_onehotenc.fit(df_age_idx)
df_age_onehot = age_onehot_model.transform(df_age_idx)
df_age_onehot.show()

# Leftover Scala sketch of a multi-column variant, kept for reference:
#   .setInputCols(Array("indexed_age", "indexed_income"))
#   .setOutputCols(Array("category_age", "category_income"))
#   .setDropLast(false)
# val encoded = oneHotEncoder.fit(data).transform(data)

### vector assembler

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# Pack the two indexed columns into a single vector column `ageIncomeVec`.
cols = ['indexed_age', 'indexed_income']
vec_assembler = VectorAssembler(
    inputCols=cols,
    outputCol='ageIncomeVec',
)

df_age_income_vec = vec_assembler.transform(df_age_income_idx)
df_age_income_vec.show()

### Vector Indexer

In [None]:
from pyspark.ml.feature import VectorIndexer


In [None]:
# Index the assembled vector; maxCategories=3 is the distinct-value
# threshold VectorIndexer uses to decide which features are categorical.
vecIndexer = VectorIndexer(
    inputCol='ageIncomeVec',
    outputCol='indexed_ageIncomeVec',
    maxCategories=3,
)
vec_indexer_model = vecIndexer.fit(df_age_income_vec)
df_age_income_vec_idx = vec_indexer_model.transform(df_age_income_vec)

df_age_income_vec_idx.show()


## Pipelines

#### string indexer

In [None]:
# Split the column names into the target column and the feature columns.
label = 'label'
all_cols = data.columns
# .index raises ValueError when the label column is missing.
label_pos = all_cols.index(label)
# Drop only the first occurrence of the label, keeping column order.
features_col = all_cols[:label_pos] + all_cols[label_pos + 1:]

In [None]:
# Prefix prepended to a column name to form its indexed counterpart.
prefix = 'indexed_'

In [None]:
# Pipeline stage 1: index the target column (`label` -> `indexed_label`).
label_string_indexer = StringIndexer(
    inputCol=label,
    outputCol=prefix + label,
)

In [None]:
# Pipeline stage 2: index every feature column, writing each result to
# the same column name with `prefix` in front.
features_str_col = [prefix + name for name in features_col]
features_string_indexer = StringIndexer(
    inputCols=features_col,
    outputCols=features_str_col,
)


#### vector assembler and indexer

In [None]:
# Pipeline stage 3: gather all indexed feature columns into one `vector`
# column. Note this rebinds `vec_assembler` from the earlier demo cell.
vec_assembler = VectorAssembler(
    inputCols=features_string_indexer.getOutputCols(),
    outputCol='vector',
)


In [None]:
# Pipeline stage 4: index the assembled vector into the final `features`
# column consumed by the classifier.
vec_indexer = VectorIndexer(
    inputCol='vector',
    outputCol='features',
    maxCategories=3,
)

#### pipeline building

In [None]:
# Stages run in list order: index label, index features, assemble, index vector.
stages = [
    label_string_indexer,
    features_string_indexer,
    vec_assembler,
    vec_indexer,
]

In [None]:
from pyspark.ml import Pipeline 


In [None]:
# Fit the whole preprocessing pipeline on `data`, apply it to the same
# data, and keep only the two columns the classifier needs.
pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(data)
prepared = pipeline_model.transform(data)
train_data = prepared.select("features", "indexed_label")
train_data.show()


## DT inference

In [None]:
from pyspark.ml.classification import DecisionTreeClassificationModel, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [None]:
# Train a decision tree on the prepared features / indexed label.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="indexed_label",
)
dtModel = dt.fit(train_data)
# Bare expression so the notebook displays the fitted model summary.
dtModel

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c7ec3cc2c2ef, depth=4, numNodes=13, numClasses=2, numFeatures=4

In [None]:
# Print the textual description of the learned tree.
tree_description = dtModel.toDebugString
print(tree_description)

## Model Selection and Tuning

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder



In [None]:

# Hyper-parameter grid for the decision tree: 2 x 2 = 4 combinations.
# NOTE(review): minInstancesPerNode of 10 / 100 requires every child node
# to hold at least that many rows; on a small training set this may
# prevent any split at all -- confirm against the dataset size.
dt_paramGrid = (
    ParamGridBuilder()
    .addGrid(dt.maxBins, [40, 42])
    .addGrid(dt.minInstancesPerNode, [10, 100])
    .build()
)
# Bare expression so the notebook displays the generated grid.
dt_paramGrid

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Binary-classification evaluators used to score the decision tree.
# They read the classifier's `rawPrediction` column (per-class scores)
# instead of the hard 0/1 `prediction` column: with thresholded predictions
# the PR / ROC curves collapse to a single operating point, which makes
# the area-under-curve metrics uninformative for model selection.
evaluatorPR = BinaryClassificationEvaluator(
    labelCol="indexed_label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderPR",
)
evaluatorAUC = BinaryClassificationEvaluator(
    labelCol="indexed_label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Cross-validation setup: 5 folds over the decision-tree parameter grid,
# scored with the area-under-PR evaluator, with parallelism set to 2.
cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=dt_paramGrid,
    evaluator=evaluatorPR,
    numFolds=5,
    parallelism=2,
)


In [None]:
# Run the cross-validated grid search on the prepared training data.
cvModel = cv.fit(train_data)

In [None]:
# Retrieve the model selected by cross-validation and print its tree.
bestModel = cvModel.bestModel
best_tree_description = bestModel.toDebugString
print(best_tree_description)

In [None]:
# Apply the cross-validated model to the training data itself
# (no held-out set is used in this cell) and display the predictions.
train_pred = cvModel.transform(train_data)
train_pred.show()