In [1]:
# !pip install pipenv
# !pipenv install


In [2]:
import os
import sys

import pyspark.pandas as ps
from pyspark.conf import SparkConf
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.sql import SparkSession




In [3]:
# Create PySpark SparkSession
sparkConf = (
    SparkConf()
    .setAppName("ECD_TCC")
    .setMaster("local[*]")
    .setAll([(k, v) for k, v in {
        "spark.driver.memory": "12g",
        "spark.executor.memory": "12g",
        "spark.sql.warehouse.dir": "spark-warehouse"
    }.items()])
)
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


In [4]:
spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()


In [5]:
datasetsPath = "../datasets"
modelsPath = f"{datasetsPath}/model"
features_columns = list(spark.read.json(f"{datasetsPath}/features").toPandas()["value"])
features_columns.append("EVOLUCAO")
raw_data = spark.read.csv(f"{datasetsPath}/raw", header=True, sep=";").select(features_columns).cache()
training_data = spark.read.parquet(f"{datasetsPath}/training").cache()
test_data = spark.read.parquet(f"{datasetsPath}/test").cache()

In [6]:
predictionCols = ["label", "prediction", "rawPrediction", "probability", "features"]
prediction_cols = ["label", "prediction", "rawPrediction", "probability", "features"]
columnsMetrics = ["modelo", "tempo de treino", "f1", "accuracy", "weightedPrecision", "weightedRecall"]
evaluator = MulticlassClassificationEvaluator()


In [7]:
raw_data.pandas_api().head()


Unnamed: 0,CARDIOPATI,HEMATOLOGI,NEUROLOGIC,PNEUMOPATI,IMUNODEPRE,SIND_DOWN,OBESIDADE,HEPATICA,DIABETES,RENAL,ASMA,EVOLUCAO
0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1
1,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2
2,,,,,,,,,,,,1
3,,,,,,,,,1.0,,,1
4,,,,,,,,,,,,1


In [8]:
training_data.pandas_api().head()

Unnamed: 0,CARDIOPATI,HEMATOLOGI,SIND_DOWN,HEPATICA,ASMA,DIABETES,NEUROLOGIC,PNEUMOPATI,IMUNODEPRE,RENAL,OBESIDADE,EVOLUCAO,label,features
0,0,0,0,0,0,0,0,0,0,0,0,2,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,0,0,0,0,0,0,0,0,0,0,0,2,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,0,0,0,0,0,0,0,0,0,0,0,2,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,0,0,0,0,0,0,0,0,0,0,0,2,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,0,0,0,0,0,0,0,0,0,0,0,2,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [9]:
training_data.pandas_api().corr()


Unnamed: 0,CARDIOPATI,HEMATOLOGI,SIND_DOWN,HEPATICA,ASMA,DIABETES,NEUROLOGIC,PNEUMOPATI,IMUNODEPRE,RENAL,OBESIDADE,label
CARDIOPATI,1.0,0.015322,0.001932,0.019733,0.017607,0.295425,0.063263,0.083807,0.00322,0.109294,0.080128,0.144271
HEMATOLOGI,0.015322,1.0,0.03867,0.054904,0.020365,0.0084,0.022289,0.026266,0.090507,0.0314,0.00343,0.018789
SIND_DOWN,0.001932,0.03867,1.0,0.038698,0.022756,0.003409,0.018303,0.012879,0.009508,0.010215,0.012351,0.007439
HEPATICA,0.019733,0.054904,0.038698,1.0,0.017758,0.025551,0.019534,0.02651,0.050471,0.047893,0.007797,0.034582
ASMA,0.017607,0.020365,0.022756,0.017758,1.0,0.007869,0.002715,0.056648,0.001986,0.00546,0.050318,-0.011624
DIABETES,0.295425,0.0084,0.003409,0.025551,0.007869,1.0,0.018184,0.024878,-0.00186,0.095495,0.076988,0.119442
NEUROLOGIC,0.063263,0.022289,0.018303,0.019534,0.002715,0.018184,1.0,0.045663,0.020105,0.027833,-0.018928,0.071673
PNEUMOPATI,0.083807,0.026266,0.012879,0.02651,0.056648,0.024878,0.045663,1.0,0.038588,0.04225,0.012449,0.066175
IMUNODEPRE,0.00322,0.090507,0.009508,0.050471,0.001986,-0.00186,0.020105,0.038588,1.0,0.062111,-0.012802,0.046949
RENAL,0.109294,0.0314,0.010215,0.047893,0.00546,0.095495,0.027833,0.04225,0.062111,1.0,0.00772,0.081401


## Regressão Logística

In [10]:
lr_model = CrossValidatorModel.load(f"{modelsPath}/spark-model-lr")
lr_predictions = lr_model.transform(test_data)

metricsDF = spark.createDataFrame([[
    "regressão logística",
    3219,
    evaluator.evaluate(lr_predictions, {evaluator.metricName: "weightedFMeasure"}),
    evaluator.evaluate(lr_predictions, {evaluator.metricName: "accuracy"}),
    evaluator.evaluate(lr_predictions, {evaluator.metricName: "weightedPrecision"}),
    evaluator.evaluate(lr_predictions, {evaluator.metricName: "weightedRecall"})
]], columnsMetrics)
metricsDF.show()
metricsDF.write.mode("append").saveAsTable("metrics")

best_lr_pipeline = lr_model.bestModel
best_lr_model = best_lr_pipeline.stages[-1]
list_coefficients_lr = best_lr_model.coefficients
intercept = best_lr_model.intercept
columns_coefficients = list(zip(features_columns, list_coefficients_lr))
frame = ps.DataFrame(columns_coefficients, columns=["feature", "coefficient"])
print(f"Intercept: {intercept}")
frame.sort_values("coefficient")


+-------------------+---------------+------------------+-----------------+------------------+-----------------+
|             modelo|tempo de treino|                f1|         accuracy| weightedPrecision|   weightedRecall|
+-------------------+---------------+------------------+-----------------+------------------+-----------------+
|regressão logística|           3219|0.5939809636583079|0.594266296618053|0.5945083183974529|0.594266296618053|
+-------------------+---------------+------------------+-----------------+------------------+-----------------+

Intercept: -0.045680844076801515


Unnamed: 0,feature,coefficient
4,IMUNODEPRE,-0.002257
2,NEUROLOGIC,0.0
1,HEMATOLOGI,0.022409
10,ASMA,0.030099
8,DIABETES,0.048977
5,SIND_DOWN,0.054396
3,PNEUMOPATI,0.05636
0,CARDIOPATI,0.059478
7,HEPATICA,0.063412
6,OBESIDADE,0.064738


## Floresta aleatória

In [11]:
rf_model = CrossValidatorModel.load(f"{modelsPath}/spark-model-rf")
rf_predictions = rf_model.transform(test_data)
print(f"Avaliação do modelo de floresta aleatória: {evaluator.evaluate(rf_predictions)}")


Avaliação do modelo de floresta aleatória: 0.5939556127541818


## Multilayer perceptron

In [12]:
mlp_model = CrossValidatorModel.load(f"{modelsPath}/spark-model-mlp")
mlp_predictions = mlp_model.transform(test_data)
print(f"Avaliação do modelo de multilayer perceptron: {evaluator.evaluate(mlp_predictions)}")


Avaliação do modelo de multilayer perceptron: 0.5948099511166423
