Instituto Infnet

Disciplina Análise de dados com Spark

Aluno: Victor Luiz de Oliveira e Silva

In [1]:
!pip install pyspark
import pyspark



In [2]:
## Você só vai precisar dessa parte se fizer no Google Colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#Importando bibliotecas necessárias
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, desc , col, max, when, isnan, min, avg
spark = SparkSession.builder.appName("Exercicio").master('local[*]').getOrCreate()

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, OneHotEncoder, StandardScaler, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

Este problema que iremos prover uma solução é sobre empregados que podem ou não pedir demissão de uma empresa de TI. Os dados coletados trazem informação de Nível de Educação do funcionário, ano que entrou na empresa, cidade que mora, nível de pagamento, idade, sexo, se já ficou no banco da empresa, anos de experiencia e a se saiu ou não da empresa (label).

In [5]:
#Carga dos dados
df = spark.read.csv(path='drive/MyDrive/Arquivos/Employee.csv',header=True,inferSchema=True,sep = ',')

In [6]:
#Verificação dos dados
df.show()

+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|Education|JoiningYear|     City|PaymentTier|Age|Gender|EverBenched|ExperienceInCurrentDomain|LeaveOrNot|
+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|Bachelors|       2017|Bangalore|          3| 34|  Male|         No|                        0|         0|
|Bachelors|       2013|     Pune|          1| 28|Female|         No|                        3|         1|
|Bachelors|       2014|New Delhi|          3| 38|Female|         No|                        2|         0|
|  Masters|       2016|Bangalore|          3| 27|  Male|         No|                        5|         1|
|  Masters|       2017|     Pune|          3| 24|  Male|        Yes|                        2|         1|
|Bachelors|       2016|Bangalore|          3| 22|  Male|         No|                        0|         0|
|Bachelors|       2015|New Delhi|          3| 

In [7]:
df.printSchema()

root
 |-- Education: string (nullable = true)
 |-- JoiningYear: integer (nullable = true)
 |-- City: string (nullable = true)
 |-- PaymentTier: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- EverBenched: string (nullable = true)
 |-- ExperienceInCurrentDomain: integer (nullable = true)
 |-- LeaveOrNot: integer (nullable = true)



In [8]:
#Transformação de atributos categóricos  em uma única coluna
GenderIndex = StringIndexer(inputCols=["Gender"], outputCols=["gender_vec"])
EverBenchedIndex = StringIndexer(inputCols=["EverBenched"], outputCols=["everbenched_vec"])
CityIndexer = StringIndexer(inputCols=["City"],outputCols=["city_vec"])
EducationIndexer = StringIndexer(inputCols=["Education"],outputCols=["education_vec"])
LabelIndexer =  StringIndexer(inputCol="LeaveOrNot", outputCol="indexedLabel")
feat_cols = ["city_vec","JoiningYear","PaymentTier","everbenched_vec","Age","education_vec","ExperienceInCurrentDomain","gender_vec"]
#Montando o vetor de features
assembler = VectorAssembler(inputCols=feat_cols,outputCol="features")

In [9]:
#Divisão dos dados em treino e teste para aplicação do modelo
(trainingData, testData) = df.randomSplit([0.7, 0.3])
print(trainingData.count(), testData.count())

3220 1433


In [10]:
#Iremos utilizar o classificador Random Forest para fazer as classificações
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=20)


In [11]:
#Montando o vetor de features
pipeline = Pipeline(stages=[GenderIndex,EverBenchedIndex,CityIndexer,EducationIndexer,LabelIndexer,assembler, rf])

In [12]:
#Executando o fit do pipeline nos dados de treino
model = pipeline.fit(trainingData)

In [13]:
predictions = model.transform(testData)
predictions.show()

+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+----------+---------------+--------+-------------+------------+--------------------+--------------------+--------------------+----------+
|Education|JoiningYear|     City|PaymentTier|Age|Gender|EverBenched|ExperienceInCurrentDomain|LeaveOrNot|gender_vec|everbenched_vec|city_vec|education_vec|indexedLabel|            features|       rawPrediction|         probability|prediction|
+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+----------+---------------+--------+-------------+------------+--------------------+--------------------+--------------------+----------+
|Bachelors|       2012|Bangalore|          1| 27|  Male|         No|                        5|         0|       0.0|            0.0|     0.0|          0.0|         0.0|(8,[1,2,4,6],[201...|[14.0198068030814...|[0.70099034015407...|       0.0|
|Bachelors|       2012|Banga

In [14]:
predictions.select("prediction", "IndexedLabel", "features").show(20)


+----------+------------+--------------------+
|prediction|IndexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|(8,[1,2,4,6],[201...|
|       1.0|         0.0|[0.0,2012.0,1.0,0...|
|       1.0|         0.0|[0.0,2012.0,1.0,1...|
|       0.0|         0.0|(8,[1,2,4,6],[201...|
|       0.0|         0.0|(8,[1,2,4],[2012....|
|       0.0|         0.0|(8,[1,2,4,6],[201...|
|       0.0|         1.0|(8,[1,2,4],[2012....|
|       0.0|         1.0|[0.0,2012.0,3.0,0...|
|       0.0|         0.0|(8,[1,2,4,6],[201...|
|       0.0|         0.0|(8,[1,2,4,6],[201...|
|       0.0|         0.0|(8,[1,2,4,6],[201...|
|       0.0|         0.0|(8,[1,2,4,6],[201...|
|       0.0|         0.0|[0.0,2012.0,3.0,0...|
|       0.0|         1.0|[0.0,2012.0,3.0,0...|
|       0.0|         0.0|[0.0,2012.0,3.0,1...|
|       0.0|         0.0|(8,[1,2,4,6],[201...|
|       0.0|         0.0|(8,[1,2,4,6],[201...|
|       0.0|         0.0|(8,[1,2,4,6],[201...|
|       0.0| 

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

rfModel = model.stages[2]
print(rfModel)  # summary only

Test Error = 0.183531
StringIndexerModel: uid=StringIndexer_c54363bf1b84, handleInvalid=error, numInputCols=1, numOutputCols=1
