In [158]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.sql import SparkSession

In [159]:
# Start (or reuse) a Spark session running locally with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Read the games CSV, taking column names from the header row and letting
# Spark infer the column types. The DataFrame is persisted because the
# following cells reuse it several times.
table = (
    spark.read
    .csv("data/games.csv", header=True, inferSchema=True)
    .persist()
)

In [160]:
# Print the column names and inferred types of the loaded DataFrame.
table.printSchema()

root
 |-- gameId: long (nullable = true)
 |-- creationTime: long (nullable = true)
 |-- gameDuration: integer (nullable = true)
 |-- seasonId: integer (nullable = true)
 |-- winner: integer (nullable = true)
 |-- firstBlood: integer (nullable = true)
 |-- firstTower: integer (nullable = true)
 |-- firstInhibitor: integer (nullable = true)
 |-- firstBaron: integer (nullable = true)
 |-- firstDragon: integer (nullable = true)
 |-- firstRiftHerald: integer (nullable = true)
 |-- t1_champ1id: integer (nullable = true)
 |-- t1_champ1_sum1: integer (nullable = true)
 |-- t1_champ1_sum2: integer (nullable = true)
 |-- t1_champ2id: integer (nullable = true)
 |-- t1_champ2_sum1: integer (nullable = true)
 |-- t1_champ2_sum2: integer (nullable = true)
 |-- t1_champ3id: integer (nullable = true)
 |-- t1_champ3_sum1: integer (nullable = true)
 |-- t1_champ3_sum2: integer (nullable = true)
 |-- t1_champ4id: integer (nullable = true)
 |-- t1_champ4_sum1: integer (nullable = true)
 |-- t1_champ4_sum2

In [161]:
# Preview the first rows of the full dataset.
table.show()

+----------+-------------+------------+--------+------+----------+----------+--------------+----------+-----------+---------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-------------+-----------------+-------------+--------------+------------------+-------+-------+-------+-------+-------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-------------+-----------------+-------------+--------------+------------------+-------+-------+-------+-------+-------+
|    gameId| creationTime|gameDuration|seasonId|winner|firstBlood|firstTower|firstInhibitor|firstBaron|firstDragon|firstRiftHerald|t1_champ1id|t1_champ1_sum1|t1_champ1_sum2|t1_champ2id|t1_champ2_sum1|t1_champ

In [162]:
# Fixed seed so the random train/test split can be repeated.
seed = 11011990

# Randomly split the rows with weights 0.8 (training) and 0.2 (test).
dataTrain, dataTest = table.randomSplit([0.8, 0.2], seed=seed)

In [163]:
# Preview the first rows of the training split.
dataTrain.show()

+----------+-------------+------------+--------+------+----------+----------+--------------+----------+-----------+---------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-------------+-----------------+-------------+--------------+------------------+-------+-------+-------+-------+-------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-------------+-----------------+-------------+--------------+------------------+-------+-------+-------+-------+-------+
|    gameId| creationTime|gameDuration|seasonId|winner|firstBlood|firstTower|firstInhibitor|firstBaron|firstDragon|firstRiftHerald|t1_champ1id|t1_champ1_sum1|t1_champ1_sum2|t1_champ2id|t1_champ2_sum1|t1_champ

In [164]:
# Preview the first rows of the test split.
dataTest.show()

+----------+-------------+------------+--------+------+----------+----------+--------------+----------+-----------+---------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-------------+-----------------+-------------+--------------+------------------+-------+-------+-------+-------+-------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-------------+-----------------+-------------+--------------+------------------+-------+-------+-------+-------+-------+
|    gameId| creationTime|gameDuration|seasonId|winner|firstBlood|firstTower|firstInhibitor|firstBaron|firstDragon|firstRiftHerald|t1_champ1id|t1_champ1_sum1|t1_champ1_sum2|t1_champ2id|t1_champ2_sum1|t1_champ

In [165]:
# Feature columns: every column of the table except the id, timestamp and
# season columns and the `winner` column, which is used as the target.
# NOTE(review): the remaining columns appear to include end-of-game
# statistics (e.g. gameDuration, t1_towerKills) -- confirm that is intended
# for a model that predicts the winner.
excludedColumns = {"gameId", "creationTime", "seasonId", "winner"}
columns = [name for name in table.columns if name not in excludedColumns]
columns

['gameDuration',
 'firstBlood',
 'firstTower',
 'firstInhibitor',
 'firstBaron',
 'firstDragon',
 'firstRiftHerald',
 't1_champ1id',
 't1_champ1_sum1',
 't1_champ1_sum2',
 't1_champ2id',
 't1_champ2_sum1',
 't1_champ2_sum2',
 't1_champ3id',
 't1_champ3_sum1',
 't1_champ3_sum2',
 't1_champ4id',
 't1_champ4_sum1',
 't1_champ4_sum2',
 't1_champ5id',
 't1_champ5_sum1',
 't1_champ5_sum2',
 't1_towerKills',
 't1_inhibitorKills',
 't1_baronKills',
 't1_dragonKills',
 't1_riftHeraldKills',
 't1_ban1',
 't1_ban2',
 't1_ban3',
 't1_ban4',
 't1_ban5',
 't2_champ1id',
 't2_champ1_sum1',
 't2_champ1_sum2',
 't2_champ2id',
 't2_champ2_sum1',
 't2_champ2_sum2',
 't2_champ3id',
 't2_champ3_sum1',
 't2_champ3_sum2',
 't2_champ4id',
 't2_champ4_sum1',
 't2_champ4_sum2',
 't2_champ5id',
 't2_champ5_sum1',
 't2_champ5_sum2',
 't2_towerKills',
 't2_inhibitorKills',
 't2_baronKills',
 't2_dragonKills',
 't2_riftHeraldKills',
 't2_ban1',
 't2_ban2',
 't2_ban3',
 't2_ban4',
 't2_ban5']

In [166]:
# Pack all selected feature columns into a single vector column "features".
assembler = VectorAssembler(inputCols=columns, outputCol="features")

In [167]:
# Encode the "winner" column as a numeric "label" column for the classifier.
# With StringIndexer's default ordering the most frequent value becomes 0.0.
indexer = StringIndexer(inputCol="winner", outputCol="label")

In [168]:
# Base random-forest estimator. It relies on the default "features" and
# "label" column names, which match the outputs of the assembler and indexer.
# The maxDepth and impurity given here are only starting values: the
# cross-validation parameter grid overrides both for every candidate model.
# The seed is set explicitly so the forest's random sampling is repeatable
# across kernel restarts instead of depending on PySpark's per-process default.
randomForestClassifier = RandomForestClassifier(
    impurity="gini",
    maxDepth=3,
    numTrees=20,
    featureSubsetStrategy="auto",
    seed=seed
)

In [169]:
# Pipeline stages in execution order: assemble features, index the label,
# then fit the random forest.
stages = (assembler, indexer, randomForestClassifier)

In [170]:
# Chain the stages into a single estimator that can be fitted as one unit.
pipeline = Pipeline(stages=stages)

In [171]:
# Metric used for model selection and final scoring: area under the ROC
# curve, computed against the "label" column.
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")

In [172]:
# Hyper-parameter search space for the random forest:
# 3 maxBins values x 3 maxDepth values x 2 impurity measures = 18 candidates.
paramGrid = (
    ParamGridBuilder()
    .addGrid(randomForestClassifier.maxBins, [25, 28, 31])
    .addGrid(randomForestClassifier.maxDepth, [4, 6, 8])
    .addGrid(randomForestClassifier.impurity, ["entropy", "gini"])
    .build()
)

In [173]:
# 5-fold cross-validation over the parameter grid, selecting the candidate
# with the best evaluator metric.
# The seed fixes how rows are assigned to folds, so the selected model and its
# score can be reproduced on a fresh kernel.
cv = CrossValidator(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    numFolds=5,
    seed=seed
)

In [174]:
# Run the cross-validated grid search on the training split only; the test
# split is kept aside for the final evaluation.
cvModel = cv.fit(dataTrain)

In [175]:
# Apply the fitted cross-validation model to the held-out test split.
cvPredictionDf = cvModel.transform(dataTest)
# Preview the test rows together with the columns added by the model.
cvPredictionDf.show()

+----------+-------------+------------+--------+------+----------+----------+--------------+----------+-----------+---------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-------------+-----------------+-------------+--------------+------------------+-------+-------+-------+-------+-------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-----------+--------------+--------------+-------------+-----------------+-------------+--------------+------------------+-------+-------+-------+-------+-------+--------------------+-----+--------------------+--------------------+----------+
|    gameId| creationTime|gameDuration|seasonId|winner|firstBlood|firstTower|firstInhibitor|firstBaron|firstDragon|firstRiftHera

In [178]:
# Area under the ROC curve of the selected model on the test predictions.
evaluator.evaluate(cvPredictionDf)

0.9967855903570569

In [None]:
# Save the fitted cross-validation model to disk, replacing any model already
# stored at this path.
cvModel.write().overwrite().save("data/model/lol")

In [177]:
# Load the saved model back from its directory.
# Requires: from pyspark.ml.tuning import CrossValidatorModel
# cvModelLoaded = CrossValidatorModel.load("data/model/lol")