## European Soccer Events Analysis: Machine Learning

In this notebook, we use a [Gradient-boosted tree](https://spark.apache.org/docs/2.2.0/ml-classification-regression.html#gradient-boosted-tree-classifier) classifier to fit a model on transformed soccer events data. The model can help predict whether a given combination of on-field conditions leads to a goal.

In [2]:
%sql USE EURO_SOCCER_DB
-- Makes EURO_SOCCER_DB the current database, so the queries in this notebook can name its tables without a database prefix.

In [3]:
%sql SELECT * FROM GAME_EVENTS
-- Previews every column of GAME_EVENTS: numeric event codes alongside their *_str labels, plus the is_goal flag used as the model label.

id_odsp,id_event,sort_order,time,event_type,event_type_str,event_type2,event_type2_str,side,side_str,event_team,opponent,player,player2,player_in,player_out,shot_place,shot_place_str,shot_outcome,shot_outcome_str,is_goal,location,location_str,bodypart,bodypart_str,assist_method,assist_method_str,situation,situation_str,time_bin,country_code
UFot0hit/,UFot0hit1,1,2,1,Attempt,12,Key Pass,2,Away,Hamburg SV,Borussia Dortmund,mladen petric,gokhan tore,,,6,High and wide,2,Off target,0,9,Left side of the box,2,Left foot,1,Pass,1,Open play,0.0,DEU
UFot0hit/,UFot0hit2,2,4,2,Corner,99,,1,Home,Borussia Dortmund,Hamburg SV,dennis diekmeier,dennis diekmeier,,,99,,99,,0,99,,99,,0,,99,,0.0,DEU
UFot0hit/,UFot0hit3,3,4,2,Corner,99,,1,Home,Borussia Dortmund,Hamburg SV,heiko westermann,heiko westermann,,,99,,99,,0,99,,99,,0,,99,,0.0,DEU
UFot0hit/,UFot0hit4,4,7,3,Foul,99,,1,Home,Borussia Dortmund,Hamburg SV,sven bender,,,,99,,99,,0,99,,99,,0,,99,,0.0,DEU
UFot0hit/,UFot0hit5,5,7,8,Free kick won,99,,2,Away,Hamburg SV,Borussia Dortmund,gokhan tore,,,,99,,99,,0,2,Defensive half,99,,0,,99,,0.0,DEU
UFot0hit/,UFot0hit6,6,9,10,Hand ball,99,,2,Away,Hamburg SV,Borussia Dortmund,jose paolo guerrero,,,,99,,99,,0,99,,99,,0,,99,,0.0,DEU
UFot0hit/,UFot0hit7,7,10,2,Corner,99,,2,Away,Hamburg SV,Borussia Dortmund,lukasz piszczek,lukasz piszczek,,,99,,99,,0,99,,99,,0,,99,,0.0,DEU
UFot0hit/,UFot0hit8,8,11,8,Free kick won,99,,1,Home,Borussia Dortmund,Hamburg SV,chris lowe,,,,99,,99,,0,2,Defensive half,99,,0,,99,,0.0,DEU
UFot0hit/,UFot0hit9,9,11,3,Foul,99,,2,Away,Hamburg SV,Borussia Dortmund,gojko kacar,,,,99,,99,,0,99,,99,,0,,99,,0.0,DEU
UFot0hit/,UFot0hit10,10,13,3,Foul,99,,2,Away,Hamburg SV,Borussia Dortmund,gokhan tore,,,,99,,99,,0,99,,99,,0,,99,,1.0,DEU


In [4]:
# Keep only the categorical descriptor columns used as model inputs plus the
# is_goal label; every other GAME_EVENTS column is dropped here.
gameEventsDf = spark.sql(
    "select event_type_str, event_team, shot_place_str, location_str, "
    "assist_method_str, situation_str, country_code, is_goal "
    "from game_events"
)

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [6]:
# Categorical input columns. Each one is string-indexed, one-hot encoded and
# then assembled into the model's feature vector, in this order.
categFeatures = [
    "event_type_str",
    "event_team",
    "shot_place_str",
    "location_str",
    "assist_method_str",
    "situation_str",
    "country_code",
]

In [7]:
# One StringIndexer per categorical column: reads "<column>" and writes the
# numeric category index to "<column>_idx".
stringIndexers = [
    StringIndexer(inputCol=featureName, outputCol=featureName + "_idx")
    for featureName in categFeatures
]

In [8]:
# One OneHotEncoder per categorical column: reads the "<column>_idx" index
# and writes a sparse indicator vector to "<column>_vec".
encoders = [
    OneHotEncoder(inputCol=featureName + "_idx", outputCol=featureName + "_vec")
    for featureName in categFeatures
]

In [9]:
# Concatenate all "<column>_vec" one-hot vectors, in categFeatures order,
# into the single "features" vector column the classifier consumes.
encodedColumns = [featureName + "_vec" for featureName in categFeatures]
featureAssembler = VectorAssembler(inputCols=encodedColumns, outputCol="features")
# Echo the configured stage as the cell output.
featureAssembler

In [10]:
# Gradient-boosted tree classifier: learns is_goal from the assembled
# "features" vector using 20 boosting iterations of depth-5 trees.
gbtClassifier = GBTClassifier(
    featuresCol="features",
    labelCol="is_goal",
    maxIter=20,
    maxDepth=5,
)

# Stage order matters: indexers write the *_idx columns the encoders read,
# encoders write the *_vec columns the assembler reads, and the assembler
# writes the "features" column the classifier reads.
pipelineStages = list(stringIndexers)  # copy so stringIndexers is not mutated
pipelineStages += encoders
pipelineStages += [featureAssembler, gbtClassifier]
pipeline = Pipeline(stages=pipelineStages)

In [11]:
# Hold out 25% of the events for evaluation and fit the whole pipeline
# (indexers, encoders, assembler, classifier) on the remaining 75%.
# The split is seeded so that re-running the notebook yields the same
# train/test partition and therefore comparable model metrics; without a seed
# randomSplit picks a new random partition on every run.
splitSeed = 42
trainingData, testData = gameEventsDf.randomSplit([0.75, 0.25], seed=splitSeed)
model = pipeline.fit(trainingData)

In [12]:
# Run the fitted pipeline over the held-out rows, then show the predicted
# class next to the true is_goal label and the feature vector it was based on.
predictions = model.transform(testData)
previewColumns = ["prediction", "is_goal", "features"]
display(predictions.select(*previewColumns))

prediction,is_goal,features
0.0,0,"List(0, 195, List(2, 96, 161, 167, 185, 188, 193), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0.0,0,"List(0, 195, List(2, 96, 161, 167, 185, 188, 193), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0.0,0,"List(0, 195, List(2, 96, 161, 179, 183, 193), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0.0,0,"List(0, 195, List(2, 96, 152, 167, 185, 188, 193), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0.0,0,"List(0, 195, List(2, 96, 152, 167, 185, 188, 193), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0.0,0,"List(0, 195, List(2, 96, 152, 167, 185, 188, 193), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0.0,0,"List(0, 195, List(2, 96, 152, 167, 186, 188, 193), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0.0,0,"List(0, 195, List(2, 96, 152, 167, 183, 188, 193), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0.0,0,"List(0, 195, List(2, 96, 152, 167, 184, 188, 193), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0.0,0,"List(0, 195, List(2, 96, 152, 167, 184, 188, 193), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"


In [13]:
# Evaluate the model by area under the ROC curve on the held-out predictions.
# The evaluator must be given the model's continuous score column
# ("rawPrediction"), not the thresholded 0/1 "prediction" column: with hard
# labels the ROC curve collapses to a single operating point and the reported
# AUC no longer measures how well the model ranks goals above non-goals.
# NOTE(review): relies on GBTClassificationModel emitting "rawPrediction",
# which it does from Spark 2.2 (the version this notebook links to) onwards.
evaluator = BinaryClassificationEvaluator(
    labelCol="is_goal",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
evaluator.evaluate(predictions)