In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, explode, flatten, explode_outer
from pyspark.sql.types import ArrayType, IntegerType, StringType, StructType, StructField, DoubleType, LongType, MapType, BooleanType
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql.functions import col, concat
import json



# Start Spark Session

In [239]:
# Build (or reuse) a local SparkSession named 'lol' for this notebook.
spark = SparkSession.builder.appName('lol').master("local").getOrCreate()

# Later cells reference the bare name `sc`; bind it explicitly so the notebook
# does not depend on a pre-populated shell/kernel variable.
sc = spark.sparkContext

# SQLContext takes a SparkContext as its first argument and, optionally, the
# SparkSession to wrap. Passing both makes it share this notebook's session
# (and therefore its temp views) instead of receiving the session in the
# SparkContext slot.
sqlContext = SQLContext(sc, spark)


# EXTRACT DATA

# Define initial Schema

In [518]:
# Explicit schema for the match CSV. Every column is nullable.
# `participantIdentities` and `participants` are read as plain strings here;
# they are parsed into nested structures later in the notebook.
match_schema = (
    StructType()
    .add('_c0', IntegerType(), True)
    .add('gameCreation', DoubleType(), True)
    .add('gameDuration', DoubleType(), True)
    .add('gameId', DoubleType(), True)
    .add('gameMode', StringType(), True)
    .add('gameType', StringType(), True)
    .add('gameVersion', StringType(), True)
    .add('mapId', DoubleType(), True)
    .add('participantIdentities', StringType(), True)
    .add('participants', StringType(), True)
    .add('platformId', StringType(), True)
    .add('queueId', DoubleType(), True)
    .add('seasonId', DoubleType(), True)
    .add('status.message', StringType(), True)
    .add('status.status_code', StringType(), True)
)

# Schema shared by the winner and the loser CSV files (one row per team).
# `bans` is read as a raw string and converted to a struct later on.
loser_winner_schema = (
    StructType()
    .add('_c0', IntegerType(), True)
    .add('teamId', IntegerType(), True)
    .add('win', StringType(), True)
    # "first objective" flags
    .add('firstBlood', BooleanType(), True)
    .add('firstTower', BooleanType(), True)
    .add('firstInhibitor', BooleanType(), True)
    .add('firstBaron', BooleanType(), True)
    .add('firstDragon', BooleanType(), True)
    .add('firstRiftHerald', BooleanType(), True)
    # objective counters
    .add('towerKills', IntegerType(), True)
    .add('inhibitorKills', IntegerType(), True)
    .add('baronKills', IntegerType(), True)
    .add('dragonKills', IntegerType(), True)
    .add('vilemawKills', IntegerType(), True)
    .add('riftHeraldKills', IntegerType(), True)
    .add('dominionVictoryScore', IntegerType(), True)
    # nullable is left at its default (True), as in StructField('bans', StringType())
    .add('bans', StringType())
    .add('gameId', DoubleType(), True)
)


# Schema for the item catalogue CSV; every column is nullable.
itens_schema = (
    StructType()
    .add('_c0', IntegerType(), True)
    .add('item_id', IntegerType(), True)
    .add('name', StringType(), True)
    .add('upper_item', StringType(), True)
    .add('explain', StringType(), True)
    .add('buy_price', IntegerType(), True)
    .add('sell_price', IntegerType(), True)
    .add('tag', StringType(), True)
)

In [519]:
# Read the match data with the explicit schema, then keep a seeded sample
# (no replacement, fraction 0.1) so the rest of the notebook runs on a subset.
match_data = (
    spark.read
    .option("header", "true")
    .schema(match_schema)
    .csv("../data/match_data_version1.csv")
    .sample(withReplacement=False, fraction=0.1, seed=0)
)

# Item catalogue, read with its explicit schema.
itens = (
    spark.read
    .option("header", "true")
    .schema(itens_schema)
    .csv("../data/riot_item.csv")
)

# Champion catalogue: no explicit schema, Spark infers the column types.
champions = (
    spark.read
    .option("header", "true")
    .option("inferSchema", True)
    .csv("../data/riot_champion.csv")
)

# Read the winner and loser match data.
# The winner and loser files use the same schema.
team_reader_options = {"header": "true"}
loser = (
    spark.read
    .options(**team_reader_options)
    .schema(loser_winner_schema)
    .csv("../data/match_loser_data_version1.csv")
)
winner = (
    spark.read
    .options(**team_reader_options)
    .schema(loser_winner_schema)
    .csv("../data/match_winner_data_version1.csv")
)

In [520]:
# Show the column types Spark inferred for the champions CSV (it was read with inferSchema=True).
champions.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- version: string (nullable = true)
 |-- id: string (nullable = true)
 |-- key: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- title: string (nullable = true)
 |-- blurb: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- partype: string (nullable = true)
 |-- info.attack: integer (nullable = true)
 |-- info.defense: integer (nullable = true)
 |-- info.magic: integer (nullable = true)
 |-- info.difficulty: integer (nullable = true)
 |-- image.full: string (nullable = true)
 |-- image.sprite: string (nullable = true)
 |-- image.group: string (nullable = true)
 |-- image.x: integer (nullable = true)
 |-- image.y: integer (nullable = true)
 |-- image.w: integer (nullable = true)
 |-- image.h: integer (nullable = true)
 |-- stats.hp: double (nullable = true)
 |-- stats.hpperlevel: integer (nullable = true)
 |-- stats.mp: double (nullable = true)
 |-- stats.mpperlevel: double (nullable = true)
 |-- stats.movespee

# Transform

Let's define some utility functions.

In [521]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pyspark.sql.functions as F

# Convenience function for turning JSON strings into DataFrames.
# https://docs.databricks.com/_static/notebooks/transform-complex-data-types-scala.html
def jsonToDataFrame(json_input, schema=None):
    """Parse a single JSON string into a Spark DataFrame.

    Parameters
    ----------
    json_input : str
        The JSON text to parse. It is wrapped in a one-element RDD and handed
        to Spark's JSON reader.
    schema : optional
        Schema passed to the reader. When omitted (or falsy) the reader
        infers the schema from the data.

    Returns
    -------
    pyspark.sql.DataFrame
        The DataFrame produced by the JSON reader.
    """
    # SparkSessions are available with Spark 2.0+
    reader = spark.read
    if schema:
        reader = reader.schema(schema)
    # Use the session's own SparkContext rather than relying on a global `sc`
    # having been defined by the environment.
    json_rdd = spark.sparkContext.parallelize([json_input])
    return reader.json(json_rdd)


# Convenience function to flatten DataFrames with structs.
# https://stackoverflow.com/questions/38753898/how-to-flatten-a-struct-in-a-spark-dataframe
def flatten_df(nested_df):
    """Return ``nested_df`` with every struct column expanded into top-level columns.

    Struct columns are walked iteratively (nested structs included). Each leaf
    field becomes one output column whose name is the struct path joined with
    ``"_"`` (e.g. ``a.b.c`` -> ``a_b_c``). Columns whose dtype does not start
    with ``"struct"`` (arrays of structs included) are kept as they are.

    Parameters
    ----------
    nested_df : pyspark.sql.DataFrame
        DataFrame that may contain struct columns.

    Returns
    -------
    pyspark.sql.DataFrame
        A projection of ``nested_df`` containing only non-struct columns.
    """
    def quoted_path(parts):
        # Backtick-quote every path segment so that a field name containing
        # '.' is treated as one identifier instead of a nested struct access.
        return ".".join("`%s`" % part for part in parts)

    # Each stack entry is (path of parent struct names, DataFrame projected at that level).
    stack = [((), nested_df)]
    columns = []

    while stack:
        parents, df = stack.pop()
        nested_cols = []

        for name, dtype in df.dtypes:
            if dtype.startswith("struct"):
                nested_cols.append(name)
            else:
                path = parents + (name,)
                columns.append(col(quoted_path(path)).alias("_".join(path)))

        # Project each struct's fields to inspect their dtypes on a later iteration.
        for nested_col in nested_cols:
            projected_df = df.select("`%s`.*" % nested_col)
            stack.append((parents + (nested_col,), projected_df))

    return nested_df.select(columns)

## Extract new schema for loser and winner data

In [522]:
def _bans_to_json(raw_bans):
    """Turn the raw `bans` string into a flat JSON object string.

    The raw value is the Python-literal text of a list of dicts, each with a
    "pickTurn" and a "championId" key. The result looks like
    {"championId<pickTurn>":<championId>,...}. A null input yields null.
    """
    if raw_bans is None:
        return None
    # NOTE(security): eval() executes arbitrary expressions. It is only
    # acceptable here because the CSV is a trusted local file; consider
    # ast.literal_eval if the data source ever becomes untrusted.
    bans = eval(raw_bans)
    pairs = ['"championId%s":%s' % (ban["pickTurn"], ban["championId"]) for ban in bans]
    return "{" + ",".join(pairs) + "}"


# Convert the dict string into a JSON string.
convert_json_string = udf(_bans_to_json, StringType())

# Both frames share the same schema and both have their `bans` column parsed
# as a struct further down, so both need the same conversion. Converting only
# `loser` leaves `winner.bans` as a list-of-dicts string that does not match
# the struct schema.
loser = loser.withColumn("bans", convert_json_string(loser.bans))
winner = winner.withColumn("bans", convert_json_string(winner.bans))

### Get schema from first row

In [523]:
# Take the `bans` value of the first loser row, load it as a one-document
# DataFrame and keep only the schema Spark infers for it.
first_bans_json = loser.select("bans").take(1)[0]["bans"]
bans_schema = jsonToDataFrame(first_bans_json).schema

In [524]:
def _eval_bans(text):
    """Evaluate the `bans` string and return the resulting Python object."""
    # NOTE(review): eval() runs arbitrary expressions; the input is assumed to be trusted.
    return eval(text)


# Apply the extracted schema to all rows: the UDF result is typed as `bans_schema`.
eval_column_bans = udf(_eval_bans, bans_schema)

loser = loser.withColumn('bans', eval_column_bans(col('bans')))
winner = winner.withColumn('bans', eval_column_bans(col('bans')))

In [525]:
# Expand the `bans` struct into top-level columns; flatten_df names each new
# column by joining the struct path with "_" (e.g. bans_championId1).
loser = flatten_df(loser)
winner = flatten_df(winner)

In [526]:
# Check the flattened schema of the loser frame.
loser.printSchema()


root
 |-- _c0: integer (nullable = true)
 |-- teamId: integer (nullable = true)
 |-- win: string (nullable = true)
 |-- firstBlood: boolean (nullable = true)
 |-- firstTower: boolean (nullable = true)
 |-- firstInhibitor: boolean (nullable = true)
 |-- firstBaron: boolean (nullable = true)
 |-- firstDragon: boolean (nullable = true)
 |-- firstRiftHerald: boolean (nullable = true)
 |-- towerKills: integer (nullable = true)
 |-- inhibitorKills: integer (nullable = true)
 |-- baronKills: integer (nullable = true)
 |-- dragonKills: integer (nullable = true)
 |-- vilemawKills: integer (nullable = true)
 |-- riftHeraldKills: integer (nullable = true)
 |-- dominionVictoryScore: integer (nullable = true)
 |-- gameId: double (nullable = true)
 |-- bans_championId1: long (nullable = true)
 |-- bans_championId2: long (nullable = true)
 |-- bans_championId3: long (nullable = true)
 |-- bans_championId4: long (nullable = true)
 |-- bans_championId5: long (nullable = true)



## Extract new schema for match data

In [527]:
# Column names containing '.' are renamed with '_' in place of the dot.
for dotted_name in ("status.message", "status.status_code"):
    match_data = match_data.withColumnRenamed(dotted_name, dotted_name.replace(".", "_"))

Get the first row of `participants`, convert it to JSON, load that JSON into a DataFrame, and extract its schema.

In [528]:
# Infer the per-participant struct schema from the first row only.
participants = match_data.select("participants")
str_participants = participants.take(1)[0]["participants"]

# eval() turns the stored string into Python objects and json.dumps() turns
# those into JSON text that Spark's JSON reader can parse.
# NOTE(review): eval() runs arbitrary expressions; the CSV is assumed to be trusted.
participants_json = json.dumps(eval(str_participants))
participants = jsonToDataFrame(participants_json)
schema = participants.schema

In [529]:
# Same procedure for `participantIdentities`: infer the struct schema from the first row only.
participants = match_data.select("participantIdentities")
str_participantIdentities = participants.take(1)[0]["participantIdentities"]

# NOTE(review): eval() runs arbitrary expressions; the CSV is assumed to be trusted.
participant_identities_json = json.dumps(eval(str_participantIdentities))
participants = jsonToDataFrame(participant_identities_json)
schema_participantIdentities = participants.schema

In [530]:
def _eval_participants(text):
    """Evaluate the `participants` string and return the resulting Python object."""
    # NOTE(review): eval() runs arbitrary expressions; the input is assumed to be trusted.
    return eval(text)


# The UDF result is typed as an array of the inferred participant struct.
eval_column_participants = udf(_eval_participants, ArrayType(schema))

match_data = match_data.withColumn(
    'participants',
    eval_column_participants(col('participants')),
)


In [531]:
def _eval_participant_identities(text):
    """Evaluate the `participantIdentities` string and return the resulting Python object."""
    # NOTE(review): eval() runs arbitrary expressions; the input is assumed to be trusted.
    return eval(text)


# The UDF result is typed as an array of the inferred identity struct.
eval_column_schema_participantIdentities = udf(
    _eval_participant_identities,
    ArrayType(schema_participantIdentities),
)

match_data = match_data.withColumn(
    'participantIdentities',
    eval_column_schema_participantIdentities(col('participantIdentities')),
)


In [532]:
# New schema: `participants` and `participantIdentities` are now arrays of structs.
match_data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- gameCreation: double (nullable = true)
 |-- gameDuration: double (nullable = true)
 |-- gameId: double (nullable = true)
 |-- gameMode: string (nullable = true)
 |-- gameType: string (nullable = true)
 |-- gameVersion: string (nullable = true)
 |-- mapId: double (nullable = true)
 |-- participantIdentities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- participantId: long (nullable = true)
 |    |    |-- player: struct (nullable = true)
 |    |    |    |-- accountId: string (nullable = true)
 |    |    |    |-- currentAccountId: string (nullable = true)
 |    |    |    |-- currentPlatformId: string (nullable = true)
 |    |    |    |-- matchHistoryUri: string (nullable = true)
 |    |    |    |-- platformId: string (nullable = true)
 |    |    |    |-- profileIcon: long (nullable = true)
 |    |    |    |-- summonerId: string (nullable = true)
 |    |    |    |-- summonerName: string (nullable = true

In [533]:
# One zipped pair: "ids" holds an element of `participants`,
# "info" holds the element of `participantIdentities` at the same position.
participant_pair_type = StructType([
    StructField("ids", schema),
    StructField("info", schema_participantIdentities),
])


def _zip_participants(participant_rows, identity_rows):
    """Pair the two arrays element by element, in order."""
    return list(zip(participant_rows, identity_rows))


combine = udf(_zip_participants, ArrayType(participant_pair_type))
match_data = match_data.withColumn(
    "participants_info",
    combine("participants", "participantIdentities"),
)


In [534]:
# The two source arrays are now redundant with `participants_info`; drop them,
# then explode so that each array element gets a row of its own.
columns_to_drop = ['participants', 'participantIdentities']
match_data = (
    match_data
    .drop(*columns_to_drop)
    .withColumn("participants_info", explode("participants_info"))
)


In [535]:
# Schema after the explode: `participants_info` is a single struct per row.
match_data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- gameCreation: double (nullable = true)
 |-- gameDuration: double (nullable = true)
 |-- gameId: double (nullable = true)
 |-- gameMode: string (nullable = true)
 |-- gameType: string (nullable = true)
 |-- gameVersion: string (nullable = true)
 |-- mapId: double (nullable = true)
 |-- platformId: string (nullable = true)
 |-- queueId: double (nullable = true)
 |-- seasonId: double (nullable = true)
 |-- status_message: string (nullable = true)
 |-- status_status_code: string (nullable = true)
 |-- participants_info: struct (nullable = true)
 |    |-- ids: struct (nullable = true)
 |    |    |-- championId: long (nullable = true)
 |    |    |-- participantId: long (nullable = true)
 |    |    |-- spell1Id: long (nullable = true)
 |    |    |-- spell2Id: long (nullable = true)
 |    |    |-- stats: struct (nullable = true)
 |    |    |    |-- assists: long (nullable = true)
 |    |    |    |-- champLevel: long (nullable = true)
 |    |    |  

In [536]:
# Flatten structs: every field nested under `participants_info` becomes a
# top-level column named after its struct path joined with "_".
match_data=flatten_df(match_data)
match_data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- gameCreation: double (nullable = true)
 |-- gameDuration: double (nullable = true)
 |-- gameId: double (nullable = true)
 |-- gameMode: string (nullable = true)
 |-- gameType: string (nullable = true)
 |-- gameVersion: string (nullable = true)
 |-- mapId: double (nullable = true)
 |-- platformId: string (nullable = true)
 |-- queueId: double (nullable = true)
 |-- seasonId: double (nullable = true)
 |-- status_message: string (nullable = true)
 |-- status_status_code: string (nullable = true)
 |-- participants_info_info_participantId: long (nullable = true)
 |-- participants_info_info_player_accountId: string (nullable = true)
 |-- participants_info_info_player_currentAccountId: string (nullable = true)
 |-- participants_info_info_player_currentPlatformId: string (nullable = true)
 |-- participants_info_info_player_matchHistoryUri: string (nullable = true)
 |-- participants_info_info_player_platformId: string (nullable = true)
 |-- particip

In [537]:
# Driver-side lookup table: item_id -> item name.
itens_dict = {
    row["item_id"]: row["name"]
    for row in itens.select("item_id", "name").distinct().collect()
}


In [538]:
# Driver-side lookup table: champion key -> champion name.
champions_dict = {
    row["key"]: row["name"]
    for row in champions.select("key", "name").distinct().collect()
}


In [539]:
def transform(x, lookup=None):
    """Map an item id to its item name.

    Parameters
    ----------
    x
        Item id; anything accepted by ``int()``.
    lookup : dict, optional
        Mapping from integer item id to name. Defaults to the notebook-level
        ``itens_dict``.

    Returns
    -------
    str
        The item name, or ``"Name Not Found"`` when ``x`` is null, is not
        convertible to an integer, or is missing from the mapping.
    """
    table = itens_dict if lookup is None else lookup
    try:
        return table[int(x)]
    # Only data problems map to the placeholder. A bare `except:` would also
    # hide programming errors (e.g. a missing lookup table) and interrupts.
    except (KeyError, TypeError, ValueError, OverflowError):
        return "Name Not Found"


# Spark UDF wrapping `transform`; returns the item name as a string.
new_cols_itens = udf(transform, StringType())

In [540]:
# Add one `name_item<N>` column for each of the seven item columns (item0..item6).
for item_slot in range(7):
    match_data = match_data.withColumn(
        "name_item%d" % item_slot,
        new_cols_itens(col("participants_info_ids_stats_item%d" % item_slot)),
    )

In [541]:
def transform_champions(x, lookup=None):
    """Map a champion id to its champion name.

    Parameters
    ----------
    x
        Champion id; anything accepted by ``int()``.
    lookup : dict, optional
        Mapping from integer champion key to name. Defaults to the
        notebook-level ``champions_dict``.

    Returns
    -------
    str
        The champion name, or ``"Name Not Found"`` when ``x`` is null, is not
        convertible to an integer, or is missing from the mapping.
    """
    table = champions_dict if lookup is None else lookup
    try:
        return table[int(x)]
    # Only data problems map to the placeholder. A bare `except:` would also
    # hide programming errors (e.g. a missing lookup table) and interrupts.
    except (KeyError, TypeError, ValueError, OverflowError):
        return "Name Not Found"


# Spark UDF wrapping `transform_champions`; returns the champion name as a string.
new_cols_champions = udf(transform_champions, StringType())

match_data = match_data.withColumn(
    "name_champion",
    new_cols_champions(col("participants_info_ids_championId")),
)

In [543]:
# Peek at the first four resolved champion names.
match_data.select("name_champion").head(4)

[Row(name_champion='LeBlanc'),
 Row(name_champion='Olaf'),
 Row(name_champion='Kalista'),
 Row(name_champion='Shen')]

# Merge DataFrames

In [548]:
# Attach the team-level rows to every participant row of the same game.
# NOTE(review): the join key is `gameId` only, so each participant row is
# paired with the team row of that game regardless of the participant's own
# team. Confirm this is intended.
# NOTE(review): both sides carry a `_c0` column, so the result contains two
# columns named `_c0`.
winner = match_data.join(winner, on='gameId', how='inner')
loser = match_data.join(loser, on='gameId', how='inner')

In [549]:
# Inspect the schema of the joined winner frame.
winner.printSchema()

root
 |-- gameId: double (nullable = true)
 |-- _c0: integer (nullable = true)
 |-- gameCreation: double (nullable = true)
 |-- gameDuration: double (nullable = true)
 |-- gameMode: string (nullable = true)
 |-- gameType: string (nullable = true)
 |-- gameVersion: string (nullable = true)
 |-- mapId: double (nullable = true)
 |-- platformId: string (nullable = true)
 |-- queueId: double (nullable = true)
 |-- seasonId: double (nullable = true)
 |-- status_message: string (nullable = true)
 |-- status_status_code: string (nullable = true)
 |-- participants_info_info_participantId: long (nullable = true)
 |-- participants_info_info_player_accountId: string (nullable = true)
 |-- participants_info_info_player_currentAccountId: string (nullable = true)
 |-- participants_info_info_player_currentPlatformId: string (nullable = true)
 |-- participants_info_info_player_matchHistoryUri: string (nullable = true)
 |-- participants_info_info_player_platformId: string (nullable = true)
 |-- particip

In [551]:
# Register each DataFrame as a SQL temporary view under its own name.
for view_name, frame in (
    ("match_data", match_data),
    ("winner", winner),
    ("loser", loser),
):
    frame.createOrReplaceTempView(view_name)

In [57]:
# Per (account, champion) pair: number of distinct games won, number of
# distinct games played, and the ratio of the two as `win_rate`.
#   - `victorys` counts distinct gameIds where the participant's `win` flag is true.
#   - `matches` counts all distinct gameIds for the same grouping.
# `victorys` is the left side of the LEFT JOIN, so a pair with no won game
# never appears in the result.
# A trailing backslash inside the triple-quoted string is a Python line
# continuation: it joins that line with the next one in the SQL text.
# The result is registered as the temp view "players" for use in later queries.
players = sqlContext.sql("""
                            SELECT victorys.id as user, victorys.championId as champion_id, victorys.won_matches, matches.total_matches, victorys.won_matches/matches.total_matches as win_rate \
                            FROM \
                                (SELECT match_data.participants_info_info_player_accountId as id, match_data.participants_info_ids_championId as championID, COUNT(DISTINCT(match_data.gameId)) as won_matches \
                                FROM match_data \
                                WHERE match_data.participants_info_ids_stats_win == true \
                                GROUP BY match_data.participants_info_info_player_accountId, match_data.participants_info_ids_championId) as victorys \
                            LEFT JOIN (SELECT match_data.participants_info_info_player_accountId as id, match_data.participants_info_ids_championId as championID, COUNT(DISTINCT(match_data.gameId)) as total_matches \
                                       FROM match_data \
                                       GROUP BY match_data.participants_info_info_player_accountId, match_data.participants_info_ids_championId) as matches \
                            ON victorys.id=matches.id AND victorys.championID = matches.championID
                            ORDER BY matches.total_matches DESC
                        """) 
players.createOrReplaceTempView("players")
players.show()

+--------------------+-----------+-----------+-------------+-------------------+
|                user|champion_id|won_matches|total_matches|           win_rate|
+--------------------+-----------+-----------+-------------+-------------------+
|Kre-yBBl_dth1hTu8...|        245|         21|           43| 0.4883720930232558|
|qTg_APziIefWzPLf9...|         12|         24|           41| 0.5853658536585366|
|HO0Zipwm_4RS7iUAX...|        104|         23|           40|              0.575|
|k1clnQm6WYzfh-3jD...|        154|         18|           37| 0.4864864864864865|
|Dli4ZNwRdA8FOkKO-...|         34|         12|           35|0.34285714285714286|
|-K5Dcqaa_SYxpKQUG...|        555|         14|           35|                0.4|
|5VLbcHCjOrxiV5EVj...|        245|         20|           35| 0.5714285714285714|
|t_cBLaPpqGjJHZn-T...|        119|         16|           35|0.45714285714285713|
|QDzzlcKqSQPsU1j4F...|         69|         20|           31| 0.6451612903225806|
|eUEOhQpye02aeTnO-...|      

In [556]:
# Separator placed between the seven item names that make up a build name.
connector = "-"

# Count how often each (champion, build) combination occurs.
#   - A build name is the seven item names (item0..item6) joined with `connector`.
#   - Only rows where all seven item slots are non-null and non-zero are kept.
#   - Only accounts that appear in `players` with win_rate > 0.5 and more than
#     5 total matches (for at least one champion) are kept.
# The count is aliased `total_matches`, so the ORDER BY must use that alias;
# ordering by `total` refers to a column that does not exist in this query.
build = sqlContext.sql("""
                          SELECT build.championName, build.build_name, COUNT(build.build_name) as total_matches \
                          FROM \
                              (SELECT match_data.name_champion as championName, CONCAT(match_data.name_item0, "%s",
                                             match_data.name_item1, "%s",
                                             match_data.name_item2, "%s",
                                             match_data.name_item3, "%s",
                                             match_data.name_item4, "%s",
                                             match_data.name_item5, "%s",
                                             match_data.name_item6) as build_name \
                              FROM match_data \
                              WHERE match_data.participants_info_ids_stats_item0 IS NOT NULL \
                              AND match_data.participants_info_ids_stats_item1 IS NOT NULL
                              AND match_data.participants_info_ids_stats_item2 IS NOT NULL
                              AND match_data.participants_info_ids_stats_item3 IS NOT NULL
                              AND match_data.participants_info_ids_stats_item4 IS NOT NULL
                              AND match_data.participants_info_ids_stats_item5 IS NOT NULL
                              AND match_data.participants_info_ids_stats_item6 IS NOT NULL
                              AND match_data.participants_info_ids_stats_item0 != 0
                              AND match_data.participants_info_ids_stats_item1 != 0
                              AND match_data.participants_info_ids_stats_item2 != 0
                              AND match_data.participants_info_ids_stats_item3 != 0
                              AND match_data.participants_info_ids_stats_item4 != 0
                              AND match_data.participants_info_ids_stats_item5 != 0
                              AND match_data.participants_info_ids_stats_item6 != 0
                              AND match_data.participants_info_info_player_accountId
                              IN ( \
                                   SELECT players.user
                                   FROM players \
                                   WHERE players.win_rate > 0.5 AND players.total_matches > 5)) as build \
                         GROUP BY build.championName, build.build_name \
                         ORDER BY total_matches DESC
                    """ % tuple([connector]*6))
build.show()

+--------------------------+--------------------+-----+
|first(championName, false)|          build_name|total|
+--------------------------+--------------------+-----+
|                     Janna|Boots of Mobility...|    3|
|                     Janna|Boots of Swiftnes...|    3|
|                     Jayce|Corrupting Potion...|    2|
|                  Pantheon|Ninja Tabi-Youmuu...|    2|
|                      Pyke|Runesteel Spaulde...|    2|
|                    Illaoi|Black Cleaver-Cor...|    2|
|                  Pantheon|Knight's Vow-Brok...|    2|
|                     Elise|Enchantment: Runi...|    2|
|                       Jax|Trinity Force-Bil...|    2|
|                      Fizz|Corrupting Potion...|    2|
|                     Talon|Tiamat-Youmuu's G...|    2|
|                Blitzcrank|Pauldrons of Whit...|    2|
|                      Fizz|Sheen-Zhonya's Ho...|    2|
|                      Ahri|Hextech GLP-800-B...|    2|
|                      Pyke|Youmuu's Ghostbla...