## Limpeza de dados Cartola ano 2014

In [176]:
from pyspark.sql import HiveContext
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.sql.functions import lower, col, lit, regexp_replace, trim, substring, when, expr, udf, count, sum, monotonically_increasing_id
import pandas as pd
import json
import requests

## Funções Custonmizadas

In [94]:
def remove_after_hiphen(team_name):
    if team_name.startswith('atletico') or team_name.startswith('atl')  or team_name.startswith('Atl') or team_name.startswith('Ath'):
        return team_name
    else:
        return team_name.split('-', 1)[0]

In [95]:
#Remove dos valores das Strings o que estiver após os hiphen
remove_hiphen_udf = udf(remove_after_hiphen, StringType())
#partidas_2014_ct = partidas_2014_df.withColumn('away_team', remove_hiphen_udf(partidas_2014_df['away_team']))

## Jogadores

In [120]:
#Carrega arquivo CSV
jogadores_2014_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2014/2014_jogadores.csv", header=True)
sorted_jogadores_2014_df = jogadores_2014_df.sort(jogadores_2014_df.ID.asc())


In [122]:
#Adiciona coluna ANO = 2014
jogadores_2014_ano = sorted_jogadores_2014_df.withColumn('year', lit(2014))
jogadores_2014_ano.toPandas()

Unnamed: 0,ID,Apelido,ClubeID,PosicaoID,year
0,36443,Dida,285,1,2014
1,36522,Alex,294,4,2014
2,36540,Juan,285,3,2014
3,36612,Zé Roberto,284,4,2014
4,36728,Índio,285,3,2014
...,...,...,...,...,...
1028,89921,Maikon,263,5,2014
1029,89924,Bruno,283,4,2014
1030,89932,JacÃ³,265,5,2014
1031,89975,Rodolfo,282,1,2014


In [123]:
#Alterar ID para int.(está string)
jogadores_2014_ano = jogadores_2014_ano.withColumn("ID", jogadores_2014_ano["ID"].cast(IntegerType()))
jogadores_2014_ano = jogadores_2014_ano.withColumn("ClubeID", jogadores_2014_ano["ClubeID"].cast(IntegerType()))
jogadores_2014_ano = jogadores_2014_ano.withColumn("PosicaoID", jogadores_2014_ano["PosicaoID"].cast(IntegerType()))

In [124]:
jogadores_2014_ano.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/jogadores/')

In [125]:
#transforma arquivo em parquet
jogadores_2014_parquet = spark.read.option('basePath', '/cartola/clean/jogadores/').parquet('/cartola/clean/jogadores/*')

In [126]:
jogadores_2014_parquet.show()
jogadores_2014_parquet.schema


+-----+--------------------+-------+---------+----+
|   ID|             Apelido|ClubeID|PosicaoID|year|
+-----+--------------------+-------+---------+----+
|83266|   Marcos Guilherme |    293|        4|2014|
|83439|              Maykon|    294|        5|2014|
|83471|            Paulinho|    264|        5|2014|
|83506|       Vítor Michels|    288|        4|2014|
|83511|        Érico Júnior|    292|        5|2014|
|83522|       Fagner Alemão|    315|        3|2014|
|51683|        Bruno Rangel|    315|        5|2014|
|51705|       Bruno Rodrigo|    283|        3|2014|
|51772|     Éverton Ribeiro|    283|        4|2014|
|51779|       Pedro Botelho|    282|        2|2014|
|51781|               Ávine|    265|        2|2014|
|51985|      Anderson Pedra|    292|        4|2014|
|40904|         Julio Cesar|    294|        5|2014|
|40990|      Dorival Júnior|    275|        6|2014|
|41126|        Rodrigo Gral|    315|        5|2014|
|41218|              Magrão|    292|        1|2014|
|41327|Vande

StructType(List(StructField(ID,IntegerType,true),StructField(Apelido,StringType,true),StructField(ClubeID,IntegerType,true),StructField(PosicaoID,IntegerType,true),StructField(year,IntegerType,true)))

In [127]:
jogadores_2014_parquet.schema

StructType(List(StructField(ID,IntegerType,true),StructField(Apelido,StringType,true),StructField(ClubeID,IntegerType,true),StructField(PosicaoID,IntegerType,true),StructField(year,IntegerType,true)))

In [128]:
jogadores_2014_parquet.show()

print(jogadores_2014_parquet.count())

+-----+--------------------+-------+---------+----+
|   ID|             Apelido|ClubeID|PosicaoID|year|
+-----+--------------------+-------+---------+----+
|83266|   Marcos Guilherme |    293|        4|2014|
|83439|              Maykon|    294|        5|2014|
|83471|            Paulinho|    264|        5|2014|
|83506|       Vítor Michels|    288|        4|2014|
|83511|        Érico Júnior|    292|        5|2014|
|83522|       Fagner Alemão|    315|        3|2014|
|51683|        Bruno Rangel|    315|        5|2014|
|51705|       Bruno Rodrigo|    283|        3|2014|
|51772|     Éverton Ribeiro|    283|        4|2014|
|51779|       Pedro Botelho|    282|        2|2014|
|51781|               Ávine|    265|        2|2014|
|51985|      Anderson Pedra|    292|        4|2014|
|40904|         Julio Cesar|    294|        5|2014|
|40990|      Dorival Júnior|    275|        6|2014|
|41126|        Rodrigo Gral|    315|        5|2014|
|41218|              Magrão|    292|        1|2014|
|41327|Vande

## Partidas_2014

In [141]:
#Carrega arquivo CSV
partidas_2014_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2014/2014_partidas.csv", header=True)


In [142]:
partidas_2014_df.show(5)

+---+----+-----+------------------+---------------+-----+----------------+--------------------+----+
|_c0|game|round|              date|      home_team|score|       away_team|               arena|   X|
+---+----+-----+------------------+---------------+-----+----------------+--------------------+----+
|  1|   1|    1|20/04/2014 - 18:30|  Flamengo - RJ|0 x 0|      Goiás - GO|Mané Garrincha - ...|null|
|  2|   2|    1|19/04/2014 - 18:30|Fluminense - RJ|3 x 0|Figueirense - SC|Maracanã - Rio de...|null|
|  3|   3|    1|20/04/2014 - 16:00| São Paulo - SP|3 x 0|   Botafogo - RJ|Morumbi - Sao Pau...|null|
|  4|   4|    1|20/04/2014 - 18:30|    Santos - SP|1 x 1|      Sport - PE|Vila Belmiro - Sa...|null|
|  5|   5|    1|20/04/2014 - 16:00|  Atletico - PR|1 x 0|     Grêmio - RS|Orlando Scarpelli...|null|
+---+----+-----+------------------+---------------+-----+----------------+--------------------+----+
only showing top 5 rows



In [143]:
# Criar nome do time com a string antes do Hífen
partidas_2014_ct = partidas_2014_df.withColumn('away_team', remove_hiphen_udf(partidas_2014_df['away_team']))

partidas_2014_ct = partidas_2014_ct.withColumn('away_team', lower(col('away_team')))

partidas_2014_ct = partidas_2014_ct.withColumn('away_team', trim(col('away_team')))

partidas_2014_ct = partidas_2014_ct.withColumn('home_team', remove_hiphen_udf(partidas_2014_ct['home_team']))

partidas_2014_ct = partidas_2014_ct.withColumn('home_team', lower(col('home_team')))

partidas_2014_ct = partidas_2014_ct.withColumn('home_team', trim(col('home_team')))

partidas_2014_ct = partidas_2014_ct.withColumn('home_score', substring('score', 1,1))\

partidas_2014_ct = partidas_2014_ct.withColumn('away_score', substring('score', 5, 5))\

partidas_2014_ct = partidas_2014_ct.withColumn('home_score', partidas_2014_ct['home_score'].cast(IntegerType()))

partidas_2014_ct = partidas_2014_ct.withColumn('away_score', partidas_2014_ct['away_score'].cast(IntegerType()))

partidas_2014_ct = partidas_2014_ct.withColumn('total_gols', partidas_2014_ct['away_score'] + partidas_2014_ct['home_score'] )

partidas_2014_ct = partidas_2014_ct.withColumn('year', lit(2014))

time_ganhador = expr(
    """IF(home_score > away_score, home_team, IF(home_score = away_score, 'empate', away_team))"""
)

partidas_2014_ct = partidas_2014_ct.withColumn('result', time_ganhador)

partidas_2014_ct.toPandas()

Unnamed: 0,_c0,game,round,date,home_team,score,away_team,arena,X,home_score,away_score,total_gols,year,result
0,1,1,1,20/04/2014 - 18:30,flamengo,0 x 0,goiás,Mané Garrincha - Brasilia - DF,,0,0,0,2014,empate
1,2,2,1,19/04/2014 - 18:30,fluminense,3 x 0,figueirense,Maracanã - Rio de Janeiro - RJ,,3,0,3,2014,fluminense
2,3,3,1,20/04/2014 - 16:00,são paulo,3 x 0,botafogo,Morumbi - Sao Paulo - SP,,3,0,3,2014,são paulo
3,4,4,1,20/04/2014 - 18:30,santos,1 x 1,sport,Vila Belmiro - Santos - SP,,1,1,2,2014,empate
4,5,5,1,20/04/2014 - 16:00,atletico - pr,1 x 0,grêmio,Orlando Scarpelli - Florianopolis - SC,,1,0,1,2014,atletico - pr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,376,376,38,07/12/2014 - 17:00,cruzeiro,2 x 1,fluminense,Mineirão - Belo Horizonte - MG,,2,1,3,2014,cruzeiro
376,377,377,38,07/12/2014 - 17:00,vitória,0 x 1,santos,Manoel Barradas - Salvador - BA,,0,1,1,2014,santos
377,378,378,38,07/12/2014 - 17:00,grêmio,1 x 1,flamengo,Arena do Grêmio - Porto Alegre - RS,,1,1,2,2014,empate
378,379,379,38,06/12/2014 - 16:30,figueirense,1 x 2,internacional,Orlando Scarpelli - Florianopolis - SC,,1,2,3,2014,internacional


In [144]:
# Removendo colunas
partidas_2014_ct = partidas_2014_ct.drop('_c0', 'X', 'game_id')
# Criando ID para DF
# Partidas_2014_ct = partidas_2014_ct.withColumn('game_id', monotonically_increasing_id())
partidas_2014_ct.toPandas()

Unnamed: 0,game,round,date,home_team,score,away_team,arena,home_score,away_score,total_gols,year,result
0,1,1,20/04/2014 - 18:30,flamengo,0 x 0,goiás,Mané Garrincha - Brasilia - DF,0,0,0,2014,empate
1,2,1,19/04/2014 - 18:30,fluminense,3 x 0,figueirense,Maracanã - Rio de Janeiro - RJ,3,0,3,2014,fluminense
2,3,1,20/04/2014 - 16:00,são paulo,3 x 0,botafogo,Morumbi - Sao Paulo - SP,3,0,3,2014,são paulo
3,4,1,20/04/2014 - 18:30,santos,1 x 1,sport,Vila Belmiro - Santos - SP,1,1,2,2014,empate
4,5,1,20/04/2014 - 16:00,atletico - pr,1 x 0,grêmio,Orlando Scarpelli - Florianopolis - SC,1,0,1,2014,atletico - pr
...,...,...,...,...,...,...,...,...,...,...,...,...
375,376,38,07/12/2014 - 17:00,cruzeiro,2 x 1,fluminense,Mineirão - Belo Horizonte - MG,2,1,3,2014,cruzeiro
376,377,38,07/12/2014 - 17:00,vitória,0 x 1,santos,Manoel Barradas - Salvador - BA,0,1,1,2014,santos
377,378,38,07/12/2014 - 17:00,grêmio,1 x 1,flamengo,Arena do Grêmio - Porto Alegre - RS,1,1,2,2014,empate
378,379,38,06/12/2014 - 16:30,figueirense,1 x 2,internacional,Orlando Scarpelli - Florianopolis - SC,1,2,3,2014,internacional


In [145]:
partidas_2014_ct.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/partidas/')

In [146]:
#transforma arquivo em parquet
partidas_2014_ct = spark.read.option('basePath', '/cartola/clean/partidas/').parquet('/cartola/clean/partidas/*')

In [148]:
partidas_2014_ct.toPandas()


Unnamed: 0,game,round,date,home_team,score,away_team,arena,home_score,away_score,total_gols,result,year
0,1,1,20/04/2014 - 18:30,flamengo,0 x 0,goiás,Mané Garrincha - Brasilia - DF,0,0,0,empate,2014
1,2,1,19/04/2014 - 18:30,fluminense,3 x 0,figueirense,Maracanã - Rio de Janeiro - RJ,3,0,3,fluminense,2014
2,3,1,20/04/2014 - 16:00,são paulo,3 x 0,botafogo,Morumbi - Sao Paulo - SP,3,0,3,são paulo,2014
3,4,1,20/04/2014 - 18:30,santos,1 x 1,sport,Vila Belmiro - Santos - SP,1,1,2,empate,2014
4,5,1,20/04/2014 - 16:00,atletico - pr,1 x 0,grêmio,Orlando Scarpelli - Florianopolis - SC,1,0,1,atletico - pr,2014
...,...,...,...,...,...,...,...,...,...,...,...,...
375,376,38,07/12/2014 - 17:00,cruzeiro,2 x 1,fluminense,Mineirão - Belo Horizonte - MG,2,1,3,cruzeiro,2014
376,377,38,07/12/2014 - 17:00,vitória,0 x 1,santos,Manoel Barradas - Salvador - BA,0,1,1,santos,2014
377,378,38,07/12/2014 - 17:00,grêmio,1 x 1,flamengo,Arena do Grêmio - Porto Alegre - RS,1,1,2,empate,2014
378,379,38,06/12/2014 - 16:30,figueirense,1 x 2,internacional,Orlando Scarpelli - Florianopolis - SC,1,2,3,internacional,2014


## Times

In [153]:
times_2014_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2014/2014_times.csv", header=True)
sorted_times_2014_df = times_2014_df.sort(times_2014_df.ID.asc())

#Chegar se o arquivo está no hdfs

times_2014_ano = times_2014_df.withColumn('year', lit(2014))
times_2014_ano.toPandas()

Unnamed: 0,ID,Nome,Abreviacao,Slug,year
0,262,flamengo,FLA,flamengo,2014
1,263,botafogo,BOT,botafogo,2014
2,264,corinthians,COR,corinthians,2014
3,265,bahia,BAH,bahia,2014
4,266,fluminense,FLU,fluminense,2014
5,275,palmeiras,PAL,palmeiras,2014
6,276,são paulo,SAO,sao-paulo,2014
7,277,santos,SAN,santos,2014
8,282,atlético-mg,CAM,atletico-mg,2014
9,283,cruzeiro,CRU,cruzeiro,2014


In [154]:
times_2014_ano.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/times/')
#transforma arquivo em parquet
times_2014_parquet = spark.read.option('basePath', '/cartola/clean/times/').parquet('/cartola/clean/times/*')

In [155]:
times_2014_parquet.toPandas()
times_2014_parquet.schema

#Alterar ID para int.(está string)

StructType(List(StructField(ID,StringType,true),StructField(Nome,StringType,true),StructField(Abreviacao,StringType,true),StructField(Slug,StringType,true),StructField(year,IntegerType,true)))

In [156]:
times_2014_parquet.schema

StructType(List(StructField(ID,StringType,true),StructField(Nome,StringType,true),StructField(Abreviacao,StringType,true),StructField(Slug,StringType,true),StructField(year,IntegerType,true)))

In [157]:
times_2014_parquet.toPandas()

Unnamed: 0,ID,Nome,Abreviacao,Slug,year
0,262,flamengo,FLA,flamengo,2014
1,263,botafogo,BOT,botafogo,2014
2,264,corinthians,COR,corinthians,2014
3,265,bahia,BAH,bahia,2014
4,266,fluminense,FLU,fluminense,2014
5,275,palmeiras,PAL,palmeiras,2014
6,276,são paulo,SAO,sao-paulo,2014
7,277,santos,SAN,santos,2014
8,282,atlético-mg,CAM,atletico-mg,2014
9,283,cruzeiro,CRU,cruzeiro,2014


## scouts_raw

In [190]:
# Analisando o arquivo 2014_lances.csv vimos que não é necessário processar esse arquivo, pois as informações relevantes estão em scouts_raw.

In [191]:
scouts_raw_2014_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2014/2014_scouts_raw.csv", header=True)

In [192]:
scouts_raw_2014_ano = scouts_raw_2014_df.withColumn('year', lit(2014))

In [193]:
scouts_raw_2014_ano = scouts_raw_2014_ano.withColumn("Pontos", scouts_raw_2014_ano["Pontos"].cast(FloatType()))

In [194]:
scouts_raw_2014_ano.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/scouts/')
#transforma arquivo em parquet
scouts_2014_parquet = spark.read.option('basePath', '/cartola/clean/scouts/').parquet('/cartola/clean/scouts/*')

In [195]:
scouts_raw_2014_ano.toPandas()

Unnamed: 0,Atleta,Rodada,Clube,Participou,Posicao,Jogos,Pontos,PontosMedia,Preco,PrecoVariacao,...,RB,FC,GC,CA,CV,SG,DD,DP,GS,year
0,36443,0,,0,,0,0.0,0,9,0,...,0,0,0,0,0,0,0,0,0,2014
1,36443,1,285,1,1,1,5.0,5,10.6,1.6,...,0,0,0,0,0,1,0,0,0,2014
2,36443,2,285,1,1,2,-3.0,1,8.27,-2.33,...,0,0,0,0,0,0,0,0,2,2014
3,36443,3,285,1,1,3,-2.6,-0.2,6.81,-1.46,...,0,0,0,0,0,0,0,0,1,2014
4,36443,4,285,1,1,4,4.0,0.85,7.96,1.15,...,0,0,0,0,0,0,2,0,1,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31562,89924,38,,0,,1,0.0,-0.8,0.81,0,...,0,1,0,0,0,0,0,0,0,2014
31563,84794,38,293,1,4,1,0.9,0.9,1.24,0.24,...,1,1,0,0,0,0,0,0,0,2014
31564,89815,38,285,1,4,2,1.0,0.25,1,0.19,...,0,0,0,0,0,0,0,0,0,2014
31565,82639,38,284,0,4,0,0.0,0,1,0,...,0,0,0,0,0,0,0,0,0,2014


## DEMONSTRAÇÃO

In [None]:
partidas_2014_df = spark.read.csv("/cartola/data/2014/2014_jogadores.csv", header=True)
partidas_2014_ct = partidas_2014_df.withColumn('time', regexp_replace('home_team', ' - RJ', ''))
final_partidas = partidas_2014_ct.withColumn('time_low', lower(col('time'))).show(truncate=False)

In [None]:
with_ano_partidas = partidas_2014_ct.withColumn('ano', lit(2014)).show(truncate=False)
with_ano_partidas.show()

In [None]:
Comando para sobrescrever arquivo caso já existente.
Agrupar scouts por ID e contar (caso tenha duplicidade)

In [85]:
inner_join = partidas_ids_2014_ano.join(times_2014_ano, partidas_ids_2014_ano.Casa == times_2014_ano.ID)
inner_join.toPandas()

Unnamed: 0,ID,Rodada,Casa,Visitante,PlacarCasa,PlacarVisitante,Resultado,ano,ID.1,Nome,Abreviacao,Slug,ano.1
0,179872,1,262,290,0,0,Empate,2014,262,flamengo,FLA,flamengo,2014
1,179873,1,266,316,3,0,Casa,2014,266,fluminense,FLU,fluminense,2014
2,179874,1,276,263,3,0,Casa,2014,276,são paulo,SAO,sao-paulo,2014
3,179875,1,277,292,1,1,Empate,2014,277,santos,SAN,santos,2014
4,179876,1,293,284,1,0,Casa,2014,293,atlético-pr,CAP,atletico-pr,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,180250,29,282,315,1,0,Casa,2014,282,atlético-mg,CAM,atletico-mg,2014
376,180251,29,287,283,0,1,Visitante,2014,287,vitória,VIT,vitoria,2014
377,180252,29,285,264,1,2,Visitante,2014,285,internacional,INT,internacional,2014
378,180253,29,316,294,4,0,Casa,2014,316,figueirense,FIG,figueirense,2014


In [None]:
scouts_atletas = pontos_por_atleta.join(jogadores_2014_parquet, pontos_por_atleta.Atleta == jogadores_2014_parquet.ID)
scouts_atletas = scouts_atletas.sort(scouts_atletas.SomaPontos.desc())
scouts_atletas.toPandas()

In [None]:
# -        Quantas partidas resultaram em empate?
pontos_por_atleta = scouts_raw_2014_ano.groupBy("Atleta").agg(sum("Pontos").alias("SomaPontos"))

#count_result = count_result.withColumn('total_wins', count_result['count(1)'])

pontos_por_atleta = pontos_por_atleta.sort(pontos_por_atleta.SomaPontos.desc())

#count_result = count_result.drop('count(1)')

#count_result.show(5)


pontos_por_atleta.toPandas()