## Limpeza de dados Cartola ano 2015

In [1]:
from pyspark.sql import HiveContext
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.sql.functions import lower, col, lit, regexp_replace, trim, substring, when, expr, udf, count, sum, monotonically_increasing_id
import pandas as pd
import json
import requests

# Confirguração para não sobrescrever DF
spark.conf.set('spark.sql.sources.partitionOverwriteMode', 'dynamic')

## Funções Custonmizadas

In [2]:
def remove_after_hiphen(team_name):
    if team_name.startswith('atletico') or team_name.startswith('atl')  or team_name.startswith('Atl') or team_name.startswith('Ath'):
        return team_name
    else:
        return team_name.split('-', 1)[0]

In [3]:
#Remove dos valores das Strings o que estiver após os hiphen
remove_hiphen_udf = udf(remove_after_hiphen, StringType())
#partidas_2014_ct = partidas_2014_df.withColumn('away_team', remove_hiphen_udf(partidas_2014_df['away_team']))

## Jogadores

In [28]:
#Carrega arquivo CSV
jogadores_2015_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2015/2015_jogadores.csv", header=True)
sorted_jogadores_2015_df = jogadores_2015_df.sort(jogadores_2015_df.ID.asc())


In [29]:
#Adiciona coluna ANO = 2014
jogadores_2015_ano = sorted_jogadores_2015_df.withColumn('year', lit(2015))
jogadores_2015_ano.toPandas()

Unnamed: 0,ID,Apelido,ClubeID,PosicaoID,year
0,36443,Dida,285,1,2015
1,36540,Juan,285,3,2015
2,36591,Leonardo,277,3,2015
3,36612,Zé Roberto,275,2,2015
4,36650,Magno Alves,266,5,2015
...,...,...,...,...,...
1021,93317,Diogo Giacomini,,6,2015
1022,93368,Lucas Veríssimo,,3,2015
1023,93376,Wesley Souza,,4,2015
1024,93377,Jobson,,4,2015


In [30]:
jogadores_2015_ano.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/jogadores/')

In [31]:
#transforma arquivo em parquet
jogadores_2015_parquet = spark.read.option('basePath', '/cartola/clean/jogadores/').parquet('/cartola/clean/jogadores/*')

In [32]:
jogadores_2015_parquet.toPandas()

Unnamed: 0,ID,Apelido,ClubeID,PosicaoID,year
0,54797,Túlio De Melo,,5,2015
1,55519,Edmílson,315,5,2015
2,56102,João Paulo,264,4,2015
3,60752,Cristóvão Borges,,6,2015
4,60780,Vinícius Eutrópio,315,6,2015
...,...,...,...,...,...
2054,89342,Caju,277,2,2015
2055,89343,Júnior,284,2,2015
2056,89444,Hugo Ragelli,,5,2015
2057,89445,Yuri,315,5,2015


In [35]:
print(jogadores_2015_parquet.count())

2059


## Partidas_2015

In [36]:
#Carrega arquivo CSV
partidas_2015_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2015/2015_partidas.csv", header=True)


In [37]:
partidas_2015_df.show(5)

+---+----+-----+------------------+----------------+-----+----------------+--------------------+----+
|_c0|game|round|              date|       home_team|score|       away_team|               arena|   X|
+---+----+-----+------------------+----------------+-----+----------------+--------------------+----+
|  1|   1|    1|09/05/2015 - 18:30|  Palmeiras - SP|2 x 2|   Atlético - MG|Allianz Parque - ...|null|
|  2|   2|    1|09/05/2015 - 18:30|Chapecoense - SC|2 x 1|   Coritiba - PR|Arena Condá - Cha...|null|
|  3|   3|    1|09/05/2015 - 21:00| Fluminense - RJ|1 x 0|  Joinville - SC|Maracanã - Rio de...|null|
|  4|   4|    1|10/05/2015 - 16:00|  São Paulo - SP|2 x 1|   Flamengo - RJ|Morumbi - Sao Pau...|null|
|  5|   5|    1|10/05/2015 - 16:00|   Cruzeiro - MG|0 x 1|Corinthians - SP|Arena Pantanal - ...|null|
+---+----+-----+------------------+----------------+-----+----------------+--------------------+----+
only showing top 5 rows



In [39]:
# Criar nome do time com a string antes do Hífen
partidas_2015_ct = partidas_2015_df.withColumn('away_team', remove_hiphen_udf(partidas_2015_df['away_team']))

partidas_2015_ct = partidas_2015_ct.withColumn('away_team', lower(col('away_team')))

partidas_2015_ct = partidas_2015_ct.withColumn('away_team', trim(col('away_team')))

partidas_2015_ct = partidas_2015_ct.withColumn('home_team', remove_hiphen_udf(partidas_2015_ct['home_team']))

partidas_2015_ct = partidas_2015_ct.withColumn('home_team', lower(col('home_team')))

partidas_2015_ct = partidas_2015_ct.withColumn('home_team', trim(col('home_team')))

partidas_2015_ct = partidas_2015_ct.withColumn('home_score', substring('score', 1,1))\

partidas_2015_ct = partidas_2015_ct.withColumn('away_score', substring('score', 5, 5))\

partidas_2015_ct = partidas_2015_ct.withColumn('home_score', partidas_2015_ct['home_score'].cast(IntegerType()))

partidas_2015_ct = partidas_2015_ct.withColumn('away_score', partidas_2015_ct['away_score'].cast(IntegerType()))

partidas_2015_ct = partidas_2015_ct.withColumn('total_gols', partidas_2015_ct['away_score'] + partidas_2015_ct['home_score'] )

partidas_2015_ct = partidas_2015_ct.withColumn('year', lit(2015))

time_ganhador = expr(
    """IF(home_score > away_score, home_team, IF(home_score = away_score, 'empate', away_team))"""
)

partidas_2015_ct = partidas_2015_ct.withColumn('result', time_ganhador)

partidas_2015_ct.toPandas()

Unnamed: 0,_c0,game,round,date,home_team,score,away_team,arena,X,home_score,away_score,total_gols,year,result
0,1,1,1,09/05/2015 - 18:30,palmeiras,2 x 2,atlético - mg,Allianz Parque - Sao Paulo - SP,,2,2,4,2015,empate
1,2,2,1,09/05/2015 - 18:30,chapecoense,2 x 1,coritiba,Arena Condá - Chapeco - SC,,2,1,3,2015,chapecoense
2,3,3,1,09/05/2015 - 21:00,fluminense,1 x 0,joinville,Maracanã - Rio de Janeiro - RJ,,1,0,1,2015,fluminense
3,4,4,1,10/05/2015 - 16:00,são paulo,2 x 1,flamengo,Morumbi - Sao Paulo - SP,,2,1,3,2015,são paulo
4,5,5,1,10/05/2015 - 16:00,cruzeiro,0 x 1,corinthians,Arena Pantanal - Cuiaba - MT,,0,1,1,2015,corinthians
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,376,376,38,06/12/2015 - 17:00,atlético - mg,3 x 0,chapecoense,Mineirão - Belo Horizonte - MG,,3,0,3,2015,atlético - mg
376,377,377,38,06/12/2015 - 17:00,figueirense,1 x 0,fluminense,Orlando Scarpelli - Florianopolis - SC,,1,0,1,2015,figueirense
377,378,378,38,06/12/2015 - 17:00,coritiba,0 x 0,vasco da gama,Couto Pereira - Curitiba - PR,,0,0,0,2015,empate
378,379,379,38,06/12/2015 - 17:00,goiás,0 x 1,são paulo,Serra Dourada - Goiania - GO,,0,1,1,2015,são paulo


In [42]:
# Removendo colunas
partidas_2015_ct = partidas_2015_ct.drop('_c0', 'X', 'game_id')
# Criando ID para DF
# partidas_2015_ct = partidas_2015_ct.withColumn('game_id', monotonically_increasing_id())
partidas_2015_ct.toPandas()

Unnamed: 0,game,round,date,home_team,score,away_team,arena,home_score,away_score,total_gols,year,result
0,1,1,09/05/2015 - 18:30,palmeiras,2 x 2,atlético - mg,Allianz Parque - Sao Paulo - SP,2,2,4,2015,empate
1,2,1,09/05/2015 - 18:30,chapecoense,2 x 1,coritiba,Arena Condá - Chapeco - SC,2,1,3,2015,chapecoense
2,3,1,09/05/2015 - 21:00,fluminense,1 x 0,joinville,Maracanã - Rio de Janeiro - RJ,1,0,1,2015,fluminense
3,4,1,10/05/2015 - 16:00,são paulo,2 x 1,flamengo,Morumbi - Sao Paulo - SP,2,1,3,2015,são paulo
4,5,1,10/05/2015 - 16:00,cruzeiro,0 x 1,corinthians,Arena Pantanal - Cuiaba - MT,0,1,1,2015,corinthians
...,...,...,...,...,...,...,...,...,...,...,...,...
375,376,38,06/12/2015 - 17:00,atlético - mg,3 x 0,chapecoense,Mineirão - Belo Horizonte - MG,3,0,3,2015,atlético - mg
376,377,38,06/12/2015 - 17:00,figueirense,1 x 0,fluminense,Orlando Scarpelli - Florianopolis - SC,1,0,1,2015,figueirense
377,378,38,06/12/2015 - 17:00,coritiba,0 x 0,vasco da gama,Couto Pereira - Curitiba - PR,0,0,0,2015,empate
378,379,38,06/12/2015 - 17:00,goiás,0 x 1,são paulo,Serra Dourada - Goiania - GO,0,1,1,2015,são paulo


In [43]:
partidas_2015_ct.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/partidas/')

In [44]:
#transforma arquivo em parquet
partidas_2015_ct = spark.read.option('basePath', '/cartola/clean/partidas/').parquet('/cartola/clean/partidas/*')

In [45]:
partidas_2015_ct.toPandas()


Unnamed: 0,game,round,date,home_team,score,away_team,arena,home_score,away_score,total_gols,result,year
0,1,1,20/04/2014 - 18:30,flamengo,0 x 0,goiás,Mané Garrincha - Brasilia - DF,0,0,0,empate,2014
1,2,1,19/04/2014 - 18:30,fluminense,3 x 0,figueirense,Maracanã - Rio de Janeiro - RJ,3,0,3,fluminense,2014
2,3,1,20/04/2014 - 16:00,são paulo,3 x 0,botafogo,Morumbi - Sao Paulo - SP,3,0,3,são paulo,2014
3,4,1,20/04/2014 - 18:30,santos,1 x 1,sport,Vila Belmiro - Santos - SP,1,1,2,empate,2014
4,5,1,20/04/2014 - 16:00,atletico - pr,1 x 0,grêmio,Orlando Scarpelli - Florianopolis - SC,1,0,1,atletico - pr,2014
...,...,...,...,...,...,...,...,...,...,...,...,...
755,376,38,06/12/2015 - 17:00,atlético - mg,3 x 0,chapecoense,Mineirão - Belo Horizonte - MG,3,0,3,atlético - mg,2015
756,377,38,06/12/2015 - 17:00,figueirense,1 x 0,fluminense,Orlando Scarpelli - Florianopolis - SC,1,0,1,figueirense,2015
757,378,38,06/12/2015 - 17:00,coritiba,0 x 0,vasco da gama,Couto Pereira - Curitiba - PR,0,0,0,empate,2015
758,379,38,06/12/2015 - 17:00,goiás,0 x 1,são paulo,Serra Dourada - Goiania - GO,0,1,1,são paulo,2015


## Times

In [46]:
times_2015_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2015/2015_times.csv", header=True)
sorted_times_2015_df = times_2015_df.sort(times_2015_df.ID.asc())

#Chegar se o arquivo está no hdfs

times_2015_ano = times_2015_df.withColumn('year', lit(2015))
times_2015_ano.toPandas()

Unnamed: 0,ID,Nome,Abreviacao,Slug,year
0,262,Flamengo,fla,flamengo,2015
1,264,Corinthians,cor,corinthians,2015
2,266,Fluminense,flu,fluminense,2015
3,267,Vasco,vas,vasco,2015
4,275,Palmeiras,pal,palmeiras,2015
5,276,São Paulo,sao,sao-paulo,2015
6,277,Santos,san,santos,2015
7,282,Atlético-MG,cam,atletico-mg,2015
8,283,Cruzeiro,cru,cruzeiro,2015
9,284,Grêmio,gre,gremio,2015


In [47]:
times_2015_ano.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/times/')
#transforma arquivo em parquet
times_2015_parquet = spark.read.option('basePath', '/cartola/clean/times/').parquet('/cartola/clean/times/*')

In [49]:
times_2015_parquet.toPandas()

Unnamed: 0,ID,Nome,Abreviacao,Slug,year
0,262,flamengo,FLA,flamengo,2014
1,263,botafogo,BOT,botafogo,2014
2,264,corinthians,COR,corinthians,2014
3,265,bahia,BAH,bahia,2014
4,266,fluminense,FLU,fluminense,2014
5,275,palmeiras,PAL,palmeiras,2014
6,276,são paulo,SAO,sao-paulo,2014
7,277,santos,SAN,santos,2014
8,282,atlético-mg,CAM,atletico-mg,2014
9,283,cruzeiro,CRU,cruzeiro,2014


## scouts_raw

In [65]:
# Analisando o arquivo 2014_lances.csv vimos que não é necessário processar esse arquivo, pois as informações relevantes estão em scouts_raw.

In [66]:
scouts_raw_2015_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2015/2015_scouts_raw.csv", header=True)

In [67]:
scouts_raw_2015_ano = scouts_raw_2015_df.withColumn('year', lit(2015))

In [68]:
scouts_raw_2015_ano = scouts_raw_2015_ano.withColumn("Pontos", scouts_raw_2015_ano["Pontos"].cast(FloatType()))

In [69]:
scouts_raw_2015_ano.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/scouts/')
#transforma arquivo em parquet
scouts_2015_parquet = spark.read.option('basePath', '/cartola/clean/scouts/').parquet('/cartola/clean/scouts/*')

In [70]:
scouts_2015_parquet.toPandas()

Unnamed: 0,AtletaID,Rodada,ClubeID,Participou,Posicao,Jogos,Pontos,PontosMedia,Preco,PrecoVariacao,...,RB,FC,GC,CA,CV,SG,DD,DP,GS,year
0,81219,0,262,,,0,0.0,0,1,0,...,0,0,0,0,0,0,0,0,0,2015
1,88072,0,262,,,0,0.0,0,1,0,...,0,0,0,0,0,0,0,0,0,2015
2,89258,0,262,,,0,0.0,0,1,0,...,0,0,0,0,0,0,0,0,0,2015
3,91263,0,262,,,0,0.0,0,1,0,...,0,0,0,0,0,0,0,0,0,2015
4,74103,0,262,,,0,0.0,0,2,0,...,0,0,0,0,0,0,0,0,0,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62072,89924,38,,0,,1,0.0,-0.8,0.81,0,...,0,1,0,0,0,0,0,0,0,2014
62073,84794,38,293,1,4,1,0.9,0.9,1.24,0.24,...,1,1,0,0,0,0,0,0,0,2014
62074,89815,38,285,1,4,2,1.0,0.25,1,0.19,...,0,0,0,0,0,0,0,0,0,2014
62075,82639,38,284,0,4,0,0.0,0,1,0,...,0,0,0,0,0,0,0,0,0,2014


In [76]:
scouts_raw_2015_ano.schema

StructType(List(StructField(Rodada,StringType,true),StructField(ClubeID,StringType,true),StructField(AtletaID,StringType,true),StructField(Jogos,StringType,true),StructField(Pontos,FloatType,true),StructField(PontosMedia,StringType,true),StructField(Preco,StringType,true),StructField(PrecoVariacao,StringType,true),StructField(FS,StringType,true),StructField(PE,StringType,true),StructField(A,StringType,true),StructField(FT,StringType,true),StructField(FD,StringType,true),StructField(FF,StringType,true),StructField(G,StringType,true),StructField(I,StringType,true),StructField(PP,StringType,true),StructField(RB,StringType,true),StructField(FC,StringType,true),StructField(GC,StringType,true),StructField(CA,StringType,true),StructField(CV,StringType,true),StructField(SG,StringType,true),StructField(DD,StringType,true),StructField(DP,StringType,true),StructField(GS,StringType,true),StructField(year,IntegerType,false)))

In [77]:
# -        Quantas partidas resultaram em empate?
pontos_por_atleta = scouts_raw_2015_ano[scouts_raw_2015_ano['year'] == 2015]

pontos_por_atleta = pontos_por_atleta.groupBy("AtletaID").agg(sum("Pontos").alias("SomaPontos"))

#count_result = count_result.withColumn('total_wins', count_result['count(1)'])

pontos_por_atleta = pontos_por_atleta.sort(pontos_por_atleta.SomaPontos.desc())

#count_result = count_result.drop('count(1)')

#count_result.show(5)


pontos_por_atleta.toPandas()

Unnamed: 0,AtletaID,SomaPontos
0,68872,201.100000
1,69051,189.900000
2,38315,171.099999
3,42234,169.899999
4,86759,167.500000
...,...,...
1021,63082,-7.000000
1022,50294,-7.400000
1023,36802,-7.500000
1024,68922,-11.800000


In [81]:
scouts_atletas = pontos_por_atleta.join(jogadores_2015_parquet, pontos_por_atleta.AtletaID == jogadores_2015_parquet.ID)
scouts_atletas = scouts_atletas.sort(scouts_atletas.SomaPontos.desc())
scouts_atletas.toPandas()

Unnamed: 0,AtletaID,SomaPontos,ID,Apelido,ClubeID,PosicaoID,year
0,68872,201.100000,68872,Marcelo Lomba,265,1,2014
1,68872,201.100000,68872,Marcelo Lomba,303,1,2015
2,69051,189.900000,69051,Danilo Fernandes,264,1,2014
3,69051,189.900000,69051,Danilo Fernandes,292,1,2015
4,38315,171.099999,38315,Jadson,264,4,2014
...,...,...,...,...,...,...,...
1580,36802,-7.500000,36802,Ronaldinho Gaúcho,282,4,2014
1581,68922,-11.800000,68922,Fabrício,,4,2015
1582,68922,-11.800000,68922,Fabrício,276,4,2014
1583,38074,-13.500000,38074,Ygor,285,4,2014


## DEMONSTRAÇÃO

In [None]:
partidas_2014_df = spark.read.csv("/cartola/data/2014/2014_jogadores.csv", header=True)
partidas_2014_ct = partidas_2014_df.withColumn('time', regexp_replace('home_team', ' - RJ', ''))
final_partidas = partidas_2014_ct.withColumn('time_low', lower(col('time'))).show(truncate=False)

In [None]:
with_ano_partidas = partidas_2014_ct.withColumn('ano', lit(2014)).show(truncate=False)
with_ano_partidas.show()

In [None]:
Comando para sobrescrever arquivo caso já existente.
Agrupar scouts por ID e contar (caso tenha duplicidade)

In [85]:
inner_join = partidas_ids_2014_ano.join(times_2014_ano, partidas_ids_2014_ano.Casa == times_2014_ano.ID)
inner_join.toPandas()

Unnamed: 0,ID,Rodada,Casa,Visitante,PlacarCasa,PlacarVisitante,Resultado,ano,ID.1,Nome,Abreviacao,Slug,ano.1
0,179872,1,262,290,0,0,Empate,2014,262,flamengo,FLA,flamengo,2014
1,179873,1,266,316,3,0,Casa,2014,266,fluminense,FLU,fluminense,2014
2,179874,1,276,263,3,0,Casa,2014,276,são paulo,SAO,sao-paulo,2014
3,179875,1,277,292,1,1,Empate,2014,277,santos,SAN,santos,2014
4,179876,1,293,284,1,0,Casa,2014,293,atlético-pr,CAP,atletico-pr,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,180250,29,282,315,1,0,Casa,2014,282,atlético-mg,CAM,atletico-mg,2014
376,180251,29,287,283,0,1,Visitante,2014,287,vitória,VIT,vitoria,2014
377,180252,29,285,264,1,2,Visitante,2014,285,internacional,INT,internacional,2014
378,180253,29,316,294,4,0,Casa,2014,316,figueirense,FIG,figueirense,2014


In [None]:
scouts_atletas = pontos_por_atleta.join(jogadores_2014_parquet, pontos_por_atleta.Atleta == jogadores_2014_parquet.ID)
scouts_atletas = scouts_atletas.sort(scouts_atletas.SomaPontos.desc())
scouts_atletas.toPandas()

In [None]:
# -        Quantas partidas resultaram em empate?
pontos_por_atleta = scouts_raw_2014_ano.groupBy("Atleta").agg(sum("Pontos").alias("SomaPontos"))

#count_result = count_result.withColumn('total_wins', count_result['count(1)'])

pontos_por_atleta = pontos_por_atleta.sort(pontos_por_atleta.SomaPontos.desc())

#count_result = count_result.drop('count(1)')

#count_result.show(5)


pontos_por_atleta.toPandas()