## Limpeza de dados Cartola ano 2018

In [1]:
from pyspark.sql import HiveContext
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.sql.functions import lower, col, lit, regexp_replace, trim, substring, when, expr, udf, count, sum, monotonically_increasing_id
import pandas as pd
import json
import requests

# Confirguração para não sobrescrever DF
spark.conf.set('spark.sql.sources.partitionOverwriteMode', 'dynamic')

## Funções Custonmizadas

In [2]:
def remove_after_hiphen(team_name):
    if team_name.startswith('atletico') or team_name.startswith('atl')  or team_name.startswith('Atl') or team_name.startswith('Ath'):
        return team_name
    else:
        return team_name.split('-', 1)[0]

In [3]:
#Remove dos valores das Strings o que estiver após os hiphen
remove_hiphen_udf = udf(remove_after_hiphen, StringType())
#partidas_2014_ct = partidas_2014_df.withColumn('away_team', remove_hiphen_udf(partidas_2014_df['away_team']))

## Jogadores

In [22]:
#Carrega arquivo CSV Jogadores
jogadores_2018_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2018/2018-medias-jogadores.csv", header=True)
sorted_jogadores_2018_df = jogadores_2018_df.sort(jogadores_2018_df.player_id.asc())


In [23]:
#Carrega arquivo Posicoes_ID
posicoes_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/posicoes_ids.csv", header=True)
posicoes_df.toPandas()


Unnamed: 0,_c0,Cod,Position,abbr
0,1,1,Goleiro,gol
1,2,2,Lateral,lat
2,3,3,Zagueiro,zag
3,4,4,Meia,mei
4,5,5,Atacante,ata
5,6,6,Técnico,tec


In [24]:
#Adiciona coluna ANO = 2018
jogadores_2018_ano = sorted_jogadores_2018_df.withColumn('year', lit(2018))
jogadores_2018_ano.toPandas()

Unnamed: 0,player_slug,player_id,player_nickname,player_team,player_position,price_cartoletas,score_mean,score_no_cleansheets_mean,diff_home_away_s,n_games,...,RB_mean,PE_mean,A_mean,I_mean,FS_mean,FF_mean,G_mean,DD_mean,DP_mean,year
0,rodrygo,100651,Rodrygo,277,ata,13.05,4.23939393939394,4.23939393939394,1.69872581129181,33,...,0.575757575757576,2.15151515151515,0.0909090909090909,0.0303030303030303,2,0.484848484848485,0.242424242424242,0,0,2018
1,matheus-henrique,100742,Matheus Henrique,284,mei,1.68,2.74,2.74,1.44156929300436,10,...,1.3,1.5,0,0,0.9,0.3,0.2,0,0,2018
2,liziero,100761,Liziero,276,lat,6.24,3.7,1.91428571428571,-0.554420343029042,28,...,1.25,1.10714285714286,0.0357142857142857,0.0357142857142857,0.964285714285714,0.285714285714286,0.0357142857142857,0,0,2018
3,ademir,101314,Ademir,327,ata,3.42,3.23076923076923,3.23076923076923,0.853530696680107,13,...,0.923076923076923,1.38461538461538,0.0769230769230769,0.230769230769231,2.46153846153846,0.615384615384615,0.0769230769230769,0,0,2018
4,bruno-silva,101422,Bruno Silva,315,ata,2.91,2.03666666666667,2.03666666666667,0.901966308217755,30,...,1.06666666666667,1.86666666666667,0.0666666666666667,0.166666666666667,0.8,0.466666666666667,0.0333333333333333,0,0,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,bremer,99817,Bremer,282,zag,6.26,3.71,2.71,-0.0205807302883811,10,...,2,1.8,0,0,0.4,0.1,0.1,0,0,2018
422,ricardo,99881,Ricardo,267,zag,4.49,1.86153846153846,0.323076923076923,-0.709274581839311,13,...,1.15384615384615,1.15384615384615,0,0.153846153846154,0.692307692307692,0,0,0,0,2018
423,chara,99891,Chará,282,ata,10.47,3.98695652173913,3.98695652173913,0.427681539951869,25,...,1.1304347826087,2,0.304347826086957,0.173913043478261,2.17391304347826,0.434782608695652,0.0434782608695652,0,0,2018
424,junior-brumado,99894,Júnior Brumado,265,ata,3.18,1.125,1.125,0.0942015783034407,12,...,0.416666666666667,0.666666666666667,0,0.5,0.75,0.833333333333333,0.0833333333333333,0,0,2018


In [25]:
jogadores_2018_ano = jogadores_2018_ano['player_id', 'player_nickname', 'player_team', 'player_position']
jogadores_2018_ano.toPandas()

Unnamed: 0,player_id,player_nickname,player_team,player_position
0,100651,Rodrygo,277,ata
1,100742,Matheus Henrique,284,mei
2,100761,Liziero,276,lat
3,101314,Ademir,327,ata
4,101422,Bruno Silva,315,ata
...,...,...,...,...
421,99817,Bremer,282,zag
422,99881,Ricardo,267,zag
423,99891,Chará,282,ata
424,99894,Júnior Brumado,265,ata


In [26]:
jogadores_posicoes = jogadores_2018_ano.join(posicoes_df, jogadores_2018_ano.player_position == posicoes_df.abbr)
jogadores_posicoes.toPandas()

Unnamed: 0,player_id,player_nickname,player_team,player_position,_c0,Cod,Position,abbr
0,100651,Rodrygo,277,ata,5,5,Atacante,ata
1,100742,Matheus Henrique,284,mei,4,4,Meia,mei
2,100761,Liziero,276,lat,2,2,Lateral,lat
3,101314,Ademir,327,ata,5,5,Atacante,ata
4,101422,Bruno Silva,315,ata,5,5,Atacante,ata
...,...,...,...,...,...,...,...,...
421,99817,Bremer,282,zag,3,3,Zagueiro,zag
422,99881,Ricardo,267,zag,3,3,Zagueiro,zag
423,99891,Chará,282,ata,5,5,Atacante,ata
424,99894,Júnior Brumado,265,ata,5,5,Atacante,ata


In [27]:
jogadores_posicoes = jogadores_posicoes.withColumnRenamed('player_id', 'ID')
jogadores_posicoes = jogadores_posicoes.withColumnRenamed('player_nickname', 'Apelido')
jogadores_posicoes = jogadores_posicoes.withColumnRenamed('player_team', 'ClubeID')
jogadores_posicoes = jogadores_posicoes.withColumnRenamed('Cod', 'PosicaoID')

In [28]:
# Removendo colunas
jogadores = jogadores_posicoes.drop('_c0', 'player_position', 'Position', 'abbr')

In [29]:
jogadores.toPandas()

Unnamed: 0,ID,Apelido,ClubeID,PosicaoID
0,100651,Rodrygo,277,5
1,100742,Matheus Henrique,284,4
2,100761,Liziero,276,2
3,101314,Ademir,327,5
4,101422,Bruno Silva,315,5
...,...,...,...,...
421,99817,Bremer,282,3
422,99881,Ricardo,267,3
423,99891,Chará,282,5
424,99894,Júnior Brumado,265,5


In [30]:
jogadores_2018_ano = jogadores.withColumn('year', lit(2018))
jogadores_2018_ano.toPandas()

Unnamed: 0,ID,Apelido,ClubeID,PosicaoID,year
0,100651,Rodrygo,277,5,2018
1,100742,Matheus Henrique,284,4,2018
2,100761,Liziero,276,2,2018
3,101314,Ademir,327,5,2018
4,101422,Bruno Silva,315,5,2018
...,...,...,...,...,...
421,99817,Bremer,282,3,2018
422,99881,Ricardo,267,3,2018
423,99891,Chará,282,5,2018
424,99894,Júnior Brumado,265,5,2018


In [31]:
jogadores_2018_ano.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/jogadores/')

In [32]:
#transforma arquivo em parquet
jogadores_2018_parquet = spark.read.option('basePath', '/cartola/clean/jogadores/').parquet('/cartola/clean/jogadores/*')

In [33]:
jogadores_2018_parquet.toPandas()

Unnamed: 0,ID,Apelido,ClubeID,PosicaoID,year
0,51683,Bruno Rangel,315,5,2014
1,51705,Bruno Rodrigo,283,3,2014
2,51772,Éverton Ribeiro,283,4,2014
3,51779,Pedro Botelho,282,2,2014
4,51781,Ávine,265,2,2014
...,...,...,...,...,...
4232,85930,Aylon,327,5,2018
4233,51792,Kanu,287,3,2018
4234,52190,Ralf,264,4,2018
4235,91607,Rony,293,5,2018


In [34]:
print(jogadores_2018_parquet.count())

4237


## Partidas_2018

In [35]:
#Carrega arquivo CSV
partidas_2018_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2018/2018_partidas.csv", header=True)

In [36]:
partidas_2018_df.show(5)

+----+-----+------------------+------------------+-----+----------------+--------------------+
|game|round|              date|         home_team|score|       away_team|               arena|
+----+-----+------------------+------------------+-----+----------------+--------------------+
|   1|    1|14/04/2018 - 16:00|     Cruzeiro - MG|0 x 1|     Grêmio - RS|Mineirão - Belo H...|
|   2|    1|15/04/2018 - 19:00|     Atlético - PR|5 x 1|Chapecoense - SC|Arena da Baixada ...|
|   3|    1|15/04/2018 - 11:00|      América - MG|3 x 0|      Sport - PE|Independência - B...|
|   4|    1|14/04/2018 - 19:00|      Vitória - BA|2 x 2|   Flamengo - RJ|Manoel Barradas -...|
|   5|    1|15/04/2018 - 16:00|Vasco da Gama - RJ|2 x 1|   Atlético - MG|São Januário - Ri...|
+----+-----+------------------+------------------+-----+----------------+--------------------+
only showing top 5 rows



In [37]:
# Criar nome do time com a string antes do Hífen
partidas_2018_ct = partidas_2018_df.withColumn('away_team', remove_hiphen_udf(partidas_2018_df['away_team']))

partidas_2018_ct = partidas_2018_ct.withColumn('away_team', lower(col('away_team')))

partidas_2018_ct = partidas_2018_ct.withColumn('away_team', trim(col('away_team')))

partidas_2018_ct = partidas_2018_ct.withColumn('home_team', remove_hiphen_udf(partidas_2018_ct['home_team']))

partidas_2018_ct = partidas_2018_ct.withColumn('home_team', lower(col('home_team')))

partidas_2018_ct = partidas_2018_ct.withColumn('home_team', trim(col('home_team')))

partidas_2018_ct = partidas_2018_ct.withColumn('home_score', substring('score', 1,1))\

partidas_2018_ct = partidas_2018_ct.withColumn('away_score', substring('score', 5, 5))\

partidas_2018_ct = partidas_2018_ct.withColumn('home_score', partidas_2018_ct['home_score'].cast(IntegerType()))

partidas_2018_ct = partidas_2018_ct.withColumn('away_score', partidas_2018_ct['away_score'].cast(IntegerType()))

partidas_2018_ct = partidas_2018_ct.withColumn('total_gols', partidas_2018_ct['away_score'] + partidas_2018_ct['home_score'] )

partidas_2018_ct = partidas_2018_ct.withColumn('year', lit(2018))

time_ganhador = expr(
    """IF(home_score > away_score, home_team, IF(home_score = away_score, 'empate', away_team))"""
)

partidas_2018_ct = partidas_2018_ct.withColumn('result', time_ganhador)

partidas_2018_ct.toPandas()

Unnamed: 0,game,round,date,home_team,score,away_team,arena,home_score,away_score,total_gols,year,result
0,1,1,14/04/2018 - 16:00,cruzeiro,0 x 1,grêmio,Mineirão - Belo Horizonte - MG,0,1.0,1.0,2018,grêmio
1,2,1,15/04/2018 - 19:00,atlético - pr,5 x 1,chapecoense,Arena da Baixada - Curitiba - PR,5,1.0,6.0,2018,atlético - pr
2,3,1,15/04/2018 - 11:00,américa,3 x 0,sport,Independência - Belo Horizonte - MG,3,0.0,3.0,2018,américa
3,4,1,14/04/2018 - 19:00,vitória,2 x 2,flamengo,Manoel Barradas - Salvador - BA,2,2.0,4.0,2018,empate
4,5,1,15/04/2018 - 16:00,vasco da gama,2 x 1,atlético - mg,São Januário - Rio de Janeiro - RJ,2,1.0,3.0,2018,vasco da gama
...,...,...,...,...,...,...,...,...,...,...,...,...
375,376,38,02/12/2018 - 17:00,fluminense,1 x 0,américa,Correto,1,0.0,1.0,2018,fluminense
376,377,38,02/12/2018 - 17:00,ceará,0 x 0,vasco da gama,Correto,0,0.0,0.0,2018,empate
377,378,38,02/12/2018 - 17:00,sport,2 x 1,santos,Correto,2,1.0,3.0,2018,sport
378,379,38,02/12/2018 - 17:00,palmeiras,3 x 2,vitória,Correto,3,2.0,5.0,2018,palmeiras


In [38]:
partidas_2018_ct.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/partidas/')

In [39]:
#transforma arquivo em parquet
partidas_2018_parquet = spark.read.option('basePath', '/cartola/clean/partidas/').parquet('/cartola/clean/partidas/*')

In [40]:
partidas_2018_parquet.toPandas()


Unnamed: 0,game,round,date,home_team,score,away_team,arena,home_score,away_score,total_gols,result,year
0,1,1,20/04/2014 - 18:30,flamengo,0 x 0,goiás,Mané Garrincha - Brasilia - DF,0.0,0.0,0.0,empate,2014
1,2,1,19/04/2014 - 18:30,fluminense,3 x 0,figueirense,Maracanã - Rio de Janeiro - RJ,3.0,0.0,3.0,fluminense,2014
2,3,1,20/04/2014 - 16:00,são paulo,3 x 0,botafogo,Morumbi - Sao Paulo - SP,3.0,0.0,3.0,são paulo,2014
3,4,1,20/04/2014 - 18:30,santos,1 x 1,sport,Vila Belmiro - Santos - SP,1.0,1.0,2.0,empate,2014
4,5,1,20/04/2014 - 16:00,atletico - pr,1 x 0,grêmio,Orlando Scarpelli - Florianopolis - SC,1.0,0.0,1.0,atletico - pr,2014
...,...,...,...,...,...,...,...,...,...,...,...,...
1895,376,38,02/12/2018 - 17:00,fluminense,1 x 0,américa,Correto,1.0,0.0,1.0,fluminense,2018
1896,377,38,02/12/2018 - 17:00,ceará,0 x 0,vasco da gama,Correto,0.0,0.0,0.0,empate,2018
1897,378,38,02/12/2018 - 17:00,sport,2 x 1,santos,Correto,2.0,1.0,3.0,sport,2018
1898,379,38,02/12/2018 - 17:00,palmeiras,3 x 2,vitória,Correto,3.0,2.0,5.0,palmeiras,2018


## Times

In [72]:
times_2018_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2018/rodada-1.csv", header=True)

times_2018_df.schema

StructType(List(StructField(_c0,StringType,true),StructField(atletas.nome,StringType,true),StructField(atletas.slug,StringType,true),StructField(atletas.apelido,StringType,true),StructField(atletas.foto,StringType,true),StructField(atletas.atleta_id,StringType,true),StructField(atletas.rodada_id,StringType,true),StructField(atletas.clube_id,StringType,true),StructField(atletas.posicao_id,StringType,true),StructField(atletas.status_id,StringType,true),StructField(atletas.pontos_num,StringType,true),StructField(atletas.preco_num,StringType,true),StructField(atletas.variacao_num,StringType,true),StructField(atletas.media_num,StringType,true),StructField(atletas.clube.id.full.name,StringType,true),StructField(FC,StringType,true),StructField(FD,StringType,true),StructField(FF,StringType,true),StructField(FS,StringType,true),StructField(G,StringType,true),StructField(I,StringType,true),StructField(RB,StringType,true),StructField(CA,StringType,true),StructField(PE,StringType,true),StructField

In [73]:
times_2018_df = times_2018_df.withColumnRenamed('atletas.clube_id', 'Abreviacao')
times_2018_df = times_2018_df.withColumnRenamed('atletas.clube.id.full.name', 'Nome')

In [74]:
times_2018_df = times_2018_df['Abreviacao', 'Nome']
times_2018_df = times_2018_df.dropDuplicates(['Abreviacao', 'Nome'])
times_2018_df.toPandas()

Unnamed: 0,Abreviacao,Nome
0,ATL,Atlético-MG
1,SAO,São Paulo
2,AME,América-MG
3,BOT,Botafogo
4,CRU,Cruzeiro
5,COR,Corinthians
6,CEA,Ceará
7,INT,Internacional
8,SAN,Santos
9,PAR,Paraná


In [75]:
#Ler tabela de referencia Times_ids
times_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/times_ids.csv", header=True)
times_df = times_df.withColumnRenamed('abreviacao', 'abbr')
times_df.limit(5).toPandas()

Unnamed: 0,nome.cbf,nome.cartola,nome.completo,cod.older,cod.2017,cod.2018,id,abbr,escudos.60x60,escudos.45x45,escudos.30x30
0,América - MG,América-MG,America MG,327,327,327,327,AME,https://s.glbimg.com/es/sde/f/organizacoes/201...,https://s.glbimg.com/es/sde/f/organizacoes/201...,https://s.glbimg.com/es/sde/f/organizacoes/201...
1,America - RN,Atlético-RN,America RN,200,200,1,200,OUT,,,
2,Atlético - GO,Atlético-GO,Atletico GO,201,373,373,373,ATL,,,
3,Atlético - MG,Atlético-MG,Atletico Mineiro,282,282,282,282,ATL,https://s.glbimg.com/es/sde/f/equipes/2017/11/...,https://s.glbimg.com/es/sde/f/equipes/2017/11/...,https://s.glbimg.com/es/sde/f/equipes/2017/11/...
4,Atlético - PR,Atlético-PR,Atletico Paranaense,293,293,293,293,ATL,https://s.glbimg.com/es/sde/f/equipes/2015/06/...,https://s.glbimg.com/es/sde/f/equipes/2015/06/...,https://s.glbimg.com/es/sde/f/equipes/2015/06/...


In [76]:
times_df = times_df.withColumnRenamed('nome.cartola', 'ncartola')

In [77]:
times = times_2018_df.join(times_df, times_2018_df.Nome == times_df.ncartola)

times.toPandas()

Unnamed: 0,Abreviacao,Nome,nome.cbf,ncartola,nome.completo,cod.older,cod.2017,cod.2018,id,abbr,escudos.60x60,escudos.45x45,escudos.30x30
0,ATL,Atlético-MG,Atlético - MG,Atlético-MG,Atletico Mineiro,282,282,282,282,ATL,https://s.glbimg.com/es/sde/f/equipes/2017/11/...,https://s.glbimg.com/es/sde/f/equipes/2017/11/...,https://s.glbimg.com/es/sde/f/equipes/2017/11/...
1,SAO,São Paulo,São Paulo - SP,São Paulo,Sao Paulo FC,276,276,276,276,SAO,https://s.glbimg.com/es/sde/f/equipes/2014/04/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...
2,AME,América-MG,América - MG,América-MG,America MG,327,327,327,327,AME,https://s.glbimg.com/es/sde/f/organizacoes/201...,https://s.glbimg.com/es/sde/f/organizacoes/201...,https://s.glbimg.com/es/sde/f/organizacoes/201...
3,BOT,Botafogo,Botafogo - RJ,Botafogo,Botafogo RJ,263,263,263,263,BOT,https://s.glbimg.com/es/sde/f/equipes/2014/04/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...
4,CRU,Cruzeiro,Cruzeiro - MG,Cruzeiro,Cruzeiro EC,283,283,283,283,CRU,https://s.glbimg.com/es/sde/f/equipes/2015/04/...,https://s.glbimg.com/es/sde/f/equipes/2015/04/...,https://s.glbimg.com/es/sde/f/equipes/2015/04/...
5,COR,Corinthians,Corinthians - SP,Corinthians,Corinthians SP,264,264,264,264,COR,https://s.glbimg.com/es/sde/f/equipes/2014/04/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...
6,CEA,Ceará,Ceará - CE,Ceará,Ceara SC,204,204,354,354,CEA,https://s.glbimg.com/es/sde/f/equipes/2018/05/...,https://s.glbimg.com/es/sde/f/equipes/2018/05/...,https://s.glbimg.com/es/sde/f/equipes/2018/05/...
7,INT,Internacional,Internacional - RS,Internacional,Internacional,285,285,285,285,INT,https://s.glbimg.com/es/sde/f/equipes/2016/05/...,https://s.glbimg.com/es/sde/f/equipes/2016/05/...,https://s.glbimg.com/es/sde/f/equipes/2016/05/...
8,SAN,Santos,Santos - SP,Santos,Santos FC,277,277,277,277,SAN,https://s.glbimg.com/es/sde/f/equipes/2014/04/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...,https://s.glbimg.com/es/sde/f/equipes/2013/12/...
9,PAR,Paraná,Paraná - PR,Paraná,Parana Clube,217,217,289,217,PAR,https://s.glbimg.com/es/sde/f/equipes/2014/04/...,https://s.glbimg.com/es/sde/f/equipes/2014/04/...,https://s.glbimg.com/es/sde/f/equipes/2014/04/...


In [78]:
times = times['Abreviacao', 'Nome', 'id']
times.toPandas()

Unnamed: 0,Abreviacao,Nome,id
0,ATL,Atlético-MG,282
1,SAO,São Paulo,276
2,AME,América-MG,327
3,BOT,Botafogo,263
4,CRU,Cruzeiro,283
5,COR,Corinthians,264
6,CEA,Ceará,354
7,INT,Internacional,285
8,SAN,Santos,277
9,PAR,Paraná,217


In [79]:
times_2018_df = times.dropDuplicates(['Abreviacao', 'Nome', 'id'])

In [80]:
times_2018_df = times_2018_df.withColumnRenamed('id', 'ID')
times_2018_df.toPandas()

Unnamed: 0,Abreviacao,Nome,ID
0,ATL,Atlético-MG,282
1,SAO,São Paulo,276
2,AME,América-MG,327
3,BOT,Botafogo,263
4,CRU,Cruzeiro,283
5,COR,Corinthians,264
6,CEA,Ceará,354
7,INT,Internacional,285
8,SAN,Santos,277
9,PAR,Paraná,217


In [81]:
#Adiciona coluna ANO = 2018
jogadores_2018_ano = times_2018_df.withColumn('year', lit(2018))
jogadores_2018_ano.toPandas()

Unnamed: 0,Abreviacao,Nome,ID,year
0,ATL,Atlético-MG,282,2018
1,SAO,São Paulo,276,2018
2,AME,América-MG,327,2018
3,BOT,Botafogo,263,2018
4,CRU,Cruzeiro,283,2018
5,COR,Corinthians,264,2018
6,CEA,Ceará,354,2018
7,INT,Internacional,285,2018
8,SAN,Santos,277,2018
9,PAR,Paraná,217,2018


In [82]:
jogadores_2018_ano.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/times/')
#transforma arquivo em parquet
times_2018_parquet = spark.read.option('basePath', '/cartola/clean/times/').parquet('/cartola/clean/times/*')

In [83]:
times_2018_parquet.toPandas()

Unnamed: 0,ID,Nome,Abreviacao,Slug,year
0,373,Atlético-GO,ATL,Atlético - GO,2017
1,282,Atlético-MG,ATL,Atlético - MG,2017
2,293,Atlético-PR,ATL,Atlético - PR,2017
3,314,Avaí,AVA,Avaí - SC,2017
4,265,Bahia,BAH,Bahia - BA,2017
...,...,...,...,...,...
95,265,Bahia,BAH,,2018
96,292,Sport,SPO,,2018
97,267,Vasco,VAS,,2018
98,293,Atlético-PR,ATL,,2018


## scouts_raw

In [63]:
# Analisando o arquivo 2014_lances.csv vimos que não é necessário processar esse arquivo, pois as informações relevantes estão em scouts_raw.

In [84]:
scouts_raw_2018_df = spark.read.option("encoding", "UTF-8").csv("/cartola/data/2018/rodada-*.csv", header=True)

In [85]:
scouts_raw_2018_ano = scouts_raw_2018_df.withColumn('year', lit(2018))

In [86]:
scouts_raw_2018_ano.toPandas()

Unnamed: 0,_c0,atletas.nome,atletas.slug,atletas.apelido,atletas.foto,atletas.atleta_id,atletas.rodada_id,atletas.clube_id,atletas.posicao_id,atletas.status_id,...,PE,RB,SG,CV,DD,GS,PP,GC,DP,year
0,1,Diego Fabián Torres,diego-torres,Diego Torres,https://s.glbimg.com/es/sde/f/2018/08/08/403db...,98794,38,CHA,mei,Provável,...,33,11,,,,,,,,2018
1,2,José Marcos Costa Martins,marquinhos,Marquinhos,https://s.glbimg.com/es/sde/f/2018/09/03/e0930...,98765,38,CHA,mei,Nulo,...,2,3,,,,,,,,2018
2,3,Marcelo dos Santos Rosa,marcelo,Marcelo,https://s.glbimg.com/es/sde/f/2018/05/08/38e74...,98874,38,ATL,ata,Nulo,...,,,,,,,,,,2018
3,4,Luis Antônio Venker de Menezes,mano-menezes,Mano Menezes,https://s.glbimg.com/es/sde/f/2018/05/18/b67ea...,37281,38,CRU,tec,Provável,...,,,,,,,,,,2018
4,5,Pedro Gomes Bortoluzo,pedro-bortoluzo,Pedro Bortoluzo,https://s.glbimg.com/es/sde/f/2018/09/04/08e0c...,99041,38,SAO,ata,Nulo,...,,,,,,,,,,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30950,785,Gilsivan Soares da Silva,ivan,Ivan,https://s.glbimg.com/es/sde/f/2018/03/02/66a48...,80326,1,CHA,gol,Nulo,...,,,,,,,,,,2018
30951,786,Kendy Tateishi Berbel,kendy,Kendy,https://s.glbimg.com/es/sde/f/2018/03/08/b7e98...,98421,1,CHA,ata,Nulo,...,,,,,,,,,,2018
30952,787,Nadson da Silva Almeida,nadson,Nadson,https://s.glbimg.com/es/sde/f/2017/04/25/9c026...,80140,1,CHA,mei,Nulo,...,,,,,,,,,,2018
30953,788,Khevin Rodrigo Fraga,khevin,Khevin,https://s.glbimg.com/es/sde/f/2018/03/08/88f9d...,99921,1,CHA,mei,Contundido,...,,,,,,,,,,2018


In [71]:
scouts_raw_2017_ano = scouts_raw_2017_ano.withColumnRenamed('atletas.atleta_id', 'AtletaID')
scouts_raw_2017_ano = scouts_raw_2017_ano.withColumnRenamed('atletas.clube_id', 'ClubeID')
scouts_raw_2017_ano = scouts_raw_2017_ano.withColumnRenamed('atletas.pontos_num', 'Pontos')
scouts_raw_2017_ano = scouts_raw_2017_ano.withColumnRenamed('atletas.preco_num', 'Preco')
scouts_raw_2017_ano = scouts_raw_2017_ano.withColumnRenamed('atletas.variacao_num', 'PrecoVariacao')
scouts_raw_2017_ano = scouts_raw_2017_ano.withColumnRenamed('atletas.preco_num', 'Preco')



In [68]:
scouts_raw_2017_ano = scouts_raw_2017_ano.drop('scout', 'atletas.apelido', 'atletas.clube.id.full.name', 'atletas.foto', 'atletas.jogos_num', 'atletas.nome', 'atletas.posicao_id', 'atletas.status_id',  )

In [72]:
scouts_raw_2017_ano = scouts_raw_2017_ano.withColumn("Pontos", scouts_raw_2017_ano["Pontos"].cast(FloatType()))

In [73]:
scouts_raw_2017_ano.write.mode('overwrite').partitionBy('year').parquet('/cartola/clean/scouts/')
#transforma arquivo em parquet
scouts_raw_2017_parquet = spark.read.option('basePath', '/cartola/clean/scouts/').parquet('/cartola/clean/scouts/*')

In [74]:
scouts_raw_2017_parquet.toPandas()

Unnamed: 0,AtletaID,Rodada,ClubeID,Participou,Posicao,Jogos,Pontos,PontosMedia,Preco,PrecoVariacao,...,RB,FC,GC,CA,CV,SG,DD,DP,GS,year
0,36540,0,FLA,,,,0.0,,5,0,...,,,,,,,,,,2017
1,36612,0,PAL,,,,0.0,,8,0,...,,,,,,,,,,2017
2,36943,0,ATL,,,,0.0,,10,0,...,,,,,,,,,,2017
3,37245,0,BAH,,,,0.0,,4,0,...,,,,,,,,,,2017
4,37246,0,SPO,,,,0.0,,4,0,...,,,,,,,,,,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125648,97451,38,294,FALSE,,,0.0,0,1,0,...,0,0,0,0,0,0,0,0,0,2016
125649,97450,38,294,FALSE,,,0.0,0,1,0,...,0,0,0,0,0,0,0,0,0,2016
125650,97460,38,344,TRUE,,,2.0,2,1.78,0.78,...,1,1,0,0,0,0,0,0,0,2016
125651,82626,38,285,FALSE,,,0.0,0,1,0,...,0,0,0,0,0,0,0,0,0,2016


In [77]:
# -        Quantas partidas resultaram em empate?
pontos_por_atleta = scouts_raw_2017_ano[scouts_raw_2017_ano['year'] == 2017]

jogadores_2017 = jogadores_2017_parquet[jogadores_2017_parquet['year'] == 2017]

pontos_por_atleta = pontos_por_atleta.groupBy("AtletaID").agg(sum("Pontos").alias("SomaPontos"))

#count_result = count_result.withColumn('total_wins', count_result['count(1)'])

pontos_por_atleta = pontos_por_atleta.sort(pontos_por_atleta.SomaPontos.desc())

#count_result = count_result.drop('count(1)')

#count_result.show(5)


pontos_por_atleta.toPandas()

Unnamed: 0,AtletaID,SomaPontos
0,62121,280.199999
1,82455,221.000001
2,50284,203.299999
3,61149,196.299999
4,73281,182.500001
...,...,...
1010,97528,-6.500000
1011,98352,-6.800000
1012,90569,-7.200000
1013,37688,-8.100000


In [78]:
scouts_atletas = pontos_por_atleta.join(jogadores_2017, pontos_por_atleta.AtletaID == jogadores_2017.ID)
scouts_atletas = scouts_atletas.sort(scouts_atletas.SomaPontos.desc())
scouts_atletas.toPandas()

Unnamed: 0,AtletaID,SomaPontos,ID,Apelido,ClubeID,PosicaoID,year
0,62121,280.199999,62121,Vanderlei,277,1,2017
1,82455,221.000001,82455,Zé Rafael,265,4,2017
2,50284,203.299999,50284,Wilson,294,1,2017
3,61149,196.299999,61149,Lucca,303,5,2017
4,73281,182.500001,73281,Renê Júnior,265,4,2017
...,...,...,...,...,...,...,...
746,97528,-6.500000,97528,Frazan,266,3,2017
747,98352,-6.800000,98352,Bruno Guimarães,293,4,2017
748,90569,-7.200000,90569,Yuri,277,4,2017
749,37688,-8.100000,37688,Cícero,276,4,2017


<img src="https://s2.glbimg.com/WmFP3xwH6VxMuCvR72fl8ldWurA=/0x0:690x6668/984x0/smart/filters:strip_icc()/i.s3.glbimg.com/v1/AUTH_bc8228b6673f488aa253bbcb03c80ec5/internal_photos/bs/2017/z/h/grGKnPRgmW6FtnMNiUNg/info-cartolao-2017-v4.jpg" width="800" height="400">

## DEMONSTRAÇÃO

In [None]:
partidas_2014_df = spark.read.csv("/cartola/data/2014/2014_jogadores.csv", header=True)
partidas_2014_ct = partidas_2014_df.withColumn('time', regexp_replace('home_team', ' - RJ', ''))
final_partidas = partidas_2014_ct.withColumn('time_low', lower(col('time'))).show(truncate=False)

In [None]:
with_ano_partidas = partidas_2014_ct.withColumn('ano', lit(2014)).show(truncate=False)
with_ano_partidas.show()

In [None]:
Comando para sobrescrever arquivo caso já existente.
Agrupar scouts por ID e contar (caso tenha duplicidade)

In [85]:
inner_join = partidas_ids_2014_ano.join(times_2014_ano, partidas_ids_2014_ano.Casa == times_2014_ano.ID)
inner_join.toPandas()

Unnamed: 0,ID,Rodada,Casa,Visitante,PlacarCasa,PlacarVisitante,Resultado,ano,ID.1,Nome,Abreviacao,Slug,ano.1
0,179872,1,262,290,0,0,Empate,2014,262,flamengo,FLA,flamengo,2014
1,179873,1,266,316,3,0,Casa,2014,266,fluminense,FLU,fluminense,2014
2,179874,1,276,263,3,0,Casa,2014,276,são paulo,SAO,sao-paulo,2014
3,179875,1,277,292,1,1,Empate,2014,277,santos,SAN,santos,2014
4,179876,1,293,284,1,0,Casa,2014,293,atlético-pr,CAP,atletico-pr,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,180250,29,282,315,1,0,Casa,2014,282,atlético-mg,CAM,atletico-mg,2014
376,180251,29,287,283,0,1,Visitante,2014,287,vitória,VIT,vitoria,2014
377,180252,29,285,264,1,2,Visitante,2014,285,internacional,INT,internacional,2014
378,180253,29,316,294,4,0,Casa,2014,316,figueirense,FIG,figueirense,2014


In [None]:
scouts_atletas = pontos_por_atleta.join(jogadores_2014_parquet, pontos_por_atleta.Atleta == jogadores_2014_parquet.ID)
scouts_atletas = scouts_atletas.sort(scouts_atletas.SomaPontos.desc())
scouts_atletas.toPandas()

In [None]:
# -        Quantas partidas resultaram em empate?
pontos_por_atleta = scouts_raw_2014_ano.groupBy("Atleta").agg(sum("Pontos").alias("SomaPontos"))

#count_result = count_result.withColumn('total_wins', count_result['count(1)'])

pontos_por_atleta = pontos_por_atleta.sort(pontos_por_atleta.SomaPontos.desc())

#count_result = count_result.drop('count(1)')

#count_result.show(5)


pontos_por_atleta.toPandas()