In [1]:
import numpy as np
import pandas as pd

In [2]:
df_rating = pd.read_pickle('players_rating_clean.zip')
df_players = pd.read_csv('titled_players.csv', index_col=0)

In [3]:
df_merged = df_rating.merge(df_players[['id', 'nom', 'titre', 'pays', 'naissance', 'sexe', 'actif']], how='left', on='id')

In [4]:
df_merged.sample(5)

Unnamed: 0,id,date,classique_elo,classique_parties,rapide_elo,rapide_parties,blitz_elo,blitz_parties,nom,titre,pays,naissance,sexe,actif
1543424,7900325,2014-05-01,2153.0,10.0,2108.0,7.0,2192.0,10.0,"Bengherabi, Khalil",FM,ALG,1979.0,M,1
165814,4200284,2020-05-01,2285.0,0.0,,,,,"Liverios, Thisefs",FM,GRE,1949.0,M,0
2443300,5111080,2014-01-01,2390.0,9.0,2257.0,0.0,2242.0,0.0,"Torres Rosas, Luis Carlos",IM,MEX,1995.0,M,1
2093334,2267489,2019-03-01,2097.0,0.0,2063.0,0.0,2075.0,0.0,"Pascual Palomo, Lucia",WFM,ESP,1988.0,F,1
200391,3510557,2014-01-01,2259.0,0.0,,,,,"Boudy Bueno, Julio Leonardo",IM,FID,1951.0,M,0


In [5]:
df_model = df_merged[df_merged['date'] >= '2013-01-01'][df_merged[df_merged['date'] >= '2013-01-01']['classique_elo'].notna()]

In [6]:
df_model.shape[0] / df_rating.shape[0]

0.6952351499332294

In [7]:
df_model.head()

Unnamed: 0,id,date,classique_elo,classique_parties,rapide_elo,rapide_parties,blitz_elo,blitz_parties,nom,titre,pays,naissance,sexe,actif
0,4611870,2021-09-01,2290.0,0.0,,,,,"Keller-Hermann, Edith",WGM,GER,1921.0,F,0
1,4611870,2021-08-01,2290.0,0.0,,,,,"Keller-Hermann, Edith",WGM,GER,1921.0,F,0
2,4611870,2021-07-01,2290.0,0.0,,,,,"Keller-Hermann, Edith",WGM,GER,1921.0,F,0
3,4611870,2021-06-01,2290.0,0.0,,,,,"Keller-Hermann, Edith",WGM,GER,1921.0,F,0
4,4611870,2021-05-01,2290.0,0.0,,,,,"Keller-Hermann, Edith",WGM,GER,1921.0,F,0


Essayons de prédire le sexe d'un joueur suivant son classement, âge 

In [8]:
df_first_attempt = df_model[['id', 'classique_elo', 'classique_parties', 'naissance', 'sexe']]

In [9]:
df_first_attempt['sexe'].value_counts(normalize=True)

M    0.804803
F    0.195197
Name: sexe, dtype: float64

In [10]:
u = df_first_attempt.groupby('id').agg({'classique_elo':np.mean, 'classique_parties':np.mean, 'naissance':np.mean,'sexe':np.max})

In [11]:
z = df_first_attempt.groupby('id').std(ddof=0)[['classique_elo', 'classique_parties']]

In [12]:
z.columns = ['classique_elo_std', 'classique_parties_std']

In [13]:
a = u.merge(z, how='left', on='id').merge(df_players[['id','pays','actif']], how='left', on='id')

In [14]:
a = a[['classique_elo','classique_elo_std', 'classique_parties', 'classique_parties_std', 'naissance', 'sexe', 'pays','actif']]

In [15]:
a = pd.get_dummies(a, columns=['pays'])
a.head()

Unnamed: 0,classique_elo,classique_elo_std,classique_parties,classique_parties_std,naissance,sexe,actif,pays_AFG,pays_AHO,pays_ALB,...,pays_UKR,pays_URU,pays_USA,pays_UZB,pays_VEN,pays_VIE,pays_WLS,pays_YEM,pays_ZAM,pays_ZIM
0,2437.52381,21.231197,5.457143,7.008877,1957.0,M,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2422.0,0.0,0.0,0.0,1947.0,M,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2435.361905,29.320925,2.104762,4.040094,1968.0,M,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2425.0,0.0,0.0,0.0,1958.0,M,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2340.0,0.0,0.0,0.0,1958.0,M,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
df_train, df_test = train_test_split(a, test_size=0.2, random_state=0)

In [18]:
temp = df_train.copy()
temp.pop('sexe')
cols = temp.columns
# cols = ['classique_elo', 'classique_parties']
X_train = df_train[cols]
y_train = df_train['sexe']
X_test = df_test[cols]
y_test = df_test['sexe']


In [19]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [20]:
f1_score(y_test, predictions, average='micro')

0.8790841265757655