In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 

%matplotlib inline

## Loading the data

In [2]:
data_path = "CrowdstormingDataJuly1st.csv"
df = pd.read_csv(data_path)

In [3]:
df.sample(5)

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
103739,laurent-bonnart,Laurent Bonnart,Lille OSC,France,25.12.1979,170.0,62.0,Left Fullback,1,0,...,0.25,2230,72,PRT,0.396803,1079.0,0.000392,0.790366,1121.0,0.001798
87664,louis-saha,Louis Saha,Sunderland AFC,England,08.08.1978,184.0,75.0,Center Forward,18,7,...,1.0,1909,44,ENGL,0.32669,44791.0,1e-05,0.356446,46916.0,3.7e-05
72247,ilir-azemi,Ilir Azemi,SpVgg Greuther Fürth,Germany,21.02.1992,191.0,91.0,Center Forward,1,0,...,0.0,1571,8,DEU,0.336628,7749.0,5.5e-05,0.335967,7974.0,0.000225
12913,natxo-insa,Natxo Insa,Celta Vigo,Spain,09.06.1986,177.0,70.0,Attacking Midfielder,5,1,...,0.5,278,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
54432,joe-bennett,Joe Bennett,Aston Villa,England,28.03.1990,177.0,74.0,,1,0,...,,1092,44,ENGL,0.32669,44791.0,1e-05,0.356446,46916.0,3.7e-05


In [4]:
print("Number of entries: %d" % len(df))

Number of entries: 146028


We can see that there are many players without skin color ratings. Those aren't going to be useful for us

In [5]:
df.isnull().sum()

playerShort          0
player               0
club                 0
leagueCountry        0
birthday             0
height             263
weight            2243
position         17726
games                0
victories            0
ties                 0
defeats              0
goals                0
yellowCards          0
yellowReds           0
redCards             0
photoID          21407
rater1           21407
rater2           21407
refNum               0
refCountry           0
Alpha_3              1
meanIAT            163
nIAT               163
seIAT              163
meanExp            163
nExp               163
seExp              163
dtype: int64

In [6]:
df = df.dropna(subset=["rater1", "rater2"])
print(len(df))

124621


In [7]:
df_by_player = df.groupby("playerShort")
df_players = df_by_player.agg(np.mean)
df_players.head()

Unnamed: 0_level_0,height,weight,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,rater1,rater2,refNum,refCountry,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
aaron-hughes,182.0,71.0,3.939759,1.487952,1.078313,1.373494,0.054217,0.114458,0.0,0.0,0.25,0.0,1612.656627,43.921687,0.346459,19710.156627,0.000652,0.494575,20637.277108,0.004065
aaron-hunt,183.0,73.0,3.393939,1.424242,0.737374,1.232323,0.626263,0.424242,0.0,0.010101,0.0,0.25,1662.515152,25.070707,0.348818,26104.292929,0.000219,0.44922,26864.454545,0.000993
aaron-lennon,165.0,63.0,4.079208,1.980198,0.960396,1.138614,0.306931,0.108911,0.0,0.0,0.25,0.25,1598.871287,42.772277,0.345893,21234.861386,0.000367,0.491482,22238.742574,0.002032
aaron-ramsey,178.0,76.0,2.5,1.442308,0.403846,0.653846,0.375,0.298077,0.0,0.009615,0.0,0.0,1668.5,45.067308,0.346821,38285.826923,0.003334,0.514693,39719.980769,0.013522
abdelhamid-el-kaoutari,180.0,73.0,3.351351,1.108108,1.081081,1.162162,0.027027,0.216216,0.108108,0.054054,0.25,0.25,1610.891892,17.189189,0.3316,2832.351351,0.001488,0.335587,2953.837838,0.005296


In [8]:
# Create a df with player constant description attributes
df_players_description = df_players[["height", "weight", "rater1", "rater2"]]
df_players_description.sample(5)

Unnamed: 0_level_0,height,weight,rater1,rater2
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
lukas-rupp,178.0,73.0,0.25,0.25
ron-robert-zieler,188.0,83.0,0.25,0.0
jannik-loehden,201.0,97.0,0.0,0.0
anderson_7,176.0,69.0,0.75,0.75
punal,179.0,73.0,0.25,0.25


## Raters consistency

We suspect that the raters have a certain bias and do not always rate the same player the same way. We look at the differences

In [9]:
(df_players_description["rater1"] - df_players_description["rater2"]).describe()

count    1585.000000
mean       -0.041798
std         0.115943
min        -0.500000
25%         0.000000
50%         0.000000
75%         0.000000
max         0.250000
dtype: float64

We see that rater2 rates the skintone higher than rater1 on average. 
We now make a new attribute that is the mean of rater1 and rater2's scores. 

In [35]:
df_players_description["rateMean"] = (df_players_description["rater1"] + df_players_description["rater2"]) / 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [139]:
df_players_description.sample(5)

Unnamed: 0_level_0,height,weight,rater1,rater2,rateMean,darkSkin
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
jens-grahl,193.0,86.0,0.25,0.25,0.25,False
sergio-ramos,183.0,75.0,0.0,0.0,0.0,False
fabrice-begeorgi,177.0,74.0,0.0,0.0,0.0,False
kossi-agassa,190.0,83.0,1.0,1.0,1.0,True
thomas-mueller,186.0,74.0,0.0,0.25,0.125,False


In [155]:
# We choose >= blabla comment 
df_players_description['darkSkin'] = df_players_description['rateMean']  >= 0.5
df_players_description.head(10)

Unnamed: 0_level_0,height,weight,rater1,rater2,rateMean,darkSkin
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aaron-hughes,182.0,71.0,0.25,0.0,0.125,False
aaron-hunt,183.0,73.0,0.0,0.25,0.125,False
aaron-lennon,165.0,63.0,0.25,0.25,0.25,False
aaron-ramsey,178.0,76.0,0.0,0.0,0.0,False
abdelhamid-el-kaoutari,180.0,73.0,0.25,0.25,0.25,False
abdou-traore_2,180.0,74.0,0.75,0.75,0.75,True
abdoulaye-diallo_2,189.0,80.0,0.75,1.0,0.875,True
abdoulaye-keita_2,188.0,83.0,0.75,1.0,0.875,True
abdoulwhaid-sissoko,180.0,68.0,1.0,1.0,1.0,True
abdul-rahman-baba,179.0,70.0,0.75,1.0,0.875,True


In [156]:
from sklearn.ensemble import RandomForestClassifier
df_players_description =  df_players_description.dropna()
X = df_players_description[['height', 'weight']]
y = df_players_description['darkSkin']
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [157]:
from sklearn import metrics
y_pred = clf.predict(X)
print('The accuracy is {0:.2f}%'.format(metrics.accuracy_score(y, y_pred)*100))

The accuracy is 81.07%


  81% accuracy can seem pretty good, but we must remember that we are training on the whole dataset so it doesn't mean much.

In [158]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('The accuracy is {0:.2f}%'.format(metrics.accuracy_score(y_test, y_pred)*100))

The accuracy is 68.85%
