In [26]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split

%matplotlib inline

Import the csv file containing all the data

In [2]:
filename = 'CrowdstormingDataJuly1st.csv'
df = pd.read_csv(filename)

In [3]:
df.head()

Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
0,lucas-wilchez,Lucas Wilchez,Real Zaragoza,Spain,31.08.1983,177.0,72.0,Attacking Midfielder,1,0,...,0.5,1,1,GRC,0.326391,712.0,0.000564,0.396,750.0,0.002696
1,john-utaka,John Utaka,Montpellier HSC,France,08.01.1982,179.0,82.0,Right Winger,1,0,...,0.75,2,2,ZMB,0.203375,40.0,0.010875,-0.204082,49.0,0.061504
2,abdon-prats,Abdón Prats,RCD Mallorca,Spain,17.12.1992,181.0,79.0,,1,0,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
3,pablo-mari,Pablo Marí,RCD Mallorca,Spain,31.08.1993,191.0,87.0,Center Back,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002
4,ruben-pena,Rubén Peña,Real Valladolid,Spain,18.07.1991,172.0,70.0,Right Midfielder,1,1,...,,3,3,ESP,0.369894,1785.0,0.000229,0.588297,1897.0,0.001002


We want to, given the description of a player, be able to determine successfully the skin color tone of that player. For that, we use the concept of random forests.

## Some data cleaning

We get rid of the rows were not both raters have given their "skin value".

In [4]:
df['leagueCountry'].unique()

array(['Spain', 'France', 'England', 'Germany'], dtype=object)

In [5]:
_df = df.loc[(df["rater1"].isnull() == False) & (df["rater2"].isnull() == False) & (df["height"].isnull() == False) & (df["weight"].isnull() == False)]

In [27]:
rf = RandomForestClassifier(max_depth=5, n_estimators=10)
rfr = RandomForestRegressor(max_depth=5, n_estimators=10)

Let's prepare our X and y dataset

In [7]:
y_2 = _df[['rater1', 'rater2']]

In [15]:
y_2['raterAvg'] = np.average(y_2, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [17]:
y = y_2['raterAvg']

In [22]:
y

0         0.375
1         0.750
5         0.125
6         0.125
7         1.000
8         0.250
9         0.000
10        0.000
11        0.500
12        0.000
13        0.000
14        0.125
15        0.000
16        0.500
17        0.000
18        0.000
19        0.125
20        0.125
21        0.000
22        0.000
23        0.000
24        0.125
25        0.250
26        0.125
27        0.125
28        0.000
29        0.125
30        0.000
31        0.000
32        0.000
          ...  
145997    0.000
145998    1.000
145999    0.500
146000    0.875
146001    0.250
146002    0.250
146003    0.000
146004    1.000
146005    0.250
146006    0.125
146007    0.125
146008    0.625
146009    0.000
146010    0.250
146011    0.750
146012    0.000
146013    0.125
146014    0.125
146015    1.000
146016    0.000
146017    1.000
146018    0.000
146019    0.125
146020    0.000
146021    0.875
146022    0.500
146023    0.000
146024    0.375
146025    0.250
146027    0.125
Name: raterAvg, dtype: f

In [18]:
X = _df[['height', 'weight', 'yellowCards', 'yellowReds', 'redCards']]

In [19]:
X.head()

Unnamed: 0,height,weight,yellowCards,yellowReds,redCards
0,177.0,72.0,0,0,0
1,179.0,82.0,1,0,0
5,182.0,71.0,0,0,0
6,187.0,80.0,0,0,0
7,180.0,68.0,0,0,0


We split it for creating a training and a testing set

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

In [28]:
rfr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [23]:
X_train.head()

Unnamed: 0,height,weight,yellowCards,yellowReds,redCards
84918,191.0,85.0,0,0,0
5628,179.0,77.0,0,0,0
65406,173.0,72.0,0,0,0
54533,196.0,94.0,0,0,0
64575,161.0,60.0,0,0,0


In [None]:
from sklearn import preprocessing

In [None]:
X_scaled = preprocessing.scale(X_train) # TODO

In [29]:
rfr.feature_importances_

array([  5.25111238e-01,   4.74825473e-01,   6.32888304e-05,
         0.00000000e+00,   0.00000000e+00])

In [30]:
y_pred = rfr.predict(X_test)

In [31]:
rfr.score(X_test, y_test)

0.071547960618241424

In [32]:
y_pred

array([ 0.27769374,  0.31283833,  0.31283833, ...,  0.27769374,
        0.27769374,  0.27769374])

In [33]:
y_test

74652     0.000
22866     0.250
127063    0.250
63338     0.000
59724     0.750
59214     0.250
52598     0.000
119837    0.250
83184     0.000
80899     0.500
49378     0.750
127194    0.000
273       0.250
121315    0.000
26615     0.125
109430    0.500
5119      0.000
95989     0.250
6504      0.375
43831     0.875
110930    0.000
145565    0.000
68663     0.875
36366     0.250
92758     0.000
23393     0.000
34653     0.250
116912    0.250
117780    0.000
4247      0.000
          ...  
96665     0.250
69467     0.375
81973     1.000
97802     0.125
19627     0.500
61614     0.125
136127    0.250
111863    0.250
119006    0.000
138059    0.125
136327    0.000
25475     0.375
1386      0.375
122702    1.000
15815     1.000
21207     0.000
112546    0.000
52989     0.125
24873     1.000
129734    0.250
26302     0.250
26949     0.000
16960     0.000
143482    0.250
13271     0.250
98893     0.250
97074     0.250
104491    0.250
64993     0.000
5882      0.250
Name: raterAvg, dtype: f