In [None]:
import pandas as pd
import numpy as np
import seaborn as sns 

%matplotlib inline

## Loading the data

In [None]:
# File name of the CSV export; a relative path, so it is resolved against the working directory.
data_path = "CrowdstormingDataJuly1st.csv"
# Read the whole file into a DataFrame.
df = pd.read_csv(data_path)

In [None]:
# Preview five randomly chosen rows of the raw data.
df.sample(5)

In [None]:
# Report how many rows the raw dataset contains.
print("Number of entries: %d" % len(df))

Counting the missing values per column shows that many entries have no skin color rating. Those entries aren't going to be useful for us, so we drop them.

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Keep only the rows that carry a rating from both raters, then show how many remain.
has_both_ratings = df["rater1"].notnull() & df["rater2"].notnull()
df = df[has_both_ratings]
print(len(df))

In [None]:
# Collapse the data to one row per player by averaging every numeric column.
# `numeric_only=True` explicitly leaves non-numeric columns out of the mean:
# recent pandas versions raise a TypeError for them instead of silently
# dropping them, which is what `agg(np.mean)` relied on.
df_by_player = df.groupby("playerShort")
df_players = df_by_player.mean(numeric_only=True)
df_players.head()

In [None]:
# Per-player descriptive attributes (height, weight and both skin tone ratings),
# taken as an independent copy so new columns can be added to it later on.
description_columns = ["height", "weight", "rater1", "rater2"]
df_players_description = df_players[description_columns].copy()
df_players_description.sample(5)

## Raters consistency

We suspect that the raters have a certain bias and do not always rate the same player the same way. To check this, we look at the differences between their ratings.

In [None]:
# Summary statistics of the per-player gap between the raters (rater1 minus rater2).
rating_gap = df_players_description["rater1"] - df_players_description["rater2"]
rating_gap.describe()

We see that rater2 rates the skin tone higher than rater1 on average.
We now create a new attribute that is the mean of the scores given by rater1 and rater2.

In [None]:
# Average the two raters' scores into a single rating per player.
rating_sum = df_players_description["rater1"] + df_players_description["rater2"]
df_players_description["rateMean"] = rating_sum / 2

In [None]:
# Preview five random players, now including the rateMean column.
df_players_description.sample(5)

Since a random forest classifier predicts discrete classes, and since we decided to use a binary attribute "darkSkin", we need to choose the boundary between "white" and "black". We arbitrarily chose to consider a mean rating of 0.5 or above as "black".

In [None]:
# Binary target: True when the mean rating is at least 0.5, False otherwise.
is_dark_skin = df_players_description['rateMean'] >= 0.5
df_players_description['darkSkin'] = is_dark_skin
df_players_description.head(10)

In [None]:
# Number of missing values left in each column of the description table.
print(df_players_description.isnull().sum())

In [None]:
# Discard every player that still has a missing value in any column.
df_players_description =  df_players_description.dropna()

## Random forest machine learning

Let's try a simple machine learning task: considering only height and weight, try to obtain the player's skin color. For that we use a random forest classifier.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Predictors: the two physical attributes; target: the binary darkSkin label.
X = df_players_description[['height', 'weight']]
y = df_players_description['darkSkin']
# Forest of 100 trees, fitted on the whole dataset (no held-out data at this stage).
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X,y)

In [None]:
from sklearn import metrics
# Score the classifier on the very data it was fitted on.
y_pred = clf.predict(X)
training_accuracy = metrics.accuracy_score(y, y_pred)
print('The accuracy is {0:.2f}%'.format(training_accuracy*100))

81% accuracy can seem pretty good, but we must remember that we are training and evaluating on the whole dataset, so it doesn't mean much. If we tried to predict on unseen data, the result would likely be poorer, as we are probably overfitting the training set.

In [None]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the helper now
# lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Split the data into 60% of training set, and 40% of test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# Refit on the training part only, then score on the held-out part.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('The accuracy is {0:.2f}%'.format(metrics.accuracy_score(y_test, y_pred)*100))

Training on 60% of the data gives around 70% accuracy, which is still good. Let's try with a 20-fold cross validation.

In [None]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the helper now
# lives in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score

# Fresh, unfitted forest evaluated with 20-fold cross validation on accuracy.
clf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(clf, X, y, cv=20, scoring='accuracy')
# Summary statistics of the 20 per-fold accuracies.
pd.Series(scores).describe()

The median score is 72%, and the standard deviation is not too high. That is a decent score, considering that we are only looking at two features: height and weight. Let's try to add game features and see how the score changes.

In [None]:
# Copy the binary label onto the per-player table. The assignment aligns on the
# index, so players missing from df_players_description get NaN in this column.
df_players["darkSkin"] = df_players_description["darkSkin"]
df_players.head(5)

In [None]:
def random_forest_scores(dataframe, features, target="darkSkin", estimators=10, folds=20, random_state=None):
    """Cross-validate a random forest classifier and report its feature importances.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding both the feature columns and the target column.
    features : list of str
        Names of the columns used as predictors.
    target : str, default "darkSkin"
        Name of the column to predict.
    estimators : int, default 10
        Number of trees in the forest.
    folds : int, default 20
        Number of cross-validation folds.
    random_state : int or None, default None
        Seed forwarded to the classifier and to the train/test split. Leave it
        as None for the previous, unseeded behaviour; pass an integer to get
        repeatable results.

    Returns
    -------
    scores : array
        Accuracy obtained on each cross-validation fold.
    importances : array
        `feature_importances_` of a forest fitted on a 60% training split,
        one value per entry of `features`.
    """
    clf = RandomForestClassifier(n_estimators=estimators, random_state=random_state)
    X = dataframe[features]
    # Read the label from `target`; the column name used to be hard-coded, which
    # made the parameter a no-op. The values are handed over as a plain list
    # (presumably so the label type is inferred from the values themselves
    # rather than from the column's dtype -- verify before changing).
    y = list(dataframe[target].values)

    # Cross validation scores
    scores = cross_val_score(clf, X, y, cv=folds, scoring='accuracy')

    # Feature importances: cross_val_score does not fit `clf` itself, so fit it
    # once on a 60% training split to read the importances.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.4, random_state=random_state)
    clf.fit(X_train, y_train)

    return scores, clf.feature_importances_

In [None]:
# Discard every player that has a missing value in any column (including darkSkin).
df_players = df_players.dropna()

In [None]:
# Candidate predictors: the two physical attributes followed by the game statistics.
possible_features = [
    'height', 'weight',
    'games', 'victories', 'ties', 'defeats', 'goals',
    'yellowCards', 'yellowReds', 'redCards',
]

In [None]:
# Evaluate a forest on all candidate features, using the function's default settings.
scores, importances = random_forest_scores(df_players, possible_features)

In [None]:
# Summary statistics of the per-fold accuracies obtained with all candidate features.
pd.Series(scores).describe()

Adding the game features increases the accuracy by around 3%.

In [None]:
# Rank the features by importance (highest first) and show them as a bar chart.
df_feature_importances = (
    pd.DataFrame({"features": possible_features, "importances": importances})
    .set_index("features")
    .sort_values("importances", ascending=False)
)
df_feature_importances.plot(kind="bar")

In [None]:
# Same candidate features as before, minus "goals".
possible_features_2 = [feature for feature in possible_features if feature != "goals"]

In [None]:
# Re-run the evaluation on the reduced feature list (without "goals").
scores2, importances2 = random_forest_scores(df_players, possible_features_2)

In [None]:
# Per-fold accuracy summary for the reduced feature list.
pd.Series(scores2).describe()

In [None]:
# Per-fold accuracy summary for the full feature list, shown again for comparison.
pd.Series(scores).describe()