In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AffinityPropagation
from sklearn.datasets.samples_generator import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.cluster import MeanShift, estimate_bandwidth
from itertools import cycle
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize
import random
from sklearn.cross_validation import train_test_split
from scipy.spatial.distance import cdist

# Notebook display limits so long/wide frames are rendered without truncation.
# The former 'display.height' option is no longer registered in current pandas
# (setting it raises OptionError); 'display.max_rows' is the option that
# controls how many rows are shown, so only that one is set here.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

Data Source : https://github.com/llimllib/bostonmarathon/blob/master/results/2013/results.csv

In [None]:
# Load the 2013 Boston Marathon results from a CSV in the working directory.
raw_data = pd.read_csv("bm_results_2013.csv")
# `data` is a second name for the same DataFrame object (no copy is made here);
# the cells below rebind `data` to new frames as it is cleaned.
data=raw_data

In [None]:
data.describe()
# Several columns that look numeric are stored as text rather than as numbers,
# so they need to be converted to a numeric dtype before modelling.

In [None]:
def _to_numeric_where_possible(column):
    """Return `column` as numeric when at least one of its values parses as a number.

    Columns that are already numeric are returned unchanged. Other columns are
    coerced with ``pd.to_numeric(errors="coerce")``, so unparseable entries
    become NaN. If nothing in the column parses (the coerced result is all
    NaN), the original column is returned so pure-text columns are preserved.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    converted = pd.to_numeric(column, errors="coerce")
    return converted if converted.notna().any() else column


# DataFrame.convert_objects was removed from pandas; this is the per-column
# replacement for convert_objects(convert_numeric=True).
data = data.apply(_to_numeric_where_possible)

In [None]:
# Check column dtypes and non-null counts after the numeric conversion.
data.info()

In [None]:
# Preview the first five rows.
data.head(5)

In [None]:
# Remove columns that are not used as clustering features:
#   - ctz (citizenship), state and bib have a lot of missing values;
#   - name, city and country are dropped as well: dummy-encoding city and
#     country produced over 16,000 features;
#   - division is dropped too.
data = data.drop(
    ["ctz", "state", "bib", "name", "city", "country", "division"],
    axis="columns",
)

In [None]:
# Binary-encode gender: "M" becomes 0 and every other value becomes 1.
is_male = data["gender"] == "M"
data["gender"] = np.where(is_male, 0, 1)

In [None]:
# Preview the frame after dropping columns and encoding gender.
data.head()

In [None]:
# Re-check dtypes and non-null counts on the reduced column set.
data.info()

In [None]:
# Row count before rows with missing values are dropped.
len(data)

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how="any")



In [None]:
# Row count after rows with missing values were dropped.
len(data)

In [None]:
# Plot histograms of the remaining columns to inspect their distributions.
data.hist(figsize=(10,10))
plt.show()

In [None]:
# Optional: drop the intermediate split-time columns. Currently disabled, so the splits stay in the feature set.
#data = data.drop(["10k","20k","25k","30k","35k","5k"],axis=1)

In [None]:
# Cluster on ~90% of the rows and hold out ~10%.
# train_test_split returns (train, test) in that order, so the names are
# unpacked in that order and the held-out share is given as test_size.
# A fixed random_state makes the split reproducible across kernel restarts.
train, test = train_test_split(data, test_size=.1, random_state=23)

# Work on an explicit copy: cluster-label columns are added to `train` in
# later cells, and assigning into a split result can raise
# SettingWithCopyWarning.
train = train.copy()

# normalize() with its defaults rescales each ROW (sample) to unit L2 norm.
# NOTE(review): this is per-sample scaling, not per-feature standardisation;
# confirm this is the intended preprocessing for clustering.
norm_train=normalize(train)

In [None]:
# Inspect the normalized feature array.
norm_train

In [None]:
# Descriptive statistics of the (un-normalized) training frame.
train.describe()

In [None]:
from sklearn.cluster import MeanShift, estimate_bandwidth

# Derive the kernel bandwidth from the data itself: estimate_bandwidth inspects
# the distances among points in the normalized training array.
bandwidth = estimate_bandwidth(norm_train, quantile=0.2, n_samples=15000)

# Build the mean-shift model with that bandwidth and fit it in one step
# (fit() returns the fitted estimator).
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(norm_train)

# Per-point cluster assignments and the coordinates of the cluster centers.
labels = ms.labels_
cluster_centers = ms.cluster_centers_

# Number of distinct clusters found.
n_clusters_ = np.unique(labels).size

print("Number of estimated clusters: {}".format(n_clusters_))

In [None]:
# `ms` has already been fitted on norm_train, so reuse its labels instead of
# calling fit_predict(), which would refit the (expensive) mean-shift model on
# the same data only to return the same assignments.
mean_shift_cluster = ms.labels_
# NOTE: this adds a cluster-label column to `train`; `train` is no longer a
# pure feature matrix after this cell.
train["mean_shift"] = mean_shift_cluster

In [None]:
# Preview the first rows rather than rendering the whole training frame, which
# bloats the notebook output.
train.head()

In [None]:
# Number of runners assigned to each mean-shift cluster.
train.mean_shift.value_counts()

### K-Means

In [None]:
def k_means_model(k,version):
    """Cluster the normalized training data with K-means and plot the clusters.

    Fits ``KMeans(n_clusters=k, random_state=23)`` on the notebook-level
    ``norm_train`` array, stores the resulting labels in the notebook-level
    ``train`` frame under the column named ``version``, and then draws box
    plots of "age" and "overall" plus histograms of "gender", each grouped by
    that column.

    Parameters
    ----------
    k : int
        Number of clusters passed to KMeans.
    version : str
        Name of the column added to (or overwritten in) ``train``; also used
        as the grouping key for the plots.

    Returns
    -------
    None. The function is used for its side effects (the new column and the
    rendered figures).
    """
    clusterer = KMeans(n_clusters=k, random_state=23)
    train[version] = clusterer.fit_predict(norm_train)

    # One box plot per numeric feature of interest, grouped by cluster label.
    for feature in ("age", "overall"):
        train.boxplot(column=feature, by=version, figsize=(5, 7))

    train.hist(column="gender", by=version)
    plt.show()

    
    

In [None]:
# Fit and plot K-means with k=2; labels are stored in train["Number of k:2"].
k_means_model(2,"Number of k:2")

In [None]:
# Fit and plot K-means with k=3; labels are stored in train["Number of k:3"].
k_means_model(3,"Number of k:3")

Using the elbow method, a K of 3 was chosen for the primary K-means model. This model created three clusters divided most strongly by overall performance, and secondarily by age and gender. Cluster 2 had the top-performing runners, cluster 1 had the middle group, and cluster 0 had the bottom performers. Gender had a strong impact on which cluster a runner was assigned to: clusters 0 and 1 (described as the top and bottom performers) were mostly male, while cluster 3 (described as the bottom group) had a much more balanced distribution. **TODO:** these gender labels conflict with the ranking above, and a K=3 model only yields cluster labels 0–2; confirm the cluster numbers against the plots. Age also had an impact in dividing up the runners: the top-performing cluster had the lowest median age, followed by the middle cluster, with the bottom cluster (cluster 0) having the oldest runners.

In [None]:
# Fit and plot K-means with k=4; labels are stored in train["Number of k:4"].
k_means_model(4,"Number of k:4")

In [None]:
# Fit and plot K-means with k=5; labels are stored in train["Number of k:5"].
k_means_model(5,"Number of k:5")

In [None]:
# K-means with 4 clusters: label every training row, then pull out one
# sub-frame per cluster label (0..3).
k_4_pred = KMeans(n_clusters=4, random_state=23).fit_predict(norm_train)
train["k_means_4"] = k_4_pred
k_4_1, k_4_2, k_4_3, k_4_4 = (
    train.loc[train["k_means_4"] == cluster_id] for cluster_id in range(4)
)

In [None]:
# K-means with 3 clusters: label every training row, then pull out one
# sub-frame per cluster label (0..2).
k_3_pred = KMeans(n_clusters=3, random_state=23).fit_predict(norm_train)
train["k_means_3"] = k_3_pred
k_3_1, k_3_2, k_3_3 = (
    train.loc[train["k_means_3"] == cluster_id] for cluster_id in range(3)
)

In [None]:
# Re-cluster each of the three K=3 sub-frames and report its silhouette score.
# `train` has had cluster-label columns appended by earlier cells, so restrict
# each sample to the original feature columns (those of `data`); otherwise the
# labels themselves would be used as clustering features.
# NOTE(review): the samples are clustered on raw (un-normalized) values, unlike
# the models fitted on norm_train above; confirm this is intended.
feature_columns = list(data.columns)
for sample in [k_3_1, k_3_2, k_3_3]:
    sample_features = sample[feature_columns]
    model = KMeans(n_clusters=3, random_state=42).fit(sample_features)
    labels = model.labels_
    print(metrics.silhouette_score(sample_features, labels, metric='euclidean'))

In [None]:
# K Means with 4 Clusters
# Fit the 4-cluster model (this cell previously refit the 3-cluster model and
# overwrote "k_means_3" by copy-paste), then rebuild the per-cluster sub-frames
# from the "k_means_4" labels.
k_4_pred = KMeans(n_clusters=4,random_state=23).fit_predict(norm_train)
train["k_means_4"] = k_4_pred
k_4_1 = train.loc[train["k_means_4"] == 0]
k_4_2 = train.loc[train["k_means_4"] == 1]
k_4_3 = train.loc[train["k_means_4"] == 2]
k_4_4 = train.loc[train["k_means_4"] == 3]

In [None]:
# Re-cluster each of the four K=4 sub-frames and report its silhouette score.
# `train` has had cluster-label columns appended by earlier cells, so restrict
# each sample to the original feature columns (those of `data`); otherwise the
# labels themselves would be used as clustering features.
# NOTE(review): each sub-frame is split into 3 clusters on raw (un-normalized)
# values; confirm both choices are intended.
feature_columns = list(data.columns)
for sample in [k_4_1, k_4_2, k_4_3,k_4_4]:
    sample_features = sample[feature_columns]
    model = KMeans(n_clusters=3, random_state=42).fit(sample_features)
    labels = model.labels_
    print(metrics.silhouette_score(sample_features, labels, metric='euclidean'))

In [None]:
# Elbow method: mean distance from each point to its nearest centroid, for
# k = 1..9. The model is fitted once on the normalized features, and the
# distortion is measured on those same features. The previous version refit
# on the raw `train` frame (which also carries cluster-label columns) and
# measured distances there, which is inconsistent with every other model in
# this notebook.
distortions = []
K = range(1,10)
for k in K:
    kmeanModel = KMeans(n_clusters=k, random_state=23).fit(norm_train)
    nearest_centroid_dist = np.min(
        cdist(norm_train, kmeanModel.cluster_centers_, 'euclidean'), axis=1
    )
    distortions.append(nearest_centroid_dist.sum() / norm_train.shape[0])

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Elbow curve based on K-means inertia (within-cluster sum of squares) for
# k = 1..9. A fixed random_state keeps the curve reproducible across runs and
# matches the seed used by the other K-means models fitted on norm_train.
distorsions = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, random_state=23)
    kmeans.fit(norm_train)
    distorsions.append(kmeans.inertia_)

plt.plot(range(1, 10), distorsions)
plt.grid(True)
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('Elbow curve')
plt.show()

Using the elbow method, a K of 3 was chosen for the primary K-means model. This model created three clusters divided most strongly by overall performance, and secondarily by age and gender. Cluster 2 had the top-performing runners, cluster 1 had the middle group, and cluster 0 had the bottom performers.