## Intro to Machine Learning
#### Teacher: Carl Shan

In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.cluster import KMeans, DBSCAN
%pylab inline

Populating the interactive namespace from numpy and matplotlib


#### Step 0: Loading and inspecting my data

In [5]:
data = pd.read_csv("student/student-mat.csv", sep=';')

In [8]:
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


#### Step 1: Getting just the subset of the data that is numeric

In [9]:
data.dtypes

school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup     object
famsup        object
paid          object
activities    object
nursery       object
higher        object
internet      object
romantic      object
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
G1             int64
G2             int64
G3             int64
dtype: object

In [10]:
# Selecting just the columns that are not 'objects'. 'Object' columns are categorical colums. We want everything else.
numeric_cols = data.dtypes[data.dtypes != object]

In [11]:
numeric_cols

age           int64
Medu          int64
Fedu          int64
traveltime    int64
studytime     int64
failures      int64
famrel        int64
freetime      int64
goout         int64
Dalc          int64
Walc          int64
health        int64
absences      int64
G1            int64
G2            int64
G3            int64
dtype: object

In [12]:
### Let's convert the numeric_cols variable to a list, which our DataFrame can work with. We can do this by
### accessing the .index attribute of the numeric_cols variable.
numeric_cols_names = numeric_cols.index

In [13]:
numeric_cols_names

Index(['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2',
       'G3'],
      dtype='object')

In [14]:
cluster_data = data[numeric_cols_names] #getting all the numerical data

In [15]:
cluster_data.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,18,4,4,2,2,0,4,3,4,1,1,3,6,5,6,6
1,17,1,1,1,2,0,5,3,3,1,1,3,4,5,5,6
2,15,1,1,1,2,3,4,3,2,2,3,3,10,7,8,10
3,15,4,2,1,3,0,3,2,2,1,1,5,2,15,14,15
4,16,3,3,1,2,0,4,3,2,1,2,5,4,6,10,10


#### Step 2: Creating a K-Means model with K=2, and then fitting it

In [16]:
kmeans_model = KMeans(n_clusters=2, random_state=0)#creates a k-means model with skit-learn's kmeans

In [17]:
fitted_model = kmeans_model.fit(cluster_data)#fits the model to our data

In [18]:
fitted_model.labels_ #these are the preditions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1,

In [19]:
cluster_data['2Means Cluster Predictions'] = fitted_model.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


#### Step 4: Inspecting this K-Means model

In [20]:
### Let's look at the first 10 rows
cluster_data.head(n=10)

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,2Means Cluster Predictions
0,18,4,4,2,2,0,4,3,4,1,1,3,6,5,6,6,0
1,17,1,1,1,2,0,5,3,3,1,1,3,4,5,5,6,0
2,15,1,1,1,2,3,4,3,2,2,3,3,10,7,8,10,0
3,15,4,2,1,3,0,3,2,2,1,1,5,2,15,14,15,0
4,16,3,3,1,2,0,4,3,2,1,2,5,4,6,10,10,0
5,16,4,3,1,2,0,5,4,2,1,2,5,10,15,15,15,0
6,16,2,2,1,2,0,4,4,4,1,1,3,0,12,12,11,0
7,17,4,4,2,2,0,4,1,4,1,1,1,6,6,5,6,0
8,15,3,2,1,2,0,4,2,2,1,1,1,0,16,18,19,0
9,15,3,4,1,2,0,5,5,1,1,1,5,0,14,15,15,0


In [21]:
cluster0 = cluster_data[cluster_data['2Means Cluster Predictions'] == 0] #only getting cluster data where this 'cluster_data['2Means Cluster Predictions'] == 0' is true
cluster1 = cluster_data[cluster_data['2Means Cluster Predictions'] == 1]

In [22]:
print(cluster0['G1'].mean(), cluster1['G1'].mean())
print(cluster0['G2'].mean(), cluster1['G2'].mean())
print(cluster0['G3'].mean(), cluster1['G3'].mean())

11.0 10.4375
10.8398791541 10.0625
10.4833836858 10.0625


In [24]:
#grades don't seem to be a big deciding factor, so run a for loop to find the means for all the columns
for column in cluster_data.columns:
    mean0 = cluster0[column].mean()
    mean1 = cluster1[column].mean()
    print("")
    print("Evaluating column: **{c}**".format(c=column))
    print("        Cluster 0 Mean: {m0:.3f}, Cluster 1: {m1:.3f}".format(m0=mean0, m1=mean1))
    print("        Difference in means = {diff:.3f}".format(diff=abs(mean0 - mean1)))
    print("")
    print("-----------------------------------------------------------------")
    
    


Evaluating column: **age**
        Cluster 0 Mean: 16.577, Cluster 1: 17.312
        Difference in means = 0.735

-----------------------------------------------------------------

Evaluating column: **Medu**
        Cluster 0 Mean: 2.719, Cluster 1: 2.906
        Difference in means = 0.187

-----------------------------------------------------------------

Evaluating column: **Fedu**
        Cluster 0 Mean: 2.514, Cluster 1: 2.562
        Difference in means = 0.049

-----------------------------------------------------------------

Evaluating column: **traveltime**
        Cluster 0 Mean: 1.462, Cluster 1: 1.375
        Difference in means = 0.087

-----------------------------------------------------------------

Evaluating column: **studytime**
        Cluster 0 Mean: 2.076, Cluster 1: 1.828
        Difference in means = 0.247

-----------------------------------------------------------------

Evaluating column: **failures**
        Cluster 0 Mean: 0.290, Cluster 1: 0.562
       

#### Step 3: Replicating the with DB Scan

In [28]:
from sklearn.preprocessing import StandardScaler 

In [31]:
X = StandardScaler().fit_transform(cluster_data) #scale all data, so they are on the same scale -
#just a data processing technique

In [32]:
pd.DataFrame(data=X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,1.023046,1.143856,1.360371,0.792251,-0.042286,-0.449944,0.062194,-0.236010,0.801479,-0.540699,-1.003789,-0.399289,0.036424,-1.782467,-1.254791,-0.964934,-0.439720
1,0.238380,-1.600009,-1.399970,-0.643249,-0.042286,-0.449944,1.178860,-0.236010,-0.097908,-0.540699,-1.003789,-0.399289,-0.213796,-1.782467,-1.520979,-0.964934,-0.439720
2,-1.330954,-1.600009,-1.399970,-0.643249,-0.042286,3.589323,0.062194,-0.236010,-0.997295,0.583385,0.551100,-0.399289,0.536865,-1.179147,-0.722415,-0.090739,-0.439720
3,-1.330954,1.143856,-0.479857,-0.643249,1.150779,-0.449944,-1.054472,-1.238419,-0.997295,-0.540699,-1.003789,1.041070,-0.464016,1.234133,0.874715,1.002004,-0.439720
4,-0.546287,0.229234,0.440257,-0.643249,-0.042286,-0.449944,0.062194,-0.236010,-0.997295,-0.540699,-0.226345,1.041070,-0.213796,-1.480807,-0.190038,-0.090739,-0.439720
5,-0.546287,1.143856,0.440257,-0.643249,-0.042286,-0.449944,1.178860,0.766399,-0.997295,-0.540699,-0.226345,1.041070,0.536865,1.234133,1.140903,1.002004,-0.439720
6,-0.546287,-0.685387,-0.479857,-0.643249,-0.042286,-0.449944,0.062194,0.766399,0.801479,-0.540699,-1.003789,-0.399289,-0.714236,0.329153,0.342338,0.127809,-0.439720
7,0.238380,1.143856,1.360371,0.792251,-0.042286,-0.449944,0.062194,-2.240828,0.801479,-0.540699,-1.003789,-1.839649,0.036424,-1.480807,-1.520979,-0.964934,-0.439720
8,-1.330954,0.229234,-0.479857,-0.643249,-0.042286,-0.449944,0.062194,-1.238419,-0.997295,-0.540699,-1.003789,-1.839649,-0.714236,1.535793,1.939468,1.876199,-0.439720
9,-1.330954,0.229234,1.360371,-0.643249,-0.042286,-0.449944,1.178860,1.768808,-1.896683,-0.540699,-1.003789,1.041070,-0.714236,0.932473,1.140903,1.002004,-0.439720


In [33]:
dbscan_model = DBSCAN(eps=3, min_samples=3) 

In [34]:
fitted_model = dbscan_model.fit(X)

In [37]:
labels = fitted_model.labels_ #getting the predictions, like above with Kmeans

In [38]:
labels

array([ 0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0, -1,  0,  0,  0,  0,  0,  0, -1,  0, -1,  0,  1,  0,  0,  0,  0,
        0, -1,  0,  0,  0,  0,  2,  0,  0,  0,  2,  0,  3,  0,  0,  0,  0,
        0,  0,  0, -1,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1,  0,
        0, -1,  0,  0, -1,  0, -1,  0, -1,  0, -1, -1,  0,  0,  0,  0,  0,
       -1,  0,  0,  2, -1,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  1,  0,
        0, -1,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,
        0,  0,  0,  0, -1,  0, -1,  0, -1,  0, -1, -1, -1,  3, -1, -1, -1,
       -1, -1,  0,  0, -1, -1,  0, -1, -1,  0, -1,  0, -1, -1, -1, -1, -1,
       -1,  0,  0, -1, -1,  0,  0,  0, -1, -1,  0, -1,  2,  0,  0,  0,  0,
       -1,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  3,  0,
        0,  0, -1,  0,  0, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0,  2,
        0, -1, -1, -1,  0,  0,  0,  1,  0,  4, -1,  0, -1,  4,  0,  0,  0,
       -1, -1, -1,  0,  2

#### Step 4: Using sklearn metrics to measure goodness of fit

In [39]:
from sklearn import metrics

In [40]:
# #############################################################################
# Compute DBSCAN Metrics

# Number of clusters in labels, ignoring noise if present.
if -1 in labels:
    n_clusters_ = len(set(labels)) - 1
else:
    n_clusters = len(set(labels))

print('Estimated number of clusters: {n}'.format(n=n_clusters_))
print("Silhouette Coefficient: {:0.3f}".format(metrics.silhouette_score(X, labels)))

Estimated number of clusters: 5
Silhouette Coefficient: 0.007
