In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Location of the input table, relative to the notebook's working directory.
STUDENT_DATA_CSV = "./data/Indian_Student_Data.csv"

# One row per country: student count, share of the total, coordinates and
# country code (see the preview below).
df = pd.read_csv(STUDENT_DATA_CSV)
df.head()

Unnamed: 0,country,no_of_indian_students,percentage,latitude,longitude,code
0,United States of America,165918,37.134985,39.78373,-100.445882,USA
1,Australia,66886,14.970109,-24.776109,134.755,AUS
2,Canada,50000,11.190764,61.066692,-107.991707,CAN
3,New Zealand,32000,7.162089,-41.500083,172.834408,NZL
4,Bahrain,27000,6.043013,35.207801,72.547397,BHR


In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

In [3]:
# Group countries by geographic position and share of Indian students.
# k-means++ seeding is randomized, so without a fixed seed the cluster labels
# (and therefore the map colours further down) change on every kernel restart.
# random_state pins the seeding; n_init is spelled out because its default
# differs between scikit-learn releases.
kmeans = KMeans(n_clusters=7, init='k-means++', n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(df[['latitude', 'longitude', 'percentage']])
y_kmeans

array([5, 6, 5, 6, 3, 3, 5, 1, 2, 2, 3, 1, 3, 3, 1, 3, 1, 1, 1, 2, 4, 3,
       1, 2, 2, 1, 2, 2, 1, 1, 4, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1,
       5, 3, 1, 3, 4, 1, 0, 2, 2, 1, 3, 1, 1, 1, 5, 1, 1, 5, 0, 2, 1])

In [4]:
# Number of iterations the fitted estimator reports for its run.
iterations_run = kmeans.n_iter_
print("Clusters found in {0} iterations".format(iterations_run))

Clusters found in 2 iterations


In [5]:
# Score the clustering on the same three columns the model was fitted on.
clustered_features = df[['latitude', 'longitude', 'percentage']]
silhouette_avg = silhouette_score(X=clustered_features, labels=y_kmeans)
print(silhouette_avg)

0.5199857921316744


In [6]:
# Attach each row's cluster label as a 'cluster' column.
# assign() writes the labels positionally and overwrites an existing 'cluster'
# column, so re-running this cell is safe. The previous pd.concat(axis=1)
# appended a duplicate 'cluster' column on every re-run (making df['cluster']
# a DataFrame) and aligned on index labels rather than row position.
df = df.assign(cluster=y_kmeans)
df.head()

Unnamed: 0,country,no_of_indian_students,percentage,latitude,longitude,code,cluster
0,United States of America,165918,37.134985,39.78373,-100.445882,USA,5
1,Australia,66886,14.970109,-24.776109,134.755,AUS,6
2,Canada,50000,11.190764,61.066692,-107.991707,CAN,5
3,New Zealand,32000,7.162089,-41.500083,172.834408,NZL,6
4,Bahrain,27000,6.043013,35.207801,72.547397,BHR,3


In [7]:
import plotly.plotly as py
import plotly
import pandas as pd
import os

# Credentials are read from the environment instead of being committed with the
# notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before starting the kernel;
# a missing variable raises KeyError here rather than failing later on upload.
# SECURITY: an API key was previously hardcoded in this cell -- treat it as
# leaked and regenerate it in the Plotly account settings.
plotly.tools.set_credentials_file(
    username=os.environ['PLOTLY_USERNAME'],
    api_key=os.environ['PLOTLY_API_KEY'],
)

# One colour per cluster label, in label order (index 0 -> cluster 0, ...).
cluster_colors = ["rgb(255,0 , 0)", "rgb(0, 255, 0)", "rgb(0, 0, 255)",
                  "rgb(255, 255, 0)", "rgb(255, 0, 255)", "rgb(0, 255, 255)",
                  "rgb(127, 255, 255)"]
max_cluster = len(cluster_colors) - 1

# Plotly colorscale stops must be normalized to [0, 1]; the raw labels 0..6 are
# not valid stops. With zmin/zmax pinned to the label range, label k lands
# exactly on stop k / max_cluster, i.e. on its own colour, even if some label
# is absent from the data. float() keeps the division exact on Python 2 too.
cluster_colorscale = [[label / float(max_cluster), color]
                      for label, color in enumerate(cluster_colors)]

data = [dict(
        type = 'choropleth',
        locations = df['code'],
        z = df['cluster'],
        zmin = 0,
        zmax = max_cluster,
        text = df['country'],
        autocolorscale = False,
        colorscale = cluster_colorscale,
        marker = dict(
            line = dict (
                color = 'rgb(180,180,180)',
                width = 1) ),
        colorbar = dict(
            # Replaces the deprecated `autotick = False`: one tick per label.
            tickmode = 'linear',
            tick0 = 0,
            dtick = 1,
            tickprefix = 'Cluster Number ',
            title = 'Cluster Color')
      )]

layout = dict(
    title = 'Indian Student Distribution',
    geo = dict(
        showframe = True,
        showcoastlines = True
    )
)

# Uploads the figure to the Plotly account configured above and embeds it.
fig = dict( data=data, layout=layout )
py.iplot( fig, validate=False, filename='Indian Student Distribution Around the World' )