# Predicting Geographic Clusters using Population

### Importing Libraries

In [None]:
import numpy as np
import pandas as pd

#import Kmeans algorithm from sklearn
from sklearn.cluster import KMeans

#import scatter plot from adspy_shared_utilities
from adspy_shared_utilities import plot_labelled_scatter

### Reading Population Data Set

In [None]:
# Load the population data set; the first CSV row (header=0) supplies the column names.
DATA_FILE = 'clustering.csv'
cl = pd.read_csv(DATA_FILE, header=0)

# Preview the first few rows to sanity-check the load.
cl.head()

### Applying K-means clustering (using only one variable)

In [None]:
# Single feature: the total population estimate column.
X = cl["Total Pop. Est. - Jul-1-2001"]
# State names, kept alongside the feature as row identifiers.
y = cl['State']

# KMeans needs a 2-D array, so turn the 1-D column into shape (n_rows, 1).
X_data = X.values[:, np.newaxis]

# fit() returns the fitted estimator, so construction and fitting can be chained.
kmeans = KMeans(n_clusters=4).fit(X_data)

# Cluster centres, in the original population units.
kmeans.cluster_centers_

### Normalization (Scaling) of Variables 

In [None]:
#Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

#Columns used as clustering features
feature_columns = [
    "Total Pop. Est. - Jul-1-2001",
    "Net Domestic Mig.",
    "Federal/Civilian move from abroad",
    "Net Int. Migration",
    "Period Births",
    "Period Deaths",
    "< 65 Pop. Est.",
    "> 65 Pop. Est.",
]

#Feature matrix as a NumPy array.
#DataFrame.as_matrix() was removed in pandas 1.0; .values gives the same array
#and works on both old and new pandas versions.
X = cl[feature_columns].values
#State names: row identifiers only, not an input to the clustering
y = cl['State']

#Rescale every feature column with min-max scaling so that columns with large
#magnitudes do not dominate the distance computation in k-means
X_scaled = scaler.fit_transform(X)
X_scaled

### Applying K-means clustering with k = 4

In [None]:
#Number of clusters to fit
n_clusters = 4

#Fit k-means on the scaled feature matrix (fit() returns the fitted estimator)
kmeans = KMeans(n_clusters=n_clusters).fit(X_scaled)

#Legend labels 'Cluster 1' ... 'Cluster 4', one per cluster
cluster_names = ['Cluster %d' % k for k in range(1, n_clusters + 1)]

#Plot the unscaled feature matrix X together with the fitted cluster labels
plot_labelled_scatter(X, kmeans.labels_, cluster_names)

#Cluster centres; these are in the scaled feature space because the fit used X_scaled
kmeans.cluster_centers_

In [None]:
# Start the result table from the state names.
States = pd.DataFrame(y)

# Single-column frame holding the total population estimate.
Population = pd.DataFrame(cl["Total Pop. Est. - Jul-1-2001"])
# KMeans labels start at 0; add 1 so clusters are numbered from 1.
Clusters = pd.Series(kmeans.labels_ + 1)

# Final column order: State, Population, Cluster.
States.insert(1, 'Population', Population)
States.insert(2, 'Cluster', Clusters)
States

### Output data to CSV

In [None]:
# Write the State / Population / Cluster table to 'output.csv' (a relative path).
# No `index` argument is passed, so pandas' default applies and the row index
# is written as the first, unnamed column.
States.to_csv('output.csv')