# Clustering, Producing Classifier, Calculating Cluster Distance Table

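This notebook applies K-Means to cluster the Recipient and Grant-Writer data (the Grant-Writer data is held in the `lender` variables in the code), calculates the distance between every pair of Recipient and Grant-Writer cluster centroids, and then trains classifiers on the clustered data. Note that the classifier-training cells at the end are currently commented out.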

In [1]:
import warnings
# Suppress every warning for the rest of the session so cell output stays compact.
# NOTE(review): this blanket filter also hides warnings emitted by pandas and
# scikit-learn in the cells below — consider narrowing it to specific categories.
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
import pickle
import math

In [3]:
# Load the recipient table from CSV and preview its first five rows.
recep_csv_path = "kiva_loans.csv//recep_data.csv"
recep = pd.read_csv(recep_csv_path)
recep.head(n=5)

Unnamed: 0,loan_amount,Services,Clothing,Transportation,Personal Use,Retail,Health,Food,Education,Manufacturing,Wholesale,Construction,Arts,Agriculture,Entertainment,Housing,male,female
0,150.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,200.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2,225.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
3,250.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,250.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1


In [4]:
# Load the lender table from CSV and preview its first five rows.
lender_csv_path = "kiva_loans.csv//lender_data.csv"
lender = pd.read_csv(lender_csv_path)
lender.head(n=5)

Unnamed: 0,loan_amount,Services,Clothing,Transportation,Personal Use,Retail,Health,Food,Education,Manufacturing,Wholesale,Construction,Arts,Agriculture,Entertainment,Housing,male,female
0,675.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,325.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
2,1650.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
3,700.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,325.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [5]:
# Cluster recipients and lenders with K-Means, pickle the fitted models, and
# record each row's cluster label (on the frames and as CSV files).
#
# Number of clusters: 10 ** floor(log_1000(len(recep))), i.e. n_clusters grows
# 10x each time len(recep) grows 1000x (1_000 rows -> 10, 1_000_000 rows -> 100).
# The same count is used for both the recipient and the lender model.
if len(recep) == 0:
    # math.log(0) would otherwise fail with an opaque "math domain error".
    raise ValueError("recep is empty: cannot derive n_clusters")
n_clusters = 10**int(math.log(len(recep))/math.log(1000))

# Fixed seed so the cluster numbering, and every artefact derived from it
# (pickled models, label CSVs, distance table), is reproducible across runs.
kmeans_seed = 0

# Fit on every column except 'label'. On a fresh kernel that is the whole
# frame; excluding 'label' keeps this cell idempotent, because the assignments
# further down add that column and a re-run would otherwise cluster on the
# previous run's labels.
# NOTE(review): the features still include the 'Unnamed: 0' column shown by
# recep.head() / lender.head(), which looks like a saved row index — confirm it
# is really meant to be a clustering feature.
recep_features = np.array(recep.loc[:, recep.columns != 'label'])
lender_features = np.array(lender.loc[:, lender.columns != 'label'])

# Context managers make sure the pickle files are flushed and closed.
kmeans_recep = KMeans(n_clusters=n_clusters, random_state=kmeans_seed).fit(recep_features)
with open("kiva_loans.csv//kmeans_recep.p", "wb") as model_file:
    pickle.dump(kmeans_recep, model_file)

kmeans_lender = KMeans(n_clusters=n_clusters, random_state=kmeans_seed).fit(lender_features)
with open("kiva_loans.csv//kmeans_lender.p", "wb") as model_file:
    pickle.dump(kmeans_lender, model_file)

# Attach the cluster id of every row to its frame (used by the distance cell).
recep['label'] = kmeans_recep.labels_
lender['label'] = kmeans_lender.labels_

# Same "kiva_loans.csv//" prefix as every other path in this notebook; the
# previous backslash separator only resolved to that folder on Windows.
pd.DataFrame(kmeans_recep.labels_, columns=['labels']).to_csv("kiva_loans.csv//recep_labels.csv", index=False)
pd.DataFrame(kmeans_lender.labels_, columns=['labels']).to_csv("kiva_loans.csv//lender_labels.csv", index=False)

In [6]:
# Build the recipient-vs-lender cluster distance table and save it as .npy.
# cluster_dist_tab[i, j] is the Euclidean distance between the centroid of
# recipient cluster i and the centroid of lender cluster j, where a centroid is
# the column-wise mean of the cluster's rows ('label' itself excluded).

# A centroid depends only on its own cluster, so compute each one once instead
# of re-filtering both frames inside a double loop. groupby() drops the 'label'
# key from the averaged columns; reindex() keeps row k == cluster k and leaves a
# NaN row for any cluster id that has no members.
recep_centroids = recep.groupby('label').mean().reindex(range(n_clusters)).to_numpy()
lender_centroids = lender.groupby('label').mean().reindex(range(n_clusters)).to_numpy()

# Broadcast to shape (n_clusters, n_clusters, n_features) and take the norm over
# the feature axis to get all pairwise distances at once.
centroid_deltas = recep_centroids[:, np.newaxis, :] - lender_centroids[np.newaxis, :, :]
cluster_dist_tab = np.linalg.norm(centroid_deltas, axis=2)
print(cluster_dist_tab)

# Context manager makes sure the output file is flushed and closed.
with open("kiva_loans.csv//cluster_dist_tab.npy", "wb") as dist_file:
    np.save(dist_file, cluster_dist_tab)

[[9.37883792e+02 9.93330207e+02 6.70954501e+02 2.00304201e+03
  4.08964694e+02 4.59023428e+02 3.59113726e+03 8.24301013e+02
  1.37030389e+03 4.23579295e+00]
 [4.14495963e+01 1.88976419e+03 2.25480019e+02 2.89947587e+03
  4.87470463e+02 1.35545738e+03 4.48757107e+03 7.21333525e+01
  2.26673782e+03 8.92201078e+02]
 [2.59838129e+03 6.67167699e+02 2.33145221e+03 3.42544176e+02
  2.06946237e+03 1.20147451e+03 1.93063943e+03 2.48479848e+03
  2.90194014e+02 1.66473103e+03]
 [2.12016043e+03 1.88946842e+02 1.85323132e+03 8.20765052e+02
  1.59124149e+03 7.23253617e+02 2.40886032e+03 2.00657762e+03
  1.88026973e+02 1.18651013e+03]
 [4.85789205e+02 1.44542545e+03 2.18860066e+02 2.45513721e+03
  4.31308889e+01 9.11118658e+02 4.04323244e+03 3.72206583e+02
  1.82239912e+03 4.47862238e+02]
 [4.29523270e+03 2.36401917e+03 4.02830368e+03 1.35430737e+03
  3.76631382e+03 2.89832599e+03 2.33788159e+02 4.18164990e+03
  1.98704547e+03 3.36158252e+03]
 [1.27206873e+04 1.07894738e+04 1.24537583e+04 9.77976201e

In [7]:
# NOTE(review): every line of this cell is commented out, so no recipient
# classifier is trained or pickled when the notebook runs — re-enable the cell
# or remove it.
# # Create classifier for recipients
# clf_loaner = RandomForestClassifier()
# clf_loaner.fit(recep.loc[:, recep.columns != 'label'], kmeans_recep.labels_)
# pickle.dump(clf_loaner, open("kiva_loans.csv//clf_loaner.p", "wb"))

In [8]:
# NOTE(review): every line of this cell is commented out, so no lender
# classifier is trained or pickled when the notebook runs — re-enable the cell
# or remove it.
# # create classifier for lenders
# clf_lender = RandomForestClassifier()
# clf_lender.fit(lender.loc[:, lender.columns != 'label'], kmeans_lender.labels_)
# pickle.dump(clf_lender, open("kiva_loans.csv//clf_lender.p", "wb"))