# Clustering, Producing Classifier, Calculating Cluster Distance Table

This notebook applies K-Means to cluster the Recipient and Grant-Writer data (the Grant-Writer data is the `lender` table in the code below), calculates the distance between every pair of Recipient and Grant-Writer cluster centroids, and then trains a classifier on each clustered data set.

In [33]:
import warnings
warnings.simplefilter('ignore')

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
import pickle
import math

In [35]:
# Load the recipient table from CSV and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column (apparently the row
# index saved with the CSV). Later cells pass every column of `recep` to
# KMeans and the classifier, so this column is used as a feature -- confirm
# that this is intended.
recep_csv = "kiva_loans.csv//recep_data.csv"
recep = pd.read_csv(recep_csv)
recep.head(5)

Unnamed: 0,loan_amount,Services,Clothing,Transportation,Personal Use,Retail,Health,Food,Education,Manufacturing,Wholesale,Construction,Arts,Agriculture,Entertainment,Housing,male,female
0,150.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,200.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2,225.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
3,250.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,250.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1


In [36]:
# Load the lender table from CSV and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column (apparently the row
# index saved with the CSV). Later cells pass every column of `lender` to
# KMeans and the classifier, so this column is used as a feature -- confirm
# that this is intended.
lender_csv = "kiva_loans.csv//lender_data.csv"
lender = pd.read_csv(lender_csv)
lender.head(5)

Unnamed: 0,loan_amount,Services,Clothing,Transportation,Personal Use,Retail,Health,Food,Education,Manufacturing,Wholesale,Construction,Arts,Agriculture,Entertainment,Housing,male,female
0,675.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,325.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
2,1650.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
3,700.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,325.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [37]:
# Apply k-means to form clusters.
# n_clusters = 10 ** floor(log_1000(len(recep))): it grows 10x each time
# len(recep) grows 1000x (1 cluster below 1,000 rows, 10 below 1,000,000, ...).
n_clusters = 10**int(math.log(len(recep))/math.log(1000))

# Drop any 'label' column left behind by an earlier run of this cell, so that
# re-running it never clusters on its own previous output.
recep_features = recep.drop(columns='label', errors='ignore')
lender_features = lender.drop(columns='label', errors='ignore')

# Fixed seed: the cluster assignment (and every artifact derived from it) is
# reproducible across kernel restarts.
kmeans_recep = KMeans(n_clusters=n_clusters, random_state=0).fit(np.array(recep_features))
kmeans_lender = KMeans(n_clusters=n_clusters, random_state=0).fit(np.array(lender_features))

# Attach each row's cluster id to the source tables for the following cells.
recep['label'] = kmeans_recep.labels_
lender['label'] = kmeans_lender.labels_

# Persist the labels next to the input data. A forward slash works on every
# OS; the previous backslash separator only pointed into the kiva_loans.csv
# directory on Windows.
pd.DataFrame(kmeans_recep.labels_, columns=['labels']).to_csv("kiva_loans.csv/recep_labels.csv", index=False)
pd.DataFrame(kmeans_lender.labels_, columns=['labels']).to_csv("kiva_loans.csv/lender_labels.csv", index=False)

In [38]:
# Calculate the Euclidean distance between every recipient-cluster centroid
# (rows) and every lender-cluster centroid (columns) and save the result as a
# 2d numpy array.

# One centroid per cluster id, computed once per table instead of once per
# (i, j) pair. `groupby('label')` leaves 'label' out of the averaged columns;
# `reindex` keeps row k aligned with cluster id k (a cluster with no rows
# yields a NaN row, as the mean of an empty selection would).
recep_centroids = recep.groupby('label').mean().reindex(range(n_clusters)).to_numpy()
lender_centroids = lender.groupby('label').mean().reindex(range(n_clusters)).to_numpy()

# Broadcast to shape (n_clusters, n_clusters, n_features) and take the norm
# over the feature axis: entry [i, j] is ||recep centroid i - lender centroid j||.
cluster_dist_tab = np.linalg.norm(
    recep_centroids[:, None, :] - lender_centroids[None, :, :], axis=2
)
print(cluster_dist_tab)

# Use a context manager so the file handle is flushed and closed.
with open("kiva_loans.csv//cluster_dist_tab.npy", "wb") as dist_file:
    np.save(dist_file, cluster_dist_tab)

[[4.14495963e+01 1.93719241e+03 2.42141532e+02 4.48757107e+03
  8.92201078e+02 2.89947587e+03 7.98891443e+01 4.87470463e+02
  1.36197596e+03 2.29090448e+03]
 [2.36962351e+03 3.90981693e+02 2.08603301e+03 2.15939722e+03
  1.43597324e+03 5.71301952e+02 2.24828492e+03 1.84070458e+03
  9.66198130e+02 3.72698953e+01]
 [9.36590438e+02 1.04205179e+03 6.52999709e+02 3.59243061e+03
  2.94353687e+00 2.00433536e+03 8.15251864e+02 4.07671340e+02
  4.66835363e+02 1.39576391e+03]
 [4.19076301e+03 2.21212124e+03 3.90717257e+03 3.38257789e+02
  3.25711282e+03 1.24983767e+03 4.06942442e+03 3.66184412e+03
  2.78733770e+03 1.85840911e+03]
 [4.85789205e+02 1.49285368e+03 2.02198610e+02 4.04323244e+03
  4.47862238e+02 2.45513721e+03 3.64450799e+02 4.31308889e+01
  9.17637239e+02 1.84656578e+03]
 [3.04165886e+03 1.06301704e+03 2.75806838e+03 1.48736184e+03
  2.10800862e+03 1.00733606e+02 2.92032026e+03 2.51273994e+03
  1.63823350e+03 7.09304910e+02]
 [1.27206873e+04 1.07420455e+04 1.24370969e+04 8.19166682e

In [39]:
# Create classifier for recipients: a random forest that maps a recipient's
# feature row to the k-means cluster id assigned to it above.
# (The variable and pickle file are named "loaner" but hold the recipient model.)
# Fixed seed so the saved model is reproducible.
clf_loaner = RandomForestClassifier(random_state=0)
clf_loaner.fit(recep.loc[:, recep.columns != 'label'], kmeans_recep.labels_)

# Context manager guarantees the pickle is fully written and the handle closed.
with open("kiva_loans.csv//clf_loaner.p", "wb") as clf_file:
    pickle.dump(clf_loaner, clf_file)

In [40]:
# Create classifier for lenders: a random forest that maps a lender's feature
# row to the k-means cluster id assigned to it above.
# Fixed seed so the saved model is reproducible.
clf_lender = RandomForestClassifier(random_state=0)
clf_lender.fit(lender.loc[:, lender.columns != 'label'], kmeans_lender.labels_)

# Context manager guarantees the pickle is fully written and the handle closed.
with open("kiva_loans.csv//clf_lender.p", "wb") as clf_file:
    pickle.dump(clf_lender, clf_file)