## Clustering IDA Loans 

#### Importing the necessary dependencies 

In [110]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
np.set_printoptions(suppress=True)
pd.set_option('display.max_columns', 40)
pd.set_option('display.float_format', lambda x: '%.3f' % x)


### Loading the Data

In [111]:
df = pd.read_csv('ida_fe.csv')

In [112]:
df.dtypes


End of Period                    object
Credit Number                    object
Region                           object
Country Code                     object
Country                          object
Borrower                         object
Credit Status                    object
Service Charge Rate             float64
Currency of Commitment           object
Project Name                     object
Original Principal Amount       float64
Cancelled Amount                float64
Undisbursed Amount              float64
Disbursed Amount                float64
Repaid to IDA                   float64
Due to IDA                      float64
Exchange Adjustment               int64
Borrower's Obligation           float64
Sold 3rd Party                  float64
Repaid 3rd Party                float64
Due 3rd Party                   float64
Credits Held                    float64
First Repayment Date             object
Last Repayment Date              object
Agreement Signing Date           object


In [113]:
fig = px.scatter(df, x = df['Board Approval Date'], y=df['Original Principal Amount'], color="Credit Status" )
fig.show()

In [114]:
df['Agreement Signing Date'] = pd.to_datetime(df['Agreement Signing Date'])

In [115]:
X_cluster = df[[                
'Original Principal Amount']]

In [116]:
scaler= StandardScaler()

In [117]:
X_cluster_trans = scaler.fit_transform (X_cluster)

In [118]:
kmeans = KMeans(n_clusters=3, random_state=0)

In [119]:
kmeans.fit(X_cluster)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [120]:
labels = kmeans.labels_

In [121]:
centers = kmeans.cluster_centers_
centers

array([[6.23534013e+07],
       [3.39650096e+08],
       [9.75786803e+08]])

In [122]:
centers_inversed = scaler.inverse_transform(centers)
centers_inversed

array([[1.18310185e+16],
       [6.44456668e+16],
       [1.85147102e+17]])

In [123]:
df['Cluster'] = labels

In [127]:
fig = px.scatter(df, x = df['Board Approval Date'], y=df['Original Principal Amount'], color="Cluster" )

fig.show()


### PCA and k-means clustering on dataset with Plotly

Ref : https://plotly.com/python/v3/ipython-notebooks/baltimore-vital-signs/

In [128]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [129]:
df['Service Charge Rate'] = pd.to_numeric(df['Service Charge Rate'])

df_clus = df[[col for col in df.columns if df[col].dtypes != 'object']].drop('Agreement Signing Date', axis = 1)

In [130]:
df_clus

Unnamed: 0,Service Charge Rate,Original Principal Amount,Cancelled Amount,Undisbursed Amount,Disbursed Amount,Repaid to IDA,Due to IDA,Exchange Adjustment,Borrower's Obligation,Sold 3rd Party,Repaid 3rd Party,Due 3rd Party,Credits Held,Loan Duration in Years,Time to Repay in Years,Region Total,Debt Left in Years,Total Principal Amount,Cluster
0,0.750,19239973.134,115014.515,0.000,19124958.619,19124958.619,0.000,0,0.000,0.000,0.000,0.000,0.000,47.308,3.502,61335.587,829.515,750798904058.809,0
1,0.750,4569493.619,0.000,0.000,4569493.619,4569493.619,0.000,0,0.000,0.000,0.000,0.000,0.000,49.734,7.001,61335.587,829.515,750798904058.809,0
2,0.750,168349764.926,9766899.010,0.000,158582865.916,123017297.801,35565568.116,0,35565568.116,0.000,0.000,0.000,35565568.116,49.819,7.335,18121.996,3299.118,750798904058.809,0
3,0.750,19720972.463,13553.455,0.000,19707419.008,19707419.008,0.000,0,0.000,0.000,0.000,0.000,0.000,49.622,7.291,61335.587,2184.880,750798904058.809,0
4,0.750,60124916.045,3508040.511,0.000,56616875.534,3680072.951,52936802.582,0,52936802.582,0.000,0.000,0.000,52936802.582,49.942,7.751,61335.587,319.243,750798904058.809,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5635,0.750,183741743.434,15788825.560,0.000,179808410.315,98603630.980,76724274.223,0,76724274.223,0.000,0.000,0.000,76724274.223,34.878,12.956,18121.996,4572.001,750798904058.809,0
5636,0.750,721498992.540,104238140.349,0.000,636396373.075,329086013.657,313697668.306,0,313697668.306,0.000,0.000,0.000,313697668.306,34.963,13.041,18121.996,4572.001,750798904058.809,2
5637,0.750,656564083.212,128300360.081,0.000,538571797.395,160995745.849,388805285.845,0,388805285.845,0.000,0.000,0.000,388805285.845,39.538,17.873,18121.996,5157.521,750798904058.809,1
5638,0.750,206108212.202,9455388.453,0.000,201822042.531,98805483.622,104130992.698,0,104130992.698,0.000,0.000,0.000,104130992.698,34.897,13.917,18121.996,4572.001,750798904058.809,1


In [131]:
X = np.array(df_clus)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [132]:
pca = PCA()
pca.fit(X_scaled)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [133]:
len(pca.components_)

19

In [134]:
print('Explained Variance Ratio = ', sum(pca.explained_variance_ratio_[: 2]))

Explained Variance Ratio =  0.48341825184577025


In [135]:
fig = px.bar(y=pca.explained_variance_ratio_)

fig.show()

In [136]:
fig = px.bar(y=np.cumsum(pca.explained_variance_ratio_))

fig.show()

In [137]:
#So we need about 9 dimensions to explain ~90% of the total variance.

In [138]:
pca.n_components = 2
X_reduced = pca.fit_transform(X_scaled)
df_clus_X_reduced = pd.DataFrame(X_reduced, index=df_clus.index)

In [139]:
fig = px.scatter(x = df_clus_X_reduced[0],
                 y=df_clus_X_reduced[1])
fig.show()

In [140]:
# Let the number of clusters be a parameter, so we can get a feel for an appropriate
# value thereof.
def cluster(n_clusters):
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(X_reduced)
    Z = kmeans.predict(X_reduced)
    return kmeans, Z

In [141]:
max_clusters = 20

In [142]:
inertias = np.zeros(max_clusters)

for i in range(1, max_clusters):
    kmeans, Z = cluster(i)
    inertias[i] = kmeans.inertia_

In [143]:
fig = px.scatter(x=range(1, max_clusters),
            y=inertias[1:], title='Number of clusters')
fig.show()

#### 5 seems to be the optimal number for clusters here

In [144]:
n_clusters = 5
model, Z = cluster(n_clusters)

In [145]:
fig = px.scatter(x=df_clus_X_reduced[0],
                     y=df_clus_X_reduced[1],
                                       color=Z,
)

fig.show()