# Clustering

In [6]:


# NumPy for numerical computing
import numpy as np

# Pandas for DataFrames
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Matplotlib for visualization
from matplotlib import pyplot as plt
# display plots in the notebook
%matplotlib inline 
from __future__ import print_function
# Seaborn for easier visualization
import seaborn as sns

# Scikit-Learn for Modeling
import sklearn

# Scikit-Learn's make_pipeline function
from sklearn.pipeline import make_pipeline 

# Scikit-Learn's StandardScaler
from sklearn.preprocessing import StandardScaler

# Scikit-Learn's KMeans algorithm (added later)

from sklearn.cluster import KMeans

In [7]:
# Load cleaned dataset from training Module 1
df = pd.read_csv('train.csv')
print(df.shape)
df.head()
#df.drop(['Duration'], axis=1, inplace=True)

(1018, 13)


Unnamed: 0,AcctType,ExecFlag,CreditCardFlag,AutoBillFlag,15_Shops,15_Spend,16_Shops,16_Spend,Duration,Age,GMiss,Gender_1.0,Gender_2.0
0,0,0,0,0,2.0,369.58,1.0,63.45,168.02,63.588,1,0,0
1,0,0,0,0,9.0,2205.18,8.0,2240.88,60.026,45.006,1,0,0
2,0,0,0,0,23.0,2334.16,5.0,403.52,23.984,35.168,1,0,0
3,1,1,0,0,11.0,1808.86,2.0,341.02,99.977,60.007,0,0,1
4,1,0,0,0,5.0,1145.72,4.0,376.13,188.029,49.501,1,0,0


In [8]:
#import package for Kmeans

from sklearn.cluster import KMeans
from numpy import array

def hartigan_K(list_of_tuples, threshold = 12):
    # 'list_of_tuples' is a list containing the points you want to cluster
    # 'threshold' optimizes goodness of fit values
    #     Hartigan recommends setting threshold to 10, but Chiang & Mirkin confirm up to 12
    # returns integer, "correct" number of clusters
    
    inertia_list = np.zeros(len(list_of_tuples)+1) # initializes for maximum possible clusters
    num = 0                                     # counter
    H_Rule = threshold+1                        # simply initializes above threshold to meet 'while' condition
    
    # NOTE: 'inertia' is equivalent to the sum of within-cluster distances to centroids
    
    while num < len(list_of_tuples) and H_Rule > threshold:
        kmn = KMeans(n_clusters = num+1)
        kmn.fit(list_of_tuples)
        inertia_list[num+1]+=kmn.inertia_
        if num > 0:
            H_Rule = ((float(inertia_list[num])/inertia_list[num+1])-1)*(len(list_of_tuples)-(num)-1)
        num+=1
    
    if H_Rule > threshold:
      num+=1
    # NOTE: if while-loop reaches the number of K-Means clusters equal to the length of list_of_tuples
    # without hitting the threshold, then function returns trivial solution that there are N clusters
    # (where N is the number of points under observation)
    
    return num-1

In [9]:
# create tuple from original dataframe
tuple_list = [tuple(x) for x in df.to_records(index=False)]

In [10]:
# get number of cluster
num_of_clusters = print(hartigan_K(tuple_list))

31


In [11]:
#drop any NaN value
df = df.dropna(how='any')
print(df.shape)
k_means= KMeans(n_clusters= 35, random_state=123)
kmeans = k_means.fit(df)
# Centroid values
centroids = k_means.cluster_centers_

(1018, 13)


In [12]:
# create a dataframe to store cluster value
df1 = pd.DataFrame()
df1['cluster']=k_means.predict(df)
df1.head()

Unnamed: 0,cluster
0,0
1,2
2,11
3,29
4,19


In [13]:
df.head()

Unnamed: 0,AcctType,ExecFlag,CreditCardFlag,AutoBillFlag,15_Shops,15_Spend,16_Shops,16_Spend,Duration,Age,GMiss,Gender_1.0,Gender_2.0
0,0,0,0,0,2.0,369.58,1.0,63.45,168.02,63.588,1,0,0
1,0,0,0,0,9.0,2205.18,8.0,2240.88,60.026,45.006,1,0,0
2,0,0,0,0,23.0,2334.16,5.0,403.52,23.984,35.168,1,0,0
3,1,1,0,0,11.0,1808.86,2.0,341.02,99.977,60.007,0,0,1
4,1,0,0,0,5.0,1145.72,4.0,376.13,188.029,49.501,1,0,0


In [14]:
#get K-nn package for 

from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(centroids)
distances, indices = nbrs.kneighbors(df.values)
df_radius = pd.DataFrame()
#print((indices))

# get cluster and distance value
df_radius['cluster'] = [x[0]for x in indices.tolist()]
df_radius['distance'] = [x[0]for x in distances.tolist()]

print(df_radius.head())
df_radius.describe()

   cluster  distance
0        0   136.353
1        2   486.356
2       11   259.737
3       29   140.670
4       19   200.508


Unnamed: 0,cluster,distance
count,1018.0,1018.0
mean,14.723,272.049
std,10.454,156.976
min,0.0,0.0
25%,7.0,172.418
50%,11.0,238.368
75%,23.0,328.354
max,34.0,1245.45


In [15]:
# df_

df_radius_final = pd.DataFrame()
df_radius_temp = pd.DataFrame()
df_radius_temp = df_radius.groupby(['cluster'])['distance'].max()
#df_radius_final.columns = ['cluster', 'radius']
df_radius_final['cluster'] = range(35)
df_radius_final['radius'] = [group for group in df_radius_temp.values]
df_radius_temp = df_radius.groupby(['cluster'])['distance'].count()
df_radius_final['count'] = [group for group in df_radius_temp.values]
#print(df_radius_final)

In [16]:
## TESTING
dTest = pd.read_csv('test.csv')
print(dTest.shape)
dTest.head()

(7046, 13)


Unnamed: 0,AcctType,ExecFlag,CreditCardFlag,AutoBillFlag,16_Shops,16_Spend,17_Shops,17_Spend,Duration,Age,GMiss,Gender_1.0,Gender_2.0
0,0,0,0,0,1.0,161.62,1.0,29.63,8.082,38.588,1,0,0
1,0,0,0,0,22.0,2972.14,28.0,3321.32,11.083,46.169,1,0,0
2,1,1,0,0,4.0,1029.76,5.0,658.99,31.587,54.755,0,0,1
3,1,1,0,0,6.0,938.82,5.0,876.55,14.672,39.253,1,0,0
4,0,0,0,1,3.0,1068.61,5.0,1660.71,26.005,58.588,1,0,0


In [17]:
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(centroids)
distances, indices = nbrs.kneighbors(dTest.values)

df_predict = pd.DataFrame()
#print((indices))

df_predict['cluster'] = [x[0]for x in indices.tolist()]
df_predict['distance'] = [x[0]for x in distances.tolist()]

print(df_predict.head())


   cluster  distance
0        0   182.615
1       32   574.248
2       34   262.371
3       34   159.088
4        8   437.493


In [18]:
df_result = pd.DataFrame()
df_result = pd.merge(df_radius_final, df_predict, on='cluster', how='inner')

# Mixing confidacne and association for novel class detection
df_result['score'] =  ((df_result['distance'] - df_result['radius']) * df_result['count'])/7046
df_result.head()

Unnamed: 0,cluster,radius,count,distance,score
0,0,369.267,143,182.615,-3.788
1,0,369.267,143,124.757,-4.962
2,0,369.267,143,188.05,-3.678
3,0,369.267,143,286.69,-1.676
4,0,369.267,143,275.684,-1.899


In [19]:
df_result.describe()

Unnamed: 0,cluster,radius,count,distance,score
count,7046.0,7046.0,7046.0,7046.0,7046.0
mean,16.469,552.496,35.007,978.818,-0.943
std,10.659,289.964,44.536,2345.886,1.82
min,0.0,0.0,1.0,67.5,-9.827
25%,8.0,456.847,5.0,269.649,-1.565
50%,13.0,617.085,16.0,490.497,-0.286
75%,27.0,700.273,52.0,907.022,0.174
max,34.0,1245.45,159.0,73329.781,10.407


In [20]:
import pandas as pd
from sklearn import preprocessing

# Create x, where x the 'scores' column's values as floats
x = df_result[['score']].values.astype(float)

# Create a minimum and maximum processor object
min_max_scaler = preprocessing.StandardScaler()

# Create an object to transform the data to fit minmax processor
x_scaled = min_max_scaler.fit_transform(x)

# Run the normalizer on the dataframe
df_normalized = pd.DataFrame(x_scaled)

In [21]:
df_normalized.head()

Unnamed: 0,0
0,-1.563
1,-2.208
2,-1.503
3,-0.403
4,-0.525


In [22]:
df_normalized.describe()

Unnamed: 0,0
count,7046.0
mean,0.0
std,1.0
min,-4.881
25%,-0.342
50%,0.361
75%,0.613
max,6.236


In [23]:
dTestResult = pd.read_csv('test.csv')
dTestResult['Score'] = df_normalized[0]


In [24]:
dTestResult['Renewal'] = dTestResult.Score > 0
dTestResult.head()

Unnamed: 0,AcctType,ExecFlag,CreditCardFlag,AutoBillFlag,16_Shops,16_Spend,17_Shops,17_Spend,Duration,Age,GMiss,Gender_1.0,Gender_2.0,Score,Renewal
0,0,0,0,0,1.0,161.62,1.0,29.63,8.082,38.588,1,0,0,-1.563,False
1,0,0,0,0,22.0,2972.14,28.0,3321.32,11.083,46.169,1,0,0,-2.208,False
2,1,1,0,0,4.0,1029.76,5.0,658.99,31.587,54.755,0,0,1,-1.503,False
3,1,1,0,0,6.0,938.82,5.0,876.55,14.672,39.253,1,0,0,-0.403,False
4,0,0,0,1,3.0,1068.61,5.0,1660.71,26.005,58.588,1,0,0,-0.525,False


In [25]:
## SAVE RESULT
dTestResult.to_csv('result.csv', index=None)
