# This is the workstation for develop best cluster method and number of clusters to use. 

In [1]:
# Import libs
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MeanShift
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.cluster import Birch

from sklearn import metrics
from scipy.stats import gaussian_kde
from scipy import stats
from scipy.spatial.distance import cdist, pdist, euclidean
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import collections
%matplotlib inline

In [12]:
test = pd.read_csv('./data/test.csv')
test.shape

(1459, 80)

In [5]:
train = pd.read_csv('./data/train_120feats_Dense_OutlierFree_LogTransform.csv')
test = pd.read_csv('./data/test_119feats_Dense_OutlierFree_LogTransform.csv')

In [27]:
def getPCAcomponent(df, type = "Train", threshold = 0.9):
    """
    Returns returns the PCA fit matrix of the input dataframe, 
    using minimum PCAs that can meet the threshold requirement
    """

    # Check if df is a valid dataframe
    if not isinstance(df, pd.DataFrame):
        raise TypeError('The input is not a valid dataFrame.')

    # Check if the threshold is within 1

    # Make adjustment if necessary, also check input type is valid
    if type == "Train":
        # Remove SalePrice before PCA.
        try:
            df.drop(['SalePrice'], axis = 1, inplace = True)
        except:
            print("There was no 'SalePrice' to drop in the 'train', continue....")
    elif type == "Test":
        pass
    else:
        raise ValueError("Type has to be 'Train' or 'Test'.")

    # Scale the data
    dfToScaleMatrix = preprocessing.scale(df.as_matrix())

    # Perform PCA
    pca = PCA().fit(dfToScaleMatrix)

    # Get incremental and cumulative viariance explained by PCA.
    increVarExpl = pca.explained_variance_ratio_
    totlSumVar = np.array([sum(increVarExpl[0:i+1]) for i,x in enumerate(increVarExpl)]) 

    # Get the num of PCA needed.
    for i, val in enumerate(totlSumVar):
        if val >= threshold:
            numPCA = i
            VarExp = val
            break
    # Present result
    print("We can use {0} PCAs to explain {1:.4f} variance.".format(numPCA, VarExp))
    # Fit transform the df
    result = PCA(n_components=numPCA).fit_transform(dfToScaleMatrix) 
    return result

In [28]:
pcaFitMatrix = getPCAcomponent(train)

We can use 54 PCAs to explain 0.9045 variance.


In [None]:
def getBestClusterNum(pcaFitMatrix, maxCluster=30, method = "KMeans"):
    """
    Returns the best number of clusters to use based on different distancce based scores
    """

    # Num of clusters to consider
    clustersNumList = range(2,maxCluster)

    # Score distance metric
    distanceList = ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']

    # Cluster method
    clusterMethods = {'KMeans': KMeans(),
                      'Affinity propagation': AffinityPropagation(), 
                      'Mean-shift': MeanShift(), 
                      'Spectral clustering': SpectralClustering(),  
                      'Agglomerative clustering': AgglomerativeClustering(), 
                      'DBSCAN': DBSCAN(), 
                      'Gaussian mixtures': GaussianMixture(), 
                      'Birch': Birch()}

    # Create cluster model object
    if method not in clusterMethods:
        raise TypeError("{0} is not a valid clustering method.".format(method))
    else:
        clusterModel = clusterMethods[method]

    # Use the cluster model object to perform clustering
    if method == "KMeans":
        
        fitResult = clusterModel(n_clusters=i).fit(pcaFitMatrix)
    
    
    fit_results_list = []
    for i in clusters:
        fit_result = KMeans(n_clusters=i, init='k-means++', n_init=50, max_iter=2000).fit(full_dataset_after_PCA)
        fit_results_list.append(fit_result)

    # Calculate silhouette score and 
    sil_score_list = []
    calinski_harabaz_score_list = []
    for item in fit_results_list:
        label = item.labels_
        sil_score = metrics.silhouette_score(full_dataset_after_PCA,label, metric='euclidean')
        calinski_harabaz_score = metrics.calinski_harabaz_score(full_dataset_after_PCA,label)  
        sil_score_list.append(sil_score)
        calinski_harabaz_score_list.append(calinski_harabaz_score)

    plt.figure(1)
    plt.plot(clusters, sil_score_list,'o-')
    #plt.axis([1, 28, 0.1, 0.4])
    plt.xlabel('Number of Clusters');
    plt.ylabel('Silhouette Score');

    plt.figure(2)
    plt.plot(clusters, calinski_harabaz_score_list,'o-')
    #plt.axis([1, 28, 20, 50])
    plt.xlabel('Number of Clusters');
    plt.ylabel('Calinski-Harabaz Index');