In [4]:
import pandas as pd, numpy as np, plotly, plotly.express as px, warnings, gc, plotly.offline as plotly_offline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA

# Other settings
# Make sure automatic garbage collection is switched on for this kernel
gc.enable()
# NOTE(review): no category is given, so this hides *every* warning for the
# rest of the session (including deprecation and convergence warnings) —
# consider narrowing it once the notebook is stable
warnings.filterwarnings("ignore")
# Put plotly into notebook mode so figures can be rendered inside the notebook
plotly_offline.init_notebook_mode(connected = True)

In [10]:
# Create a function to automate this
def optimize_k_means(data, verbose, k_range=range(2, 8), random_state=None):
    """Fit K-Means for several cluster counts and pick the best by silhouette.

    Parameters
    ----------
    data : array-like or DataFrame
        Feature matrix handed unchanged to ``KMeans.fit_predict`` and
        ``silhouette_score``.
    verbose : bool
        When truthy, print the average silhouette score of every candidate.
    k_range : iterable of int, optional
        Cluster counts to try. The default, ``range(2, 8)``, tries 2 through 7
        (the upper bound is exclusive).
    random_state : int or None, optional
        Seed passed to every ``KMeans`` instance. ``None`` (the default) keeps
        the historical behaviour of drawing a fresh random seed per candidate,
        which makes results vary between runs; pass an int for reproducibility.

    Returns
    -------
    tuple
        ``(best_k, best_sil_coe, df)`` where ``df`` has one row per candidate
        with the columns ``'K Value'`` and ``'Sil. Coe'``.

    Raises
    ------
    ValueError
        If ``k_range`` contains no candidates.
    """
    candidates = list(k_range)
    if not candidates:
        raise ValueError("k_range must contain at least one cluster count")

    # Set variables for the best k, and the best silhouette coefficient.
    # The score starts at -inf rather than 0: silhouette averages can be
    # negative, and a 0 baseline would leave best_k at 0 whenever no
    # candidate manages a positive score.
    best_k = 0
    best_sil_coe = float("-inf")

    # Lists of K values and their scores
    k = []
    score = []

    # Loop through every candidate number of clusters
    for i in candidates:
        # Either the caller's fixed seed or, as before, a random one
        seed = np.random.randint(1, 100) if random_state is None else random_state
        # Create a K Means algorithm
        clusterer = KMeans(n_clusters=i, random_state=seed)
        # Fit the algorithm to the data
        cluster_labels = clusterer.fit_predict(data)
        # Find the average silhouette coefficient
        silhouette_avg = silhouette_score(data, cluster_labels)

        # If verbose is enabled, print out the average
        if verbose:
            print("For n_clusters =", i,
                "The average silhouette_score is :", silhouette_avg)

        # If the current is better than the best (ties keep the smaller k)
        if silhouette_avg > best_sil_coe:
            best_k = i
            best_sil_coe = silhouette_avg

        # Append the k and its score
        k.append(i)
        score.append(silhouette_avg)

    # Create a dataframe of K values and scores
    df = pd.DataFrame(list(zip(k, score)), columns=['K Value', 'Sil. Coe'])

    # Return the best k and silhouette coefficient, and the overall dataframe
    return best_k, best_sil_coe, df

# Function to remove outliers from plotting data
def remove_outliers(df, col, min_quant, max_quant):
    """Return the rows of ``df`` whose ``col`` value lies inside the fences.

    The fences are ``Qx - 1.5 * IQR`` and ``Qy + 1.5 * IQR`` (both inclusive),
    where ``Qx`` and ``Qy`` are the ``min_quant`` and ``max_quant`` quantiles
    of ``df[col]`` and ``IQR = Qy - Qx``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    col : hashable
        Label of the numeric column to test. Any valid column label works,
        including names with spaces or punctuation.
    min_quant, max_quant : float
        Lower and upper quantiles, passed to ``Series.quantile``.

    Returns
    -------
    pandas.DataFrame
        The rows within the fences, with their original index labels.
    """
    values = df[col]
    # Lower quantile
    Qx = values.quantile(min_quant)
    # Upper quantile
    Qy = values.quantile(max_quant)
    # Interquantile range
    IQR = Qy - Qx

    lower_fence = Qx - 1.5 * IQR
    upper_fence = Qy + 1.5 * IQR

    # A boolean mask instead of a string-built query expression: the column
    # label is never parsed as code, so labels that are not valid identifiers
    # no longer break the filter.
    return df[(values >= lower_fence) & (values <= upper_fence)]

def plot_kmeans(components, dimensions, k, data, classes, random_state=None):
    """Project ``data`` with PCA, cluster the projection and plot it.

    The first ``dimensions`` principal components are clustered with K-Means,
    the cluster sizes are printed, and a plotly scatter is shown in which
    colour encodes ``classes`` and marker symbol encodes the cluster.

    Parameters
    ----------
    components : passed to ``PCA(n_components=...)``
        Must yield at least ``dimensions`` components.
    dimensions : int
        2 for a flat scatter, 3 for a 3D scatter.
    k : int
        Number of K-Means clusters.
    data : array-like or DataFrame
        Raw feature matrix given to ``PCA.fit_transform``.
    classes : sequence or pandas.Series
        One class label per row of ``data``, in row order.
    random_state : int or None, optional
        Seed for K-Means. ``None`` (the default) keeps the historical
        behaviour of drawing a random seed on every call.

    Raises
    ------
    AssertionError
        If ``dimensions`` is neither 2 nor 3.
    IndexError
        If the PCA projection has fewer than ``dimensions`` columns.
    """
    # Assure we are plotting in 2D or 3D only
    assert dimensions in [2,3]

    # Create a PCA transformer with the # of specified components
    pca = PCA(n_components=components)

    # Fit that against the raw data
    X = pca.fit_transform(data)

    # The two layouts differ only in axis list, title and plotting function
    if dimensions == 3:
        axes = ['PCA1', 'PCA2', 'PCA3']
        title = "3D K Means Sentiment Class Plot"
    else:
        axes = ['PCA1', 'PCA2']
        title = "2D K Means Sentiment Class Plot"

    # Same exception type the column lookup used to raise, but with a message
    # that says what is wrong.
    if X.shape[1] < len(axes):
        raise IndexError(
            "PCA produced {} component(s) but a {}-axis plot was requested".format(
                X.shape[1], len(axes)))

    # Create a dataframe containing the PCAs
    dataset = pd.DataFrame({axis: X[:, pos] for pos, axis in enumerate(axes)})

    # Create KMeans Algo
    seed = np.random.randint(1,100) if random_state is None else random_state
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=500, random_state=seed)
    # Determine which cluster each point belongs to
    dataset['Cluster'] = kmeans.fit_predict(dataset)
    # Bring distribution of clusters
    print(dataset['Cluster'].value_counts())

    # Append the classes by position: assigning a Series directly would align
    # on index labels and silently produce NaN / shuffled classes whenever its
    # index is not 0..n-1 (e.g. after filtering rows).
    dataset['class'] = classes.to_numpy() if isinstance(classes, pd.Series) else classes

    # Create the figure
    if dimensions == 3:
        fig = px.scatter_3d(dataset, x='PCA1', y='PCA2', z='PCA3', color='class',
                        symbol='Cluster', title=title)
    else:
        fig = px.scatter(dataset, x='PCA1', y='PCA2', color='class',
                        symbol='Cluster', title=title)
    # Show the figure
    fig.show()
        
def plot_data(components, dimensions, data, classes):
    """Project ``data`` with PCA and show a scatter coloured by ``classes``.

    Parameters
    ----------
    components : passed to ``PCA(n_components=...)``
        Must yield at least ``dimensions`` components.
    dimensions : int
        2 for a flat scatter, 3 for a 3D scatter.
    data : array-like or DataFrame
        Raw feature matrix given to ``PCA.fit_transform``.
    classes : sequence or pandas.Series
        One class label per row of ``data``, in row order.

    Raises
    ------
    AssertionError
        If ``dimensions`` is neither 2 nor 3.
    IndexError
        If the PCA projection has fewer than ``dimensions`` columns.
    """
    # Assure we are plotting in 2D or 3D only
    assert dimensions in [2,3]

    # Create a PCA transformer with the # of specified components
    pca = PCA(n_components=components)

    # Fit that against the raw data
    X = pca.fit_transform(data)

    # The two layouts differ only in axis list, title and plotting function
    if dimensions == 3:
        axes = ['PCA1', 'PCA2', 'PCA3']
        title = "3D Sentiment Class Plot"
    else:
        axes = ['PCA1', 'PCA2']
        title = "2D Sentiment Class Plot"

    # Same exception type the column lookup used to raise, but with a message
    # that says what is wrong.
    if X.shape[1] < len(axes):
        raise IndexError(
            "PCA produced {} component(s) but a {}-axis plot was requested".format(
                X.shape[1], len(axes)))

    # Create a dataframe containing the PCAs
    dataset = pd.DataFrame({axis: X[:, pos] for pos, axis in enumerate(axes)})

    # Append the classes by position: assigning a Series directly would align
    # on index labels and silently produce NaN / shuffled classes whenever its
    # index is not 0..n-1 (e.g. after filtering rows).
    dataset['class'] = classes.to_numpy() if isinstance(classes, pd.Series) else classes

    # Create the figure
    if dimensions == 3:
        fig = px.scatter_3d(dataset, x='PCA1', y='PCA2', z='PCA3', color='class',
                        title=title)
    else:
        fig = px.scatter(dataset, x='PCA1', y='PCA2', color='class',
                        title=title)
    # Show the figure
    fig.show()

## Non-Scaled (i.e. native values, not positive*3)
### Raw Plots

In [9]:
# Load the three feature tables written out by Iteration 1
boolean = pd.read_csv('Trimmed_Boolean.csv')
tfidf = pd.read_csv('Trimmed_TFIDF.csv')
freq = pd.read_csv('Trimmed_Count.csv')

# Parallel lists: a display name for each table, and the tables themselves
names = ['Boolean', 'TFIDF', 'Frequency']
dfs = [boolean, tfidf, freq]

# Keep the labels (read from the boolean table) before the column is dropped
classes = boolean['class']

# Strip, in place, the stray index column left over from the CSV export
# together with the label column
for df in dfs:
    df.drop(columns=['Unnamed: 0', 'class'], inplace=True)

# Ask the garbage collector to reclaim memory now
gc.collect()

0

In [None]:
# Run the K search on each dataset in turn and report what it found
for i, frame in enumerate(dfs):
    label = names[i]
    # Score every candidate K (verbose, so the per-K averages are printed too)
    best_k, best_sil_coe, df = optimize_k_means(frame, True)
    # Summary line, dataset name, divider, then the table of K values and scores
    print("For {} data: Best K Value: {} | Best Sil Coe: {}".format(label, best_k, best_sil_coe))
    print("{}".format(label))
    print("*---------------------------------------------------------------------------*")
    print(df)

For n_clusters = 2 The average silhouette_score is : 0.1432684590936883
For n_clusters = 3 The average silhouette_score is : 0.002916816827740556
For n_clusters = 4 The average silhouette_score is : -0.004558380974361654
For n_clusters = 5 The average silhouette_score is : -0.02038760167408684
For n_clusters = 6 The average silhouette_score is : -0.029652190928334567
