In [None]:

import os

In [None]:

# NOTE(review): `df` is only assigned further down in this file (via
# pd.read_csv); this cell raises NameError unless that cell has run first.
df.head()

In [None]:
# Cap OpenMP at a single thread; set ahead of the scikit-learn imports below.
# NOTE(review): presumably this silences the KMeans/MKL memory-leak warning
# seen on Windows — confirm before removing.
os.environ["OMP_NUM_THREADS"] = '1'
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA 
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from kneed import KneeLocator
import pandas as pd
import numpy as np
import os 
# Clustering helpers: elbow method, K-Means, Gaussian mixture, and DBSCAN


def find_optimal_number_of_clusters(X, max_clusters):
    """Estimate a cluster count with the elbow method and save the elbow plot.

    Fits a KMeans model for every k from 1 to ``max_clusters`` (inclusive),
    records each model's inertia, and asks ``KneeLocator`` for the knee of the
    resulting convex, decreasing curve.  The curve is saved as
    ``static/uploads/elbow_method.png`` (the directory is created if needed).

    Args:
        X: Feature matrix accepted by ``KMeans.fit``.
        max_clusters: Largest number of clusters to try.

    Returns:
        ``kn.knee`` as reported by ``KneeLocator``; this is ``None`` when no
        knee could be located, so callers should be prepared for that.
    """
    storage_path = os.path.join('static', 'uploads')
    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(storage_path, exist_ok=True)

    cluster_counts = range(1, max_clusters + 1)
    distortions = []
    for i in cluster_counts:
        kmeans = KMeans(n_clusters=i)
        kmeans.fit(X)
        distortions.append(kmeans.inertia_)
    kn = KneeLocator(cluster_counts, distortions, curve='convex', direction='decreasing')

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.plot(cluster_counts, distortions, marker='o')
    ax.set_title('Elbow Method')
    ax.set_xlabel('Number of clusters')
    ax.set_ylabel('Distortion')
    # Only mark the knee when one was found; passing None to vlines would fail.
    if kn.knee is not None:
        y_min, y_max = ax.get_ylim()
        ax.vlines(kn.knee, y_min, y_max, linestyles='dashed')
    fig.savefig(os.path.join(storage_path, 'elbow_method.png'), dpi=150, bbox_inches='tight')
    return kn.knee


def preprocess_data(X):
    """Impute, standardise and project ``X`` onto two principal components.

    The three steps run in order: missing values are replaced with the
    column mean, features are standardised with ``StandardScaler``, and the
    result is reduced with a 2-component PCA.

    Args:
        X: Numeric feature matrix; may contain missing values.

    Returns:
        The output of ``PCA(n_components=2).fit_transform`` on the imputed,
        scaled data.
    """
    filled = SimpleImputer(strategy='mean').fit_transform(X)
    standardised = StandardScaler().fit_transform(filled)
    return PCA(n_components=2).fit_transform(standardised)

def plot_clusters(X, labels, centers, title):
    """Scatter-plot 2-D points coloured by cluster label and save the figure.

    The image is written to ``static/uploads/<title>.png``; the directory is
    created if it does not exist yet.

    Args:
        X: Array whose first two columns are used as x and y coordinates.
        labels: Per-point values passed to ``scatter`` as the colour argument.
        centers: Array whose first two columns give the cluster centres, drawn
            as large semi-transparent black markers.
        title: Plot title; also used as the output file name (without ``.png``).
    """
    storage_path = os.path.join('static', 'uploads')
    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(storage_path, exist_ok=True)

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')
    ax.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
    ax.set_title(title)
    # Hide every spine so only the dashed horizontal grid frames the data.
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.grid(axis='y', linestyle='--', alpha=0.5)
    fig.savefig(os.path.join(storage_path, title + '.png'), dpi=150, bbox_inches='tight')

def kmeans_clustering(X, n_clusters=2):
    """Cluster the preprocessed data with K-Means and save a scatter plot.

    ``X`` is passed through ``preprocess_data`` first, so clustering happens
    on the 2-component PCA projection.  The plot is produced by
    ``plot_clusters`` under the title ``'KMeans Clustering'``.

    Args:
        X: Numeric feature matrix.
        n_clusters: Number of clusters to fit (defaults to 2).

    Returns:
        A ``(labels, centers)`` tuple taken from the fitted model's
        ``labels_`` and ``cluster_centers_`` attributes.
    """
    X_pca = preprocess_data(X)
    print(X_pca.shape)
    # Honour the caller's cluster count instead of a hard-coded value.
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(X_pca)
    centers = kmeans.cluster_centers_
    labels = kmeans.labels_
    plot_clusters(X_pca, labels, centers, 'KMeans Clustering')
    return labels, centers


def fuzzy_cmeans_clustering(X, n_clusters):
    """Cluster the preprocessed data with a Gaussian mixture and plot it.

    Despite the name, the model fitted here is ``GaussianMixture``; each
    point receives the hard label returned by ``predict`` and the component
    means serve as cluster centres.  The plot is saved by ``plot_clusters``
    under the title ``'Fuzzy CMeans Clustering'``.

    Args:
        X: Numeric feature matrix; it is run through ``preprocess_data``.
        n_clusters: Number of mixture components.

    Returns:
        A ``(labels, centers)`` tuple: predicted component per point and the
        fitted ``means_``.
    """
    projected = preprocess_data(X)
    mixture = GaussianMixture(n_components=n_clusters)
    mixture.fit(projected)
    assignments = mixture.predict(projected)
    component_means = mixture.means_
    plot_clusters(projected, assignments, component_means, 'Fuzzy CMeans Clustering')
    return assignments, component_means

def dbscan_clustering(X, min_samples=10, eps=0.3):
    """Cluster the preprocessed data with DBSCAN and save a scatter plot.

    ``X`` is passed through ``preprocess_data`` first.  Core samples are drawn
    with large markers, non-core samples with small ones, and noise (label
    ``-1``) in black.  The image is written to
    ``static/uploads/DBSCAN Clustering.png``; the directory is created if it
    does not exist yet.

    Args:
        X: Numeric feature matrix.
        min_samples: ``min_samples`` argument forwarded to ``DBSCAN``.
        eps: ``eps`` argument forwarded to ``DBSCAN``.

    Returns:
        A ``(labels, n_clusters_, n_noise_)`` tuple: the per-point labels, the
        number of distinct labels excluding noise, and the number of points
        labelled ``-1``.
    """
    storage_path = os.path.join('static', 'uploads')
    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(storage_path, exist_ok=True)

    X_pca = preprocess_data(X)
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X_pca)
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    unique_labels = set(labels)
    core_samples_mask = np.zeros_like(labels, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True

    colors = [plt.cm.Spectral(each)
              for each in np.linspace(0, 1, len(unique_labels))]
    fig, ax = plt.subplots(figsize=(12, 8))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Noise points are always drawn in opaque black.
            col = [0, 0, 0, 1]
        class_member_mask = (labels == k)
        # Core samples first (large markers), then the remaining members.
        for subset_mask, marker_size in ((core_samples_mask, 10),
                                         (~core_samples_mask, 6)):
            xy = X_pca[class_member_mask & subset_mask]
            ax.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                    markeredgecolor='k', markersize=marker_size)

    ax.set_title('DBSCAN Clustering')
    fig.savefig(os.path.join(storage_path, 'DBSCAN Clustering.png'), dpi=150, bbox_inches='tight')
    return labels, n_clusters_, n_noise_

# Load the dataset.  The path is assembled with os.path.join so it resolves on
# every OS; a literal backslash is only a path separator on Windows.
df = pd.read_csv(os.path.join('..', 'static', 'uploads', 'imdb_clean.csv'))
# Cluster on the numeric columns only.
columns = df.select_dtypes(include='number').columns
X = df[columns].values
# Pick a cluster count via the elbow method (tries k = 1..10), then run each
# clustering routine; every routine saves its own plot.
n = find_optimal_number_of_clusters(X, 10)
kmeans_clustering(X, n)
fuzzy_cmeans_clustering(X, n)
dbscan_clustering(X, 10, 0.3)