In [1]:
# !pip install numpy pandas pygments matplotlib matplotlib-inline seaborn torch gliner supabase scikit-learn scipy yellowbrick hdbscan optuna

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
import os

import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import optuna

import torch
from gliner import GLiNER
from supabase import create_client, Client

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import KMeans, BisectingKMeans, DBSCAN, OPTICS, MeanShift, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA

from scipy.cluster.hierarchy import linkage, dendrogram
from yellowbrick.cluster import KElbowVisualizer
from hdbscan import HDBSCAN
import hdbscan.prediction

import warnings
warnings.filterwarnings('ignore')

# NER Model

In [3]:
# model = GLiNER.from_pretrained("gliner-community/gliner_medium-v2.5")

# model.save_pretrained("gliner_Med")
# loaded_model = GLiNER.from_pretrained("gliner_Med", load_tokenizer = True, local_files_only=True)

In [4]:
# text = """
# Libretto by Marius Petipa, based on the 1822 novella ``Trilby, ou Le Lutin d'Argail`` by Charles Nodier, first presented by the Ballet of the Moscow Imperial Bolshoi Theatre on January 25/February 6 (Julian/Gregorian calendar dates), 1870, in Moscow with Polina Karpakova as Trilby and Ludiia Geiten as Miranda and restaged by Petipa for the Imperial Ballet at the Imperial Bolshoi Kamenny Theatre on January 17–29, 1871 in St. Petersburg with Adèle Grantzow as Trilby and Lev Ivanov as Count Leopold.
# """

# labels = ["person", "book", "location", "date", "actor", "character"]

# entities = loaded_model.predict_entities(text, labels, threshold=0.4)

# for entity in entities:
#     print(entity["text"], "=>", entity["label"])

# Embedding

In [5]:
# from pinecone import Pinecone
# from langchain_pinecone import PineconeEmbeddings
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.document_loaders import PyPDFLoader
# from sklearn.metrics.pairwise import cosine_similarity

# Clustering

## Read Database

In [6]:
# Supabase connection settings come from the environment so the API key is
# never stored in the notebook.
#   SUPABASE_URL - project URL (defaults to the project this notebook targets)
#   SUPABASE_KEY - API key for that project (required)
# NOTE(review): a key was previously embedded in this cell; it should be
# rotated in the Supabase dashboard.
url: str = os.environ.get("SUPABASE_URL", "https://alwocqtpmrlfebnjjtct.supabase.co")
key: str = os.environ.get("SUPABASE_KEY", "")
if not key:
    raise RuntimeError(
        "SUPABASE_KEY is not set; export it before running this notebook."
    )
supabase: Client = create_client(url, key)

# Fetch every column of every row; `response.data` feeds the DataFrame below.
table_name = "documents"
response = supabase.table(table_name).select("*").execute()

## Dataset

In [None]:
# Materialise the rows returned by the Supabase query as a DataFrame.
df = pd.DataFrame(response.data)
# Preview the first rows.
df.head()

## Exploratory Data Analysis

### Dataset Shape

In [None]:
# Report the (rows, columns) shape of the loaded dataset.
print("DF Shape:", df.shape)

### Dataset Information

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

### Descriptive Statistics

In [10]:
# Split column names by dtype: object columns go to `cat_cols`, numeric ones
# to `num_cols` (a 'target' column, if present, is left out of `num_cols`).
cat_cols = list(df.select_dtypes(include='object').columns)
num_cols = [
    name
    for name in df.select_dtypes(include='number').columns
    if name != 'target'
]

In [None]:
# Summary statistics of the numeric columns, one row per column.
print(df[num_cols].describe().transpose())

### Missing Value

In [None]:
# Null count per column before cleaning.
print("\nMissing values per column:")
print(df.isna().sum())

# Rows without an NRP value are discarded; the counts are printed again so the
# effect of the drop is visible.
df = df.dropna(subset=['NRP'])
print(df.isna().sum())

### Feature Distribution

In [None]:
# One KDE panel per numeric column, laid out on a two-column grid.
n_cols = len(num_cols)
n_rows = (n_cols + 1) // 2

fig, axes = plt.subplots(n_rows, 2, figsize=(12, 4 * n_rows))
axes = axes.flatten()

for ax, col in zip(axes, num_cols):
    sns.kdeplot(df[col], ax=ax, fill=True, color='orange')
    ax.set_title(f"Distribusi Numerik: {col}")
    ax.legend()

# With an odd number of columns the last grid slot has no plot; remove it.
for unused_ax in axes[n_cols:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

### Correlation Matrix

In [None]:
# Pairwise correlation of the numeric columns, annotated with the coefficients.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df[num_cols].corr(), annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()

## Data Preprocessing

### Type Conversion

In [15]:
# Parse both timestamp columns so they can be subtracted.
for date_col in ('deadline', 'uploadedDate'):
    df[date_col] = pd.to_datetime(df[date_col])

# `timing` is deadline minus upload time in hours (negative when the upload
# happened after the deadline); the int cast drops the fractional part.
hours_to_deadline = (df['deadline'] - df['uploadedDate']).dt.total_seconds() / 3600
df['timing'] = hours_to_deadline.astype(int)

### Plagiarism Value

In [16]:
# Three ascending cut-off values; presumably plagiarism percentages that
# separate the "no" / "maybe" / "yes" bands - TODO confirm with the author.
plagiarism_rule = [40, 50, 60]

# NOTE(review): `plagiarim` looks like a typo of "plagiarism"; the name is kept
# unchanged because other code may reference it.
no_plagiarism, maybe_plagiarism, plagiarim = plagiarism_rule

In [17]:
def _peak_plagiarism_percent(matches):
    """Return the highest ratio found in `matches` as a percentage (2 decimals).

    Parameters
    ----------
    matches : iterable of dict, number, or None
        Each dict's values are treated as similarity ratios; the largest one
        across all dicts is scaled by 100 and rounded to two decimals.

    Returns
    -------
    number
        0 when `matches` is empty/None/NaN or holds no ratios at all.
        A value that is already numeric is returned unchanged, which keeps
        this cell safe to re-run after the column has been converted.
    """
    if isinstance(matches, (int, float, np.number)):
        # Already converted by a previous run of this cell.
        return 0 if pd.isna(matches) else matches
    if not matches:
        return 0
    ratios = [value for item in matches for value in item.values()]
    # default=0 covers inputs such as [{}] that contain no ratio at all,
    # where a plain max() would raise ValueError.
    return round(max(ratios, default=0) * 100, 2)


df['plagiarism'] = df['plagiarism'].apply(_peak_plagiarism_percent)

In [None]:
# Display the frame after the timing and plagiarism columns have been rewritten.
df

### New Dataframe

In [None]:
# Keep only the four columns that are used as clustering features.
data = df.loc[:, ['sentences', 'page', 'timing', 'plagiarism']]
print(data.head(5))

In [None]:
# One KDE panel per selected feature, laid out on a two-column grid.
n_cols = len(data.columns)
n_rows = (n_cols + 1) // 2

fig, axes = plt.subplots(n_rows, 2, figsize=(12, 4 * n_rows))
axes = axes.flatten()

for ax, col in zip(axes, data.columns.tolist()):
    sns.kdeplot(data[col], ax=ax, fill=True)
    ax.set_title(f"Distribusi Numerik: {col}")
    ax.legend()

# Remove grid slots that did not receive a plot.
for unused_ax in axes[n_cols:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation of the selected features, annotated with the coefficients.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()

### Data Scaling

In [22]:
# Feature names, in the column order of `data`.
features = list(data.columns)

In [None]:
# Fit the min-max scaler on the selected features and keep it in `scaler`
# so the same transformation can be applied to new samples.
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(data), columns=features)

print(X_scaled.head())

### Data Weighting

In [None]:
# Per-feature multipliers, applied positionally to the columns in `features`.
# Earlier experiments are kept for reference; the trailing number is presumably
# the silhouette score reached with that weighting - TODO confirm.
# weights = np.array([0.2, 0.3, 2.0, 2.5])     # 0.68 (10 cluster)
# weights = np.array([0.15, 0.25, 2.0, 2.77])  # 0.71 (10 cluster) - 0.44 (3 cluster)
# weights = np.array([0.1, 0.15, 1.0, 5.0])    # 0.76 (3 cluster)
# weights = np.array([0.3, 0.3, 1.5, 5.0])     # 0.67 (3 cluster)
weights = np.array([0.5, 0.5, 1.5, 4.5])       # 0.62 (3 cluster)
# weights = np.array([0.3, 0.4, 2.0, 4.0])     # 0.52 (3 cluster)
# weights = np.array([0.5, 0.7, 2.5, 3.5])     # 0.43 (2 cluster)
# weights = np.array([1.0, 1.2, 3.0, 3.5])     # 0.39 (3 cluster)

# Broadcasting multiplies every row of the scaled matrix by the weight vector.
X_weight = pd.DataFrame(X_scaled.to_numpy() * weights, columns=features)

print(X_weight.head())

In [None]:
# One KDE panel per weighted feature, laid out on a two-column grid.
n_cols = len(features)
n_rows = (n_cols + 1) // 2

fig, axes = plt.subplots(n_rows, 2, figsize=(12, 4 * n_rows))
axes = axes.flatten()

for ax, col in zip(axes, features):
    sns.kdeplot(X_weight[col], ax=ax, fill=True)
    ax.set_title(f"Distribusi Numerik: {col}")
    ax.legend()

# Remove grid slots that did not receive a plot.
for unused_ax in axes[n_cols:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation of the weighted features, annotated with the coefficients.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(X_weight.corr(), annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()

## Clustering Model

### Optimal Cluster

In [None]:
def explore_optimal_clusters(X_weight, max_clusters=10):
    """Plot three diagnostics for choosing k and return the best-silhouette k.

    The figure has three panels: a yellowbrick elbow plot for KMeans, the
    silhouette score of KMeans for every candidate k, and a Ward-linkage
    dendrogram of the samples.

    Parameters
    ----------
    X_weight : array-like of shape (n_samples, n_features)
        Feature matrix to cluster.
    max_clusters : int, default 10
        Largest number of clusters to try; capped at ``n_samples - 1``.

    Returns
    -------
    int
        The candidate k with the highest silhouette score (the first one on
        ties), or 3 when no candidate could be evaluated.
    """
    # silhouette_score needs at most n_samples - 1 distinct labels, so the
    # requested maximum is capped once and reused by every panel.
    upper_k = min(max_clusters, X_weight.shape[0] - 1)
    candidate_ks = range(2, upper_k + 1)

    plt.figure(figsize=(20, 5))

    # K-means elbow method
    plt.subplot(1, 3, 1)
    visualizer = KElbowVisualizer(KMeans(random_state=42), k=(2, upper_k))
    visualizer.fit(X_weight)
    visualizer.finalize()

    # Silhouette analysis
    plt.subplot(1, 3, 2)
    silhouette_scores = []
    for k in candidate_ks:
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X_weight)
        try:
            silhouette_scores.append(silhouette_score(X_weight, labels))
        except ValueError:
            # silhouette_score rejects label sets outside 2..n_samples-1
            # (e.g. clusters collapsed on duplicate points); score that k as 0
            # instead of hiding every other kind of error.
            silhouette_scores.append(0)

    plt.plot(candidate_ks, silhouette_scores, marker='o')
    plt.title('Silhouette Score vs Number of Clusters')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')

    # Hierarchical clustering dendrogram
    plt.subplot(1, 3, 3)

    Z = linkage(X_weight, 'ward')
    dendrogram(Z)
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('Sample index')
    plt.ylabel('Distance')

    plt.tight_layout()
    plt.show()

    if silhouette_scores:
        best_position = silhouette_scores.index(max(silhouette_scores))
        return candidate_ks[best_position]
    return 3  # Default if analysis fails

# The cluster count is fixed at 3; the commented call below runs the full
# exploration (elbow, silhouette, dendrogram) instead.
suggested_clusters = 3
# suggested_clusters = explore_optimal_clusters(X_weight, max_clusters=100)
print(f"Suggested optimal number of clusters based on analysis: {suggested_clusters}")

### Optuna Objective

In [28]:
def objective_kmeans(trial):
    """Optuna objective: silhouette score of a KMeans fit on the global `X_weight`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies n_clusters, init, n_init, max_iter, algorithm and random_state.

    Returns
    -------
    float
        Silhouette score of the resulting labels, or -1.0 when the fit fails
        or every sample ends up in one cluster.
    """
    n_clusters = trial.suggest_int('n_clusters', 2, 3)
    # n_clusters = trial.suggest_int('n_clusters', 2, 10)
    init_method = trial.suggest_categorical('init', ['k-means++', 'random'])
    n_init = trial.suggest_int('n_init', 1, 10)
    max_iter = trial.suggest_int('max_iter', 100, 1000)
    # 'lloyd' is the current name of the classic algorithm. The former
    # 'auto'/'full' aliases are rejected by recent scikit-learn releases, which
    # made every trial that sampled them fail and score -1.0.
    algorithm = trial.suggest_categorical('algorithm', ['lloyd', 'elkan'])
    random_state = trial.suggest_int('random_state', 0, 1000)

    model = KMeans(
        n_clusters=n_clusters,
        init=init_method,
        n_init=n_init,
        max_iter=max_iter,
        algorithm=algorithm,
        random_state=random_state
    )

    try:
        labels = model.fit_predict(X_weight)
        if len(set(labels)) <= 1:  # Check if all samples in same cluster
            return -1.0
        score = silhouette_score(X_weight, labels)
        return score
    except Exception as e:
        print(f"Error in KMeans: {e}")
        return -1.0

def objective_bisecting_kmeans(trial):
    """Optuna objective: silhouette score of BisectingKMeans on the global `X_weight`.

    Returns -1.0 when fitting or scoring raises, or when all samples fall into
    a single cluster.
    """
    # Dict entries are evaluated top to bottom, so the parameters are sampled
    # in the order listed here.
    # (wider search kept for reference: trial.suggest_int('n_clusters', 2, 10))
    params = {
        'n_clusters': trial.suggest_int('n_clusters', 2, 3),
        'init': trial.suggest_categorical('init', ['k-means++', 'random']),
        'n_init': trial.suggest_int('n_init', 1, 10),
        'max_iter': trial.suggest_int('max_iter', 100, 1000),
        'random_state': trial.suggest_int('random_state', 0, 1000),
    }
    model = BisectingKMeans(**params)

    try:
        labels = model.fit_predict(X_weight)
        if len(set(labels)) > 1:
            return silhouette_score(X_weight, labels)
        return -1.0
    except Exception as e:
        print(f"Error in BisectingKMeans: {e}")
        return -1.0


def objective_gmm(trial):
    """Optuna objective: silhouette score of a GaussianMixture on the global `X_weight`.

    Returns -1.0 when fitting or scoring raises, or when all samples are
    assigned to a single component.
    """
    # Dict entries are evaluated top to bottom, so the parameters are sampled
    # in the order listed here.
    # (wider search kept for reference: trial.suggest_int('n_components', 2, 10))
    params = {
        'n_components': trial.suggest_int('n_components', 2, 3),
        'covariance_type': trial.suggest_categorical(
            'covariance_type', ['full', 'tied', 'diag', 'spherical']
        ),
        'init_params': trial.suggest_categorical('init_params', ['kmeans', 'random']),
        'random_state': trial.suggest_int('random_state', 0, 1000),
    }
    model = GaussianMixture(**params)

    try:
        labels = model.fit_predict(X_weight)
        if len(set(labels)) > 1:
            return silhouette_score(X_weight, labels)
        return -1.0
    except Exception as e:
        print(f"Error in GMM: {e}")
        return -1.0

def objective_hdbscan(trial):
    """Optuna objective: silhouette score of HDBSCAN on the global `X_weight`.

    Noise points (label -1) are excluded before scoring. Returns -1.0 when
    fewer than two real clusters are found, when at most one non-noise point
    remains, or when fitting/scoring raises.
    """
    model = HDBSCAN(
        min_cluster_size=trial.suggest_int('min_cluster_size', 5, 50),
        min_samples=trial.suggest_int('min_samples', 2, 10),
        prediction_data=True,
    )

    try:
        labels = model.fit_predict(X_weight)

        # Clusters other than the noise label; at least two are required.
        real_clusters = set(labels) - {-1}
        if len(real_clusters) < 2:
            return -1.0

        # Drop noise points before scoring.
        keep = labels != -1
        if np.sum(keep) <= 1:
            return -1.0

        return silhouette_score(X_weight[keep], labels[keep])
    except Exception as e:
        print(f"Error in HDBSCAN: {e}")
        return -1.0

def objective_dbscan(trial):
    """Optuna objective: noise-aware silhouette score of DBSCAN on the global `X_weight`.

    Noise points (label -1) are excluded before scoring, and the score is
    multiplied by ``1 - noise_ratio`` when more than half of the samples are
    noise. Returns -1.0 when fewer than two real clusters are found, when at
    most one non-noise point remains, or when fitting/scoring raises.
    """
    model = DBSCAN(
        eps=trial.suggest_float('eps', 0.1, 2.0, log=True),
        min_samples=trial.suggest_int('min_samples', 2, 10),
    )

    try:
        labels = model.fit_predict(X_weight)
        found = set(labels)

        # At least two clusters besides the noise label are required.
        if len(found - {-1}) < 2:
            return -1.0

        if -1 not in found:
            return silhouette_score(X_weight, labels)

        # Score only the non-noise points.
        keep = labels != -1
        if keep.sum() <= 1:  # Not enough non-noise points
            return -1.0
        score = silhouette_score(X_weight[keep], labels[keep])

        # Penalize solutions in which more than 50% of the points are noise.
        noise_ratio = np.sum(labels == -1) / len(labels)
        if noise_ratio > 0.5:
            score *= (1 - noise_ratio)

        return score
    except Exception as e:
        print(f"Error in DBSCAN: {e}")
        return -1.0

def objective_optics(trial):
    """Optuna objective: noise-aware silhouette score of OPTICS on the global `X_weight`.

    Noise points (label -1) are excluded before scoring, and the score is
    multiplied by ``1 - noise_ratio`` when more than half of the samples are
    noise. Returns -1.0 when fewer than two real clusters are found, when at
    most one non-noise point remains, or when fitting/scoring raises.
    """
    # Keyword arguments are evaluated left to right, so the parameters are
    # sampled in the order listed here.
    model = OPTICS(
        min_samples=trial.suggest_int('min_samples', 2, 10),
        xi=trial.suggest_float('xi', 0.01, 0.3),
        min_cluster_size=trial.suggest_float('min_cluster_size', 0.05, 0.2),
        cluster_method=trial.suggest_categorical('cluster_method', ['xi', 'dbscan']),
    )

    try:
        labels = model.fit_predict(X_weight)
        found = set(labels)

        # At least two clusters besides the noise label are required.
        if len(found - {-1}) < 2:
            return -1.0

        if -1 not in found:
            return silhouette_score(X_weight, labels)

        # Score only the non-noise points.
        keep = labels != -1
        if keep.sum() <= 1:  # Not enough non-noise points
            return -1.0
        score = silhouette_score(X_weight[keep], labels[keep])

        # Penalize solutions in which more than 50% of the points are noise.
        noise_ratio = np.sum(labels == -1) / len(labels)
        if noise_ratio > 0.5:
            score *= (1 - noise_ratio)

        return score
    except Exception as e:
        print(f"Error in OPTICS: {e}")
        return -1.0

def objective_meanshift(trial):
    """Optuna objective: silhouette score of MeanShift on the global `X_weight`.

    Returns -1.0 when only one label is produced, when the number of labels
    reaches ``n_samples - 1`` or more, or when fitting/scoring raises. Above
    10 labels the score is reduced by 1% per additional label.
    """
    # Keyword arguments are evaluated left to right, so the parameters are
    # sampled in the order listed here.
    model = MeanShift(
        bandwidth=trial.suggest_float('bandwidth', 0.3, 2.0),
        bin_seeding=trial.suggest_categorical('bin_seeding', [True, False]),
        cluster_all=trial.suggest_categorical('cluster_all', [True, False]),
    )

    try:
        labels = model.fit_predict(X_weight)
        n_clusters = len(set(labels))

        # Reject both degenerate extremes: one label, or almost one per sample.
        if n_clusters <= 1 or n_clusters >= X_weight.shape[0] - 1:
            return -1.0

        score = silhouette_score(X_weight, labels)

        # Slightly penalize for too many clusters
        excess = n_clusters - 10
        if excess > 0:
            score *= (1 - excess * 0.01)

        return score
    except Exception as e:
        print(f"Error in MeanShift: {e}")
        return -1.0

def objective_agglomerative(trial):
    """Optuna objective: silhouette score of AgglomerativeClustering on the global `X_weight`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies n_clusters and linkage; `metric` is sampled only for
        non-ward linkages.

    Returns
    -------
    float
        Silhouette score of the resulting labels, or -1.0 when fitting or
        scoring raises or all samples fall into a single cluster.
    """
    n_clusters = trial.suggest_int('n_clusters', 2, 10)
    # Local name differs from the parameter name so that scipy's `linkage`
    # function imported by this notebook is not shadowed.
    linkage_method = trial.suggest_categorical('linkage', ['ward', 'complete', 'average', 'single'])

    params = {'n_clusters': n_clusters, 'linkage': linkage_method}
    if linkage_method != 'ward':
        # Ward linkage only supports the euclidean metric. Sampling `metric`
        # for ward and overriding it afterwards left the overridden value in
        # the trial's params, so `study.best_params` could describe a
        # ward + non-euclidean model that cannot be rebuilt. For ward the
        # parameter is therefore not sampled and the estimator default is used.
        params['metric'] = trial.suggest_categorical('metric', ['euclidean', 'l1', 'l2', 'manhattan'])

    model = AgglomerativeClustering(**params)

    try:
        labels = model.fit_predict(X_weight)
        if len(set(labels)) <= 1:
            return -1.0
        score = silhouette_score(X_weight, labels)
        return score
    except Exception as e:
        print(f"Error in AgglomerativeClustering: {e}")
        return -1.0

### Optuna Optimization

In [None]:
# Objective function per algorithm; commented entries are excluded from the run.
algorithms = {
    'KMeans': objective_kmeans,
    'BisectingKMeans': objective_bisecting_kmeans,
    'GaussianMixture': objective_gmm,
    # 'HDBSCAN': objective_hdbscan,
    # 'DBSCAN': objective_dbscan,
    # 'OPTICS': objective_optics,
    # 'MeanShift': objective_meanshift,
    # 'AgglomerativeClustering': objective_agglomerative,
}

n_trials = 200
results = {}       # algorithm name -> best silhouette score found
best_params = {}   # algorithm name -> parameters of the best trial

print("\n--- Running Hyperparameter Optimization with Optuna ---")
for algo_name, objective in algorithms.items():
    print(f"\nOptimizing {algo_name}...")
    # A seeded sampler makes the sequence of suggested parameters, and hence
    # the selected best trial, repeatable across runs of this cell.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=n_trials)

    results[algo_name] = study.best_value
    best_params[algo_name] = study.best_params

    print(f"Best parameters for {algo_name}: {study.best_params}")
    print(f"Best silhouette score: {study.best_value:.4f}")

best_models = {}
evaluation_results = {}

### Model Evaluation

In [None]:
def compute_sse(X, labels, centroids):
    """Return the sum of squared distances of samples to their cluster centroid.

    Cluster ``k`` is formed by the rows of `X` where ``labels == k`` and is
    compared against ``centroids[k]``; clusters without any rows are skipped.
    The result is a plain Python float.
    """
    total = 0.0
    for cluster_id in range(len(centroids)):
        members = X[labels == cluster_id]
        if len(members) == 0:
            continue
        center = np.array(centroids[cluster_id]).reshape(1, -1)
        deviations = np.array(members) - center
        total += np.sum(np.square(deviations))
    return float(total)

# Estimator class for every algorithm name that may appear in `best_params`.
model_factories = {
    'KMeans': KMeans,
    'BisectingKMeans': BisectingKMeans,
    'GaussianMixture': GaussianMixture,
    'HDBSCAN': HDBSCAN,
    'DBSCAN': DBSCAN,
    'OPTICS': OPTICS,
    'MeanShift': MeanShift,
    'AgglomerativeClustering': AgglomerativeClustering,
}

evaluation_results = {}   # algorithm name -> metrics, fitted model, cluster count, noise ratio
best_models = {}          # algorithm name -> estimator built from its best parameters

for algo_name, params in best_params.items():
    factory = model_factories.get(algo_name)
    if factory is None:
        # Without this guard `model` would be undefined, or left over from the
        # previous iteration, for a name that has no constructor.
        print(f"{algo_name}: Unknown algorithm - skipped")
        continue

    if algo_name == 'HDBSCAN':
        # Same setting as in the HDBSCAN objective; it is needed for
        # hdbscan.prediction.approximate_predict on the fitted model.
        model = factory(**{'prediction_data': True, **params})
    else:
        model = factory(**params)

    best_models[algo_name] = model

    try:
        model.fit(X_weight)

        # GaussianMixture exposes no `labels_`; its assignments are obtained
        # from the already fitted model instead of fitting a second time.
        if hasattr(model, 'labels_'):
            labels = model.labels_
        else:
            labels = model.predict(X_weight)

        labels = np.array(labels)
        unique_labels = set(labels)
        n_clusters = len(unique_labels)

        if -1 in unique_labels:
            # Label -1 marks noise: it is not counted as a cluster and the
            # noise points are left out of the metric computation.
            n_clusters -= 1
            noise_ratio = np.sum(labels == -1) / len(labels)
            non_noise_mask = labels != -1
            X_eval = X_weight[non_noise_mask]
            labels_eval = labels[non_noise_mask]
        else:
            noise_ratio = 0
            X_eval = X_weight
            labels_eval = labels

        if len(set(labels_eval)) <= 1:
            print(f"{algo_name}: Invalid clustering (insufficient non-noise points)")
            evaluation_results[algo_name] = {
                'silhouette': -1,
                'calinski_harabasz': -1,
                'davies_bouldin': -1,
                'sse': -1,
                'model': model,
                'n_clusters': n_clusters,
                'noise_ratio': noise_ratio
            }
            continue

        # Compute the evaluation metrics
        sil_score = silhouette_score(X_eval, labels_eval)
        ch_score = calinski_harabasz_score(X_eval, labels_eval)
        db_score = davies_bouldin_score(X_eval, labels_eval)

        # SSE
        if algo_name == 'GaussianMixture':
            # The component means act as centroids for the SSE.
            sse = compute_sse(np.array(X_eval), labels_eval, model.means_)

            evaluation_results[algo_name] = {
                'silhouette': float(sil_score),
                'calinski_harabasz': float(ch_score),
                'davies_bouldin': float(db_score),
                'sse': float(sse),
                'model': model,
                'n_clusters': params['n_components'],
                'noise_ratio': 0.0
            }

            print(f"{algo_name}\nSilhouette={sil_score:.4f}, Calinski-Harabasz={ch_score:.1f}, Davies-Bouldin={db_score:.2f}, SSE={sse:.2f}, Clusters={n_clusters}")

        else:
            if hasattr(model, "inertia_"):
                sse = model.inertia_
            elif hasattr(model, "cluster_centers_"):
                sse = compute_sse(X_eval, labels_eval, model.cluster_centers_)
            else:
                # No centers exposed by the estimator: use the per-cluster means.
                centroids = np.array([X_eval[labels_eval == i].mean(axis=0) for i in np.unique(labels_eval)])
                sse = compute_sse(X_eval, labels_eval, centroids)

            print(f"{algo_name}\nSilhouette={sil_score:.4f}, Calinski-Harabasz={ch_score:.1f}, Davies-Bouldin={db_score:.2f}, SSE={sse:.2f}, Clusters={n_clusters}, Noise={noise_ratio*100:.1f}%\n")

            evaluation_results[algo_name] = {
                'silhouette': sil_score,
                'calinski_harabasz': ch_score,
                'davies_bouldin': db_score,
                'sse': sse,
                'model': model,
                'n_clusters': n_clusters,
                'noise_ratio': noise_ratio
            }

    except Exception as e:
        # A failing algorithm is recorded with sentinel values so the
        # remaining algorithms are still evaluated.
        print(f"{algo_name}: Error in evaluation - {str(e)}")
        evaluation_results[algo_name] = {
            'silhouette': -1,
            'calinski_harabasz': -1,
            'davies_bouldin': -1,
            'sse': -1,
            'model': model,
            'n_clusters': 0,
            'noise_ratio': 0
        }

In [None]:
sns.set_theme(style="whitegrid")

# One record per evaluated algorithm; the noise ratio is shown as a percentage.
records = [
    {
        'Algorithm': algo,
        'Silhouette': metrics['silhouette'],
        'Calinski-Harabasz': metrics['calinski_harabasz'],
        'Davies-Bouldin': metrics['davies_bouldin'],
        'SSE': metrics['sse'],
        'Clusters': metrics['n_clusters'],
        'Noise Ratio': metrics['noise_ratio'] * 100
    }
    for algo, metrics in evaluation_results.items()
]
results_df = pd.DataFrame(records)

metrics_to_plot = ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin', 'SSE']
n_metrics = len(metrics_to_plot)

fig, axes = plt.subplots(1, n_metrics, figsize=(5 * n_metrics, 6), sharey=False)

# Fixed colour per algorithm so that the bars match across the panels.
palette = sns.color_palette("husl", len(results_df['Algorithm']))
algo_colors = dict(zip(results_df['Algorithm'], palette))
bar_colors = [algo_colors[algo] for algo in results_df['Algorithm']]

# One bar chart per metric, side by side.
for ax, metric in zip(axes, metrics_to_plot):
    sns.barplot(
        data=results_df,
        x='Algorithm',
        y=metric,
        ax=ax,
        palette=bar_colors
    )

    ax.set_title(f'{metric} Score', fontsize=14)
    ax.set_xlabel('')
    ax.set_ylabel(metric)
    ax.tick_params(axis='x', rotation=45)

plt.suptitle('Clustering Evaluation Metrics by Algorithm', fontsize=16, y=1.05)
plt.tight_layout()
plt.show()


### Best Clustering Model

In [None]:
# Only results with a positive silhouette score are candidates.
valid_models = {
    name: outcome
    for name, outcome in evaluation_results.items()
    if outcome['silhouette'] > 0
}

if not valid_models:
    print("\nNo valid clustering models found with positive silhouette scores.")
    print("Consider revisiting your feature engineering or preprocessing steps.")
else:
    # The winner is the candidate with the highest silhouette score.
    best_algo, best_entry = max(
        valid_models.items(), key=lambda item: item[1]['silhouette']
    )
    best_score = best_entry['silhouette']
    best_model = best_entry['model']
    n_clusters = best_entry['n_clusters']

    print(f"\nBest clustering algorithm: {best_algo}")
    print(f"Best parameters: {best_params[best_algo]}")
    print(f"Silhouette score: {best_score:.4f}")
    print(f"Number of clusters: {n_clusters}")

    # Project the weighted features onto two principal components for plotting.
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_weight)

    # GaussianMixture has no `labels_`, so its assignments come from predict().
    cluster_labels = (
        best_model.predict(X_weight)
        if best_algo == 'GaussianMixture'
        else best_model.labels_
    )

    plt.figure(figsize=(10, 8))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis', s=50)
    plt.title(f'Clusters by {best_algo} (PCA projection)')
    plt.colorbar(label='Cluster')
    plt.show()

    if best_algo in ('KMeans', 'BisectingKMeans'):
        # Importance of a feature = spread of the cluster centers along it,
        # normalised so that all importances sum to 1.
        centroids = best_model.cluster_centers_
        center_spread = np.std(centroids, axis=0)
        feature_importance = center_spread / np.sum(center_spread)

        print("\nFeature importance for clustering:")
        for feature, importance in zip(features, feature_importance):
            print(f"{feature}: {importance:.4f}")

        plt.figure(figsize=(10, 6))
        plt.bar(features, feature_importance)
        plt.title('Feature Importance for Clustering')
        plt.xlabel('Features')
        plt.ylabel('Importance')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

## Save & Load Model

In [None]:
# Persist the winning estimator, but only when the selection cell actually
# produced one with a positive silhouette score.
model_ready = 'best_model' in locals() and best_score > 0
if model_ready:
    with open('best_model.pkl', 'wb') as model_file:
        pickle.dump(best_model, model_file)
    print("Best model saved as 'best_model.pkl'")

In [34]:
# Load the estimator written to 'best_model.pkl'.
# NOTE(review): pickle.load executes code embedded in the file - only load a
# pickle that was produced by this notebook or another trusted source.
with open('best_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [None]:
# A single new sample, given in the same raw feature columns as `data`.
new_data = pd.DataFrame({
    'sentences': [113],
    'page': [13],
    'timing': [30],
    'plagiarism': [74.00]
})

# Apply the already fitted scaler, then the same per-feature weights that were
# used to build `X_weight`.
X_new_scaled = pd.DataFrame(scaler.transform(new_data), columns=features)
X_new_weight = pd.DataFrame(X_new_scaled.to_numpy() * weights, columns=features)

print(X_new_weight)

In [None]:
# K-Means / Bisecting K-Means / GMM: assign the new sample to a cluster with
# the estimator's own predict().
prediction = loaded_model.predict(X_new_weight)
print("Prediction:", prediction)

# HDBSCAN has no predict(); use approximate_predict on the weighted features
# (the model is fitted on X_weight, so the raw `new_data` must not be passed):
# labels, strengths = hdbscan.prediction.approximate_predict(loaded_model, X_new_weight)
# print("Label:", labels)
# print("Cluster strengths:", strengths)

# Prediction on new data is only supported for KMeans, Bisecting KMeans, GMM and HDBSCAN.

# Large Language Model

In [37]:
# from langgraph.graph import StateGraph, END
# from langchain.schema import SystemMessage, HumanMessage
# from langchain_groq import ChatGroq

# # Inisialisasi Groq LLM
# llm = ChatGroq(temperature=0, model_name="llama3-8b-8192")

# # Node 1: input
# def input_node(state):
#     return state

# # Node 2: jawab
# def answer_node(state):
#     user_question = state["question"]
#     response = llm([
#         SystemMessage(content="Kamu adalah asisten AI."),
#         HumanMessage(content=user_question)
#     ])
#     return {"answer": response.content}

# # Bangun graph
# graph_builder = StateGraph()

# graph_builder.add_node("InputNode", input_node)
# graph_builder.add_node("AnswerNode", answer_node)
# graph_builder.set_entry_point("InputNode")
# graph_builder.add_edge("InputNode", "AnswerNode")
# graph_builder.add_edge("AnswerNode", END)

# # Compile graph
# graph = graph_builder.compile()

# # Generate gambar graph
# graph.get_graph().draw("langgraph_simple.png", format="png", prog="dot")

# print("✅ Diagram graph telah disimpan sebagai langgraph_simple.png")

# # Jalankan graph (opsional)
# state = {"question": "Apa itu LangGraph?"}
# result = graph.invoke(state)
# print("\nJawaban:")
# print(result["answer"])