In [None]:
!pip -q install supabase torch gliner

In [None]:
import os
import pickle
import warnings
from datetime import datetime

import numpy as np
import optuna
import pandas as pd
import torch
from gliner import GLiNER
from sklearn.base import clone
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, MeanShift, OPTICS
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler
from supabase import create_client, Client

warnings.filterwarnings('ignore')

# Model

In [None]:


# Load the pretrained GLiNER checkpoint, write a copy to a local directory,
# then reload it from that directory with local_files_only=True.
hub_checkpoint = "gliner-community/gliner_medium-v2.5"
local_checkpoint_dir = "gliner_Med"

model = GLiNER.from_pretrained(hub_checkpoint)
model.save_pretrained(local_checkpoint_dir)

loaded_model = GLiNER.from_pretrained(
    local_checkpoint_dir,
    load_tokenizer=True,
    local_files_only=True,
)


In [None]:
# Sample passage used to try out entity extraction.
text = """
Libretto by Marius Petipa, based on the 1822 novella ``Trilby, ou Le Lutin d'Argail`` by Charles Nodier, first presented by the Ballet of the Moscow Imperial Bolshoi Theatre on January 25/February 6 (Julian/Gregorian calendar dates), 1870, in Moscow with Polina Karpakova as Trilby and Ludiia Geiten as Miranda and restaged by Petipa for the Imperial Ballet at the Imperial Bolshoi Kamenny Theatre on January 17–29, 1871 in St. Petersburg with Adèle Grantzow as Trilby and Lev Ivanov as Count Leopold.
"""

# Entity types the model is asked to look for.
labels = ["person", "book", "location", "date", "actor", "character"]

# Run extraction with a 0.4 threshold.
entities = loaded_model.predict_entities(text, labels, threshold=0.4)

# One line per detected entity: "<span text> => <label>".
for found in entities:
    print(f'{found["text"]} => {found["label"]}')

In [None]:
# Supabase connection settings come from the environment so that no API key is
# stored in the notebook. Set SUPABASE_KEY (and optionally SUPABASE_URL) before
# starting the kernel.
# NOTE(review): an API key was previously committed in this cell; consider
# rotating it in the Supabase dashboard.
url: str = os.environ.get("SUPABASE_URL", "https://alwocqtpmrlfebnjjtct.supabase.co")
key: str = os.environ.get("SUPABASE_KEY", "")
if not key:
    raise RuntimeError(
        "SUPABASE_KEY is not set. Export your Supabase API key as the "
        "SUPABASE_KEY environment variable before running this cell."
    )

supabase: Client = create_client(url, key)

# Name of the table to load
table_name = "documents"

# Select all columns from the table and load the returned rows into a DataFrame
response = supabase.table(table_name).select("*").execute()

df = pd.DataFrame(response.data)
df.head()

## Data preprocessing

In [None]:
# Parse both timestamp columns. utc=True normalizes every value to UTC, so
# timezone-aware and naive inputs (or mixed offsets) can be subtracted without
# a tz-naive/tz-aware mismatch error. Re-running this cell is safe.
df['deadline'] = pd.to_datetime(df['deadline'], utc=True)
df['uploadedDate'] = pd.to_datetime(df['uploadedDate'], utc=True)

# Hours from upload to deadline (positive when uploaded before the deadline)
timing_hours = (df['deadline'] - df['uploadedDate']).dt.total_seconds() / 3600

# Fail with a clear message instead of an opaque integer-cast error when a
# timestamp is missing.
n_missing = int(timing_hours.isna().sum())
if n_missing:
    raise ValueError(
        f"{n_missing} row(s) have a missing 'deadline' or 'uploadedDate'; "
        "fill or drop them before computing 'timing'."
    )

# Whole hours; the integer cast truncates toward zero
df['timing'] = timing_hours.astype(int)

In [None]:
# Three cut-off values (presumably plagiarism percentages - TODO confirm).
plagiarism_rule = [40, 50, 60]

# Unpack the cut-offs, in list order, into individually named thresholds.
no_plagiarism, maybe_plagiarism, plagiarim = plagiarism_rule

In [None]:
def _max_plagiarism_percent(matches):
    """Return the highest score found in ``matches`` as a percentage.

    Parameters
    ----------
    matches : list of dict, None, or number
        Raw cell value. A list of dicts is flattened and the largest value is
        multiplied by 100 and rounded to 2 decimals. ``None``, an empty list,
        a list of empty dicts, or NaN yields ``0.0``. A plain number is assumed
        to be an already-converted percentage (the cell was run before) and is
        returned unchanged, which makes re-running the cell safe.

    Returns
    -------
    float
        Percentage score.
    """
    # Already numeric: either a previous run's result or a missing value (NaN)
    if isinstance(matches, (int, float, np.integer, np.floating)):
        return 0.0 if pd.isna(matches) else float(matches)
    if not matches:
        return 0.0
    scores = [value for item in matches for value in item.values()]
    if not scores:
        # Non-empty list whose dicts are all empty: nothing to take the max of
        return 0.0
    return round(max(scores) * 100, 2)


df['plagiarism'] = df['plagiarism'].apply(_max_plagiarism_percent)

In [None]:
# Preview only the first rows instead of rendering the whole table
df.head()

In [None]:
# Columns that are scaled and clustered later in the notebook
feature_columns = ['sentences', 'page', 'timing', 'plagiarism']
data = df[feature_columns]
data.head()

In [None]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data)

# Candidate clustering models with fixed hyperparameters
models = {
    'KMeans': KMeans(n_clusters=3, random_state=42),
    'AgglomerativeClustering': AgglomerativeClustering(n_clusters=3),
    'DBSCAN': DBSCAN(eps=1.2, min_samples=5)
}

# model name -> silhouette score (float), or a message explaining why the
# model could not be scored
results = {}

# The loop names are deliberately different from the notebook-level `model`
# and `labels` used by the GLiNER cells, so this cell does not overwrite them.
for model_name, estimator in models.items():
    try:
        cluster_labels = estimator.fit_predict(X_scaled)
        # DBSCAN marks noise points with the label -1; noise is not a cluster,
        # so it is excluded when counting how many clusters were found.
        found_labels = set(np.unique(cluster_labels).tolist())
        n_clusters = len(found_labels - {-1})
        if n_clusters > 1:  # silhouette is undefined for a single cluster
            results[model_name] = float(silhouette_score(X_scaled, cluster_labels))
        else:
            results[model_name] = "Invalid (Single Cluster)"
    except Exception as e:
        results[model_name] = f"Error: {str(e)}"

# Show the evaluation results
print('Evaluasi Model:')
for model_name, score in results.items():
    print(f"{model_name}: {score}")

# Pick the best model among those with a real score. Filtering first avoids a
# numeric sentinel such as -1, which is itself a possible silhouette value.
valid_scores = {name: score for name, score in results.items() if isinstance(score, float)}
best_model_name = max(valid_scores, key=valid_scores.get) if valid_scores else None

if best_model_name is not None:
    # Already fitted on X_scaled inside the loop above; no refit needed.
    # NOTE: of these candidates only KMeans offers predict() for unseen rows.
    best_model = models[best_model_name]

    with open('best_model.pkl', 'wb') as file:
        pickle.dump(best_model, file)

    print(f"\nModel terbaik '{best_model_name}' berhasil disimpan sebagai 'best_model.pkl'.")
else:
    print("Tidak ada model yang valid untuk disimpan.")

In [None]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data)

# Base estimators; grid values are applied to fresh clones below.
# KMeans is seeded so repeated runs give the same clustering.
models = {
    'KMeans': KMeans(random_state=42),
    'AgglomerativeClustering': AgglomerativeClustering(),
    'DBSCAN': DBSCAN(),
    'MeanShift': MeanShift(),
    'OPTICS': OPTICS()
}

# Hyperparameter grids
params = {
    'KMeans': {'n_clusters': [2, 3, 4, 5]},
    'AgglomerativeClustering': {'n_clusters': [2, 3, 4, 5]},
    'DBSCAN': {'eps': [0.5, 1.0, 1.5], 'min_samples': [5, 10]},
    'MeanShift': {},  # no tuning
    'OPTICS': {'min_samples': [5, 10], 'xi': [0.05, 0.1, 0.2]}
}

# model name -> (silhouette score, fitted estimator), or
# ('Invalid (Single Cluster)', None) when no grid point gave a usable clustering
results = {}
n_samples = X_scaled.shape[0]

# Each grid point is scored directly with the silhouette coefficient on the
# full data. This replaces a GridSearchCV run that scored against an all-zeros
# dummy target and could not score estimators lacking predict().
for model_name, base_model in models.items():
    top_score, top_estimator = None, None

    # ParameterGrid({}) yields one empty combination, so a model without a
    # grid (MeanShift) is evaluated once with its defaults.
    for combo in ParameterGrid(params[model_name]):
        candidate = clone(base_model).set_params(**combo)
        try:
            cluster_labels = candidate.fit_predict(X_scaled)
        except ValueError:
            # Combination not usable on this data (e.g. more clusters than
            # samples); skip it rather than abort the whole search.
            continue

        # Label -1 marks noise (DBSCAN/OPTICS) and is not counted as a cluster.
        found_labels = set(np.unique(cluster_labels).tolist())
        n_clusters = len(found_labels - {-1})
        # silhouette_score needs at least 2 and at most n_samples - 1 labels
        if n_clusters < 2 or len(found_labels) >= n_samples:
            continue

        score = float(silhouette_score(X_scaled, cluster_labels))
        if top_score is None or score > top_score:
            top_score, top_estimator = score, candidate

    if top_estimator is not None:
        results[model_name] = (top_score, top_estimator)
    else:
        results[model_name] = ('Invalid (Single Cluster)', None)

# Show the evaluation results
print('Evaluasi Model:')
for model_name, (score, _) in results.items():
    print(f"{model_name}: {score}")

# Save the best model, chosen only among entries that have a fitted estimator
valid_results = {name: entry for name, entry in results.items() if entry[1] is not None}
if valid_results:
    best_model_name = max(valid_results, key=lambda name: valid_results[name][0])
    best_model = valid_results[best_model_name][1]
else:
    best_model_name, best_model = None, None

if best_model is not None:
    with open('best_model.pkl', 'wb') as file:
        pickle.dump(best_model, file)
    print(f"\nModel terbaik '{best_model_name}' berhasil disimpan sebagai 'best_model.pkl'.")
else:
    print("Tidak ada model yang valid untuk disimpan.")

In [None]:
# prompt: write code that reads the pickle and makes a prediction for new data

# Load the saved clustering model.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
# NOTE(review): this rebinds `loaded_model`, which the NER cell earlier in the
# notebook uses for the GLiNER model.
with open('best_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# New data for prediction
new_data = pd.DataFrame({
    'sentences': [100],  # Replace with your actual data
    'page': [5],  # Replace with your actual data
    'timing': [48],  # Replace with your actual data
    'plagiarism': [80.00]  # Replace with your actual data
})

# Scale with the `scaler` already fitted on the training data in the clustering
# cell. Fitting a new StandardScaler on a single row would subtract that row
# from itself and turn every feature into 0.
X_new_scaled = scaler.transform(new_data)

# Make predictions
if hasattr(loaded_model, 'predict'):
    # Assign the new rows to the clusters learned during training
    prediction = loaded_model.predict(X_new_scaled)
else:
    # The saved estimator has no predict(), so it cannot label unseen rows on
    # its own. Re-cluster the training data together with the new rows and
    # report the labels of the new rows. Cluster numbering in this re-fit may
    # differ from the numbering of the originally saved fit.
    combined = np.vstack([X_scaled, X_new_scaled])
    prediction = loaded_model.fit_predict(combined)[-len(new_data):]

print("Prediction:", prediction)