In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


In [None]:
# Load the raw event log; 'events.csv' is resolved relative to the notebook's working directory.
df = pd.read_csv('events.csv')
# Preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Convert the time column to datetime.
# errors='coerce' maps unparseable values to NaT instead of raising, so bad
# timestamps show up later as missing values rather than as an exception.
df['event_time'] = pd.to_datetime(df['event_time'], errors='coerce')

df.head()

In [None]:
# Re-check dtypes and non-null counts after the event_time conversion.
df.info()

In [None]:
# Partition the column names by dtype: numeric columns vs. everything else.
numerik = list(df.select_dtypes(include="number").columns)
kategori = list(df.select_dtypes(exclude="number").columns)

# Preview only the numeric columns.
df.loc[:, numerik].head()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Summary statistics for the object-dtype (string) columns.
df.describe(include = 'object')

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows; the original index labels are kept (no reset).
df = df.drop_duplicates()

In [None]:
# Sanity check: the duplicate count should now be 0.
df.duplicated().sum()

In [None]:
# Missing-value count per column.
df.isna().sum()

##### Nilai kosong pada kolom category_code dan brand tidak dihapus karena merepresentasikan kondisi nyata pada data e-commerce, sehingga diisi dengan label unknown. Sementara itu, nilai kosong pada kolom user_session dihapus karena jumlahnya sangat kecil dan berpotensi mengganggu analisis berbasis sesi.

In [None]:
# Keep rows that lack a category or brand by labelling the gap "unknown",
# then discard the rows that have no session id.
df = df.assign(
    category_code=df["category_code"].fillna("unknown"),
    brand=df["brand"].fillna("unknown"),
).dropna(subset=["user_session"])

df.head()

In [None]:
# IQR-based outlier check.
#
# Identifier columns are numeric in dtype but are labels, not measurements, so
# quartile fences on them are meaningless and would flag rows purely because of
# their id value. They are excluded here by their "_id" suffix (user_id is one
# such column). NOTE(review): confirm every identifier column follows that naming.
measure_cols = [col for col in numerik if not str(col).endswith("_id")]

Q1 = df[measure_cols].quantile(0.25)
Q3 = df[measure_cols].quantile(0.75)
IQR = Q3 - Q1

# Fences at 1.5 * IQR below Q1 and above Q3.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Select the outlier rows: at least one measured column lies outside its fences.
outside_fences = (df[measure_cols] < lower_bound) | (df[measure_cols] > upper_bound)
df_outlier = df[outside_fences.any(axis=1)]
print('Data outlier: ', df_outlier.shape[0], 'baris')

##### Berdasarkan metode IQR, sejumlah data teridentifikasi sebagai outlier. Namun data tersebut tidak dihapus karena masih merepresentasikan nilai transaksi yang valid dan relevan dalam analisis perilaku pengguna e-commerce.

## Feature Engineering

In [None]:
#1. Per-user activity counts
#
# Each count is a boolean flag summed in one grouped pass. This replaces a
# Python lambda that was invoked once per user for every event type, which is
# slow on a large event log; the resulting columns and values are the same.

event_type = df["event_type"]
event_flags = pd.DataFrame(
    {
        # Non-null events only, matching what a grouped "count" reports.
        "total_events": event_type.notna(),
        "n_view": event_type.eq("view"),
        "n_cart": event_type.eq("cart"),
        "n_remove": event_type.eq("remove_from_cart"),
        "n_purchase": event_type.eq("purchase"),
    }
).astype("int64")  # integer flags so the grouped sums are integer counts

# event_flags shares df's index, so df["user_id"] aligns row-for-row as the key.
user_activity = event_flags.groupby(df["user_id"]).sum().reset_index()

user_activity.head()

In [None]:
#2. Price features (shopping behaviour)
#
# total_spent is the sum of price over purchase events only. It is computed from
# a masked column instead of a per-group lambda that looked the event type up in
# the global frame by index label (slow, and ambiguous if labels ever repeat).

is_purchase = df["event_type"].eq("purchase")
price_frame = pd.DataFrame(
    {
        "user_id": df["user_id"],
        "price": df["price"],
        # Price on purchase rows, 0 elsewhere, so a plain grouped sum yields the
        # amount spent; users without purchases end up with 0.
        "purchase_price": df["price"].where(is_purchase, 0),
    }
)

user_price = price_frame.groupby("user_id").agg(
    avg_price=("price", "mean"),
    max_price=("price", "max"),
    total_spent=("purchase_price", "sum"),
).reset_index()

user_price.head()

In [None]:
#3. Duration and intensity of each user's activity

per_user = df.groupby("user_id")

# First/last event timestamp and number of distinct sessions per user.
user_time = pd.DataFrame(
    {
        "first_event": per_user["event_time"].min(),
        "last_event": per_user["event_time"].max(),
        "n_sessions": per_user["user_session"].nunique(),
    }
).reset_index()

# Time between a user's first and last event, expressed in hours.
active_span = user_time["last_event"] - user_time["first_event"]
user_time["active_duration_hours"] = active_span.dt.total_seconds() / 3600

user_time.head()

In [None]:
#4. Combine every per-user feature table into one frame, keyed on user_id

# Only the derived time features are carried over; the raw first/last
# timestamps stay behind in user_time.
time_features = user_time[["user_id", "active_duration_hours", "n_sessions"]]

user_df = pd.merge(user_activity, user_price, on="user_id", how="left")
user_df = pd.merge(user_df, time_features, on="user_id", how="left")

user_df.head()

## EDA 

In [None]:
#1. Distribution of total activity per user
#
# The plot shows users with 1 to 50 events. The bins are built for exactly that
# window (one unit-wide bin centred on each integer count). Binning the full
# range into 50 bins and then zooming in would leave only a few wide bars
# whenever some users have very large event counts.

fig, ax = plt.subplots()
user_df["total_events"].hist(bins=np.arange(0.5, 51.5, 1.0), ax=ax)
# Half a unit of padding so the first and last bars are not cut in half.
ax.set_xlim(0.5, 50.5)
ax.set_title("Distribusi Total Events per User (1–50)")
ax.set_xlabel("Total Events")
ax.set_ylabel("Jumlah User")
plt.show()

In [None]:
#2. Correlation heatmap

# Pairwise correlations between the per-user features; the id column is left out.
corr = user_df.drop(columns=["user_id"]).corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap Fitur User")
plt.show()

In [None]:
# Feature selection:
#   - n_remove: dropped for low variance
#   - n_view: dropped because it is very highly correlated with total_events
#   - avg_price and max_price: dropped because they are highly correlated
dropped_columns = ["user_id", "n_remove", "n_view", "avg_price", "max_price"]
features = user_df.drop(columns=dropped_columns, errors="ignore")
# Not rendered by the notebook: only a cell's last expression is displayed.
features.head()

# Correlations among the features that remain after selection.
corr_final = features.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_final, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap Fitur User (Setelah Feature Selection)")
plt.show()

In [None]:
# Persist the cleaned event-level frame to CSV, without writing the index as a column.
df.to_csv('events_fixx.csv', index=False)

In [None]:
## User level aggregation & clustering model

## Pemodelan

In [None]:
# Model inputs: the per-user features that survive feature selection.
#
# Rebuilding `features` from every column except user_id would silently bring
# back the columns rejected during feature selection (n_remove, n_view,
# avg_price, max_price), so the same columns are dropped again here.
# "cluster" is excluded as well: it is added to user_df by the clustering step,
# and re-running this cell afterwards must not feed the label back in as a feature.
# errors="ignore" keeps the cell runnable whether or not "cluster" exists yet.
excluded_columns = ["user_id", "n_remove", "n_view", "avg_price", "max_price", "cluster"]
features = user_df.drop(columns=excluded_columns, errors="ignore")

# Standardize each feature to zero mean and unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

In [None]:
# Fit a PCA that keeps the first two principal components, then project the
# standardized features onto them. (PCA is imported in the notebook's first cell.)
pca = PCA(n_components=2)
X_pca = pca.fit(X_scaled).transform(X_scaled)

In [None]:
# Component scores of every user under the fitted 2-component PCA.
pcs = pca.transform(X_scaled)

# Same scores as a labelled frame, one column per component.
df_pca = pd.DataFrame({'PC 1': pcs[:, 0], 'PC 2': pcs[:, 1]})
df_pca.head()

In [None]:
# Share of total variance captured by each of the two retained components.
pca.explained_variance_ratio_

In [None]:
# Scree plot (used to choose the number of PCA components)

# PCA with no component limit, so every component's variance share is available.
pca_full = PCA().fit(X_scaled)
expl_var = pca_full.explained_variance_ratio_
component_numbers = range(1, len(expl_var) + 1)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(component_numbers, expl_var, marker='o')
ax.set_xlabel("Principal Component")
ax.set_ylabel("Explained Variance Ratio")
ax.set_title("Scree Plot")
plt.show()


In [None]:
# Elbow method (used to choose the number of clusters k)

K = range(1, 11)

# Inertia of a KMeans model fitted for each candidate k.
inertias = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled).inertia_
    for k in K
]

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(K, inertias, marker='o')
ax.set_xlabel("Jumlah Cluster (k)")
ax.set_ylabel("Inertia")
ax.set_title("Elbow Method")
plt.show()

In [None]:
# Fit a 2-cluster KMeans on the standardized features and keep each user's label.
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(X_scaled)
clusters = kmeans.labels_

# Attach the label to the per-user frame (rows are in the same order as X_scaled).
user_df["cluster"] = clusters
user_df.head()

In [None]:
# Check the number of members in each cluster.
user_df["cluster"].value_counts()

In [None]:
# Cluster interpretation: mean of every feature within each cluster.
# user_id is an identifier, so its mean carries no meaning and is left out of the profile.
cluster_profile = user_df.drop(columns=["user_id"]).groupby("cluster").mean()
cluster_profile

In [None]:
# 2-D PCA projection of the standardized features, for plotting.
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_scaled)

# Copy of the per-user frame with the two projected coordinates appended.
df_vis = user_df.assign(PC1=X_pca_2d[:, 0], PC2=X_pca_2d[:, 1])

In [None]:
# Projected coordinates plus the cluster label of each user.
df_pca = pd.DataFrame(X_pca_2d, columns=["PC 1", "PC 2"]).assign(clusters=clusters)

# Scatter of the 2-D projection, coloured by cluster.
fig, ax = plt.subplots(figsize=(15, 5))
sns.scatterplot(data=df_pca, x="PC 1", y="PC 2", hue="clusters", s=60, ax=ax)
ax.set_title("Visualisasi Cluster (PCA 2D)")
plt.show()

In [None]:
# Silhouette score of the current clustering, computed on the full standardized
# feature matrix. (silhouette_score is imported in the notebook's first cell.)
cluster_labels = user_df["cluster"]
sil_score = silhouette_score(X_scaled, cluster_labels)

print("Silhouette Score:", sil_score)

In [None]:
# Silhouette score for k = 2..7, evaluated on a fixed random sample of users.

# Cap the sample at the number of available users: DataFrame.sample raises when
# n exceeds the frame length (sampling without replacement).
n_sample = min(10000, len(user_df))

# Rows of X_scaled are selected by position. Sampling from a freshly reset index
# guarantees the drawn values are positions even if user_df ever carries a
# non-default index; with the default index the same rows are drawn as before.
sample_idx = (
    user_df.reset_index(drop=True)
    .sample(n=n_sample, random_state=42)
    .index.to_numpy()
)
X_sample = X_scaled[sample_idx]

k_values = []
silhouette_values = []

for k in range(2, 8):
    # Separate name so the model stored in `kmeans` by the clustering cell is
    # not replaced by the last model of this sweep.
    kmeans_k = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans_k.fit_predict(X_sample)
    score = silhouette_score(X_sample, labels)

    k_values.append(k)
    silhouette_values.append(score)

# One row per candidate k with its silhouette score.
df_scores = pd.DataFrame({
    "k": k_values,
    "Silhouette Score": silhouette_values
})

df_scores

In [None]:
# Refit the 2-cluster KMeans on the full standardized feature matrix and store
# each user's label. (KMeans is imported in the notebook's first cell.)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(X_scaled)
user_df["cluster"] = kmeans.labels_

# Cluster sizes.
user_df["cluster"].value_counts()

In [None]:
# Cluster interpretation: mean of every feature within each cluster, rounded to 2 decimals.
# user_id is an identifier, so its mean carries no meaning and is left out of the profile.
cluster_profile = user_df.drop(columns=["user_id"]).groupby("cluster").mean()
cluster_profile.round(2)

#### Modeling dilakukan menggunakan fitur numerik hasil feature engineering yang merepresentasikan perilaku user, bukan kolom mentah atau kategorik, karena KMeans berbasis jarak dan membutuhkan representasi numerik yang meaningful.