In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from tslearn.clustering import TimeSeriesKMeans
from sklearn.metrics import silhouette_score
import pickle

In [27]:
# --- Configuration ----------------------------------------------------------
# Fixed seed so cluster assignments and the sampled silhouette score are
# reproducible across kernel restarts.
SEED = 42
# silhouette_score computes pairwise distances, so its cost grows quadratically
# with the number of rows; score a random subsample instead of every row.
SILHOUETTE_SAMPLE_SIZE = 10_000

# --- Load -------------------------------------------------------------------
df = pd.read_csv('../data_new/train_data_v3.csv')
df = df.drop(columns='label')

# Convert the time variable to whole seconds since the Unix epoch.
# An explicit 'int64' is used because plain `int` can map to a 32-bit integer on
# some platforms, which cannot hold nanosecond timestamps.
# NOTE(review): the division by 10**9 assumes nanosecond-resolution datetimes.
df['time'] = pd.to_datetime(df['time']).astype('int64') // 10**9

# One-hot encode the categorical 'Measure' column
df = pd.get_dummies(df, columns=['Measure'])

# Normalize the kWh values (zero mean, unit variance)
scaler = StandardScaler()
df['kWh_norm'] = scaler.fit_transform(df['kWh'].to_numpy().reshape(-1, 1)).ravel()

# Standardize time as well for the feature matrix: raw epoch seconds are ~1e9,
# which would otherwise swamp the 0/1 dummies and the z-scored kWh column in any
# distance computation. df['time'] itself keeps the raw epoch seconds.
time_scaled = StandardScaler().fit_transform(
    df['time'].to_numpy(dtype='float64').reshape(-1, 1)
)

# Set the input variables for clustering: scaled time first, then every
# remaining column except the raw 'time' and raw 'kWh'.
# The explicit float cast avoids an object-dtype matrix when get_dummies
# returns boolean columns.
feature_frame = df.drop(columns=['time', 'kWh'])
X = np.hstack((time_scaled, feature_frame.to_numpy(dtype='float64')))

# Determine the number of clusters
n_clusters = 4

# Cluster the data using TimeSeriesKMeans
# NOTE(review): X is 2-D (rows x features); tslearn presumably treats each row
# as one univariate series whose "time steps" are the feature columns - confirm
# this is intended rather than grouping readings into real per-entity series.
# DTW is also costly on large inputs (a previous run of this cell was
# interrupted by hand).
model = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", random_state=SEED)
model.fit(X)

# Get the cluster assignments
cluster_labels = model.labels_

# Evaluate the clustering using silhouette score on a reproducible subsample.
# NOTE(review): sklearn's default metric here is Euclidean, while the model was
# fitted with DTW - confirm the mismatch is acceptable.
silhouette_avg = silhouette_score(
    X,
    cluster_labels,
    sample_size=min(len(X), SILHOUETTE_SAMPLE_SIZE),
    random_state=SEED,
)

# Print the silhouette score
print(f"Silhouette score: {silhouette_avg:.3f}")


KeyboardInterrupt: 

In [22]:
# Build a labelled copy of the feature frame (df itself is left untouched) and
# show how many rows landed in each cluster.
# Requires `df` and `cluster_labels` from the clustering cell to be defined.
df2 = df.assign(label=cluster_labels)
print(df2['label'].value_counts())

NameError: name 'cluster_labels' is not defined

In [25]:
# Persist the fitted clustering model to disk as a pickle.
# Requires `model` from the clustering cell; the target directory must exist.
with open('../models/cluster-model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)