In [10]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.layers import LeakyReLU
import numpy as np
import nltk
from nltk.corpus import stopwords
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, LeakyReLU
from tensorflow.keras.losses import Huber
from tensorflow.keras.regularizers import l2
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import hstack

data = pd.read_csv("data/updatemergedata.csv")
data.head()
label_encoder = LabelEncoder()
data['Level'] = label_encoder.fit_transform(data['Level']) 
data['Price'] = data['Price'].map({'Berbayar': 1, 'Gratis': 0})
# 1. Stopwords dan TF-IDF dari teks
indonesian_stopwords = stopwords.words('indonesian')
data['Combined Summary'] = data['Learning Path'] + ' ' + data['Learning Path Summary'] + ' ' + data['Course Name_x'] + ' ' + data['Course Summary']
tfidf = TfidfVectorizer(stop_words=indonesian_stopwords)
tfidf_matrix = tfidf.fit_transform(data['Combined Summary']).toarray()  # Ubah ke array agar bisa digunakan
# 2. Scaling data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(tfidf_matrix)

# 1. Memproses data numerik dengan Autoencoder
def create_autoencoder_model(input_dim):
    input_layer = tf.keras.layers.Input(shape=(input_dim,))
    encoded = Dense(128, kernel_regularizer=tf.keras.regularizers.l2(0.01))(input_layer)
    encoded = LeakyReLU(negative_slope=0.01)(encoded)
    encoded = Dropout(0.3)(encoded)
    encoded = Dense(64, kernel_regularizer=tf.keras.regularizers.l2(0.01))(encoded)
    encoded = LeakyReLU(negative_slope=0.01)(encoded)
    encoded = Dropout(0.3)(encoded)
    encoded = Dense(32, kernel_regularizer=tf.keras.regularizers.l2(0.01))(encoded)
    
    decoded = Dense(64, kernel_regularizer=tf.keras.regularizers.l2(0.01))(encoded)
    decoded = LeakyReLU(negative_slope=0.01)(decoded)
    decoded = Dense(128, kernel_regularizer=tf.keras.regularizers.l2(0.01))(decoded)
    decoded = LeakyReLU(negative_slope=0.01)(decoded)
    decoded = Dense(input_dim, activation='sigmoid')(decoded)
    
    autoencoder = tf.keras.models.Model(input_layer, decoded)
    return autoencoder

# Latih Autoencoder dan dapatkan encoder
def train_and_get_encoder(X_scaled):
    input_dim = X_scaled.shape[1]
    autoencoder = create_autoencoder_model(input_dim)
    optimizer = Adam(learning_rate=0.001)
    autoencoder.compile(optimizer=optimizer, loss=Huber())
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    autoencoder.fit(
        X_scaled, X_scaled,
        epochs=50,
        batch_size=64,
        shuffle=True,
        validation_split=0.2,
        callbacks=[early_stopping]
    )
    
    # Ambil bagian encoder dari Autoencoder
    encoder = tf.keras.models.Model(inputs=autoencoder.input, outputs=autoencoder.layers[-4].output)
    return autoencoder, encoder

    # Mengambil representasi dari layer encoded
    encoder = tf.keras.models.Model(inputs=autoencoder.input, outputs=autoencoder.layers[-4].output)
    return encoder.predict(X_scaled)

# 2. Memproses data teks dengan TF-IDF
def preprocess_learning_paths(data):
    data['combined_features'] = data.apply(lambda x: f"{x['Learning Path']} {x['Learning Path Summary']} {x['Course Name_x']}", axis=1)
    return data
    
def encode_with_autoencoder(data, autoencoder):
    encoder = tf.keras.models.Model(inputs=autoencoder.input, outputs=autoencoder.layers[-4].output)  # Layer sebelum bagian decoding
    return encoder.predict(data)

def vectorize_learning_paths(data):
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(data['combined_features'])

# 3. Menggabungkan hasil Autoencoder dan TF-IDF
def combine_features(numeric_embeddings, tfidf_embeddings):
    # Gabungkan numeric embeddings dan tf-idf embeddings menjadi satu fitur
    combined = hstack([numeric_embeddings, tfidf_embeddings])  # Menyatu dengan cara yang benar
    return combined

# 4. Clustering dan rekomendasi
def cluster_and_recommend(data, user_preferences):
    # Proses data teks
    data = preprocess_learning_paths(data)
    
    # Dapatkan embedding dari encoder yang dilatih
    numeric_features = data.select_dtypes(include=[np.number])
    
    # Latih autoencoder dan ambil encoder
    autoencoder, encoder = train_and_get_encoder(X_scaled)
    
    # Gunakan encoder untuk mendapatkan fitur numerik
    numeric_embeddings = encoder.predict(numeric_features.values)
    
    # Ekstraksi fitur teks dengan TF-IDF
    tfidf_vectors = vectorize_learning_paths(data)
    
    # Gabungkan fitur numerik dan teks
    combined_features = combine_features(numeric_embeddings, tfidf_vectors)
    
    # Clustering
    num_clusters = 5
    kmeans = KMeans(n_clusters=num_clusters, random_state=0)
    clusters = kmeans.fit_predict(combined_features)
    data['cluster'] = clusters
    
    # Rekomendasi
    user_vector = " ".join(user_preferences)
    vectorizer = TfidfVectorizer()
    all_vectors = vectorizer.fit_transform(data['combined_features'].tolist() + [user_vector])
    user_vector_tfidf = all_vectors[-1]
    
    # Gabungkan fitur numerik dan TF-IDF untuk user_vector
    user_combined_features = combine_features(numeric_embeddings, all_vectors[:-1])
    
    similarity_scores = cosine_similarity(user_vector_tfidf, user_combined_features)
    data['similarity'] = similarity_scores.flatten()
    
    recommendations = data.sort_values(by='similarity', ascending=False).head(3)
    
    return recommendations, combined_features, clusters

# 5. Visualisasi kluster
def visualize_clusters(combined_features, clusters):
    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(combined_features)
    
    # PCA results
    pca_df = pd.DataFrame(reduced_data, columns=['PCA1', 'PCA2'])
    pca_df['cluster'] = clusters
    
    plt.figure(figsize=(10, 8))
    sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='cluster', palette='Set1', alpha=0.7)
    plt.title('Clustering of Learning Paths with Combined Features')
    plt.show()

# Contoh penggunaan
user_preferences = ["Machine Learning", "Python", "Data Science"]  # Preferensi pengguna
recommendations, combined_features, clusters = cluster_and_recommend(data, user_preferences)

print("\nTop 3 Recommended Learning Paths:")
print(recommendations[['Course Name_x', 'similarity']])

# Visualisasi kluster
visualize_clusters(combined_features, clusters)




Epoch 1/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 3.4711 - val_loss: 1.1592
Epoch 2/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.8151 - val_loss: 0.4435
Epoch 3/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.3163 - val_loss: 0.2760
Epoch 4/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.1968 - val_loss: 0.2356
Epoch 5/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1664 - val_loss: 0.2262
Epoch 6/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1601 - val_loss: 0.2238
Epoch 7/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.1598 - val_loss: 0.2232
Epoch 8/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1581 - val_loss: 0.2228
Epoch 9/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

ValueError: Input 0 of layer "functional_5" is incompatible with the layer: expected shape=(None, 409), found shape=(32, 7)