In [9]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- 1. Load data ---
grouped_data = pd.read_csv("data/grouped_data.csv")

# --- 2. Preprocess data ---
# Encode categorical variables.
label_encoder = LabelEncoder()
grouped_data['Level'] = label_encoder.fit_transform(grouped_data['Level'])  # Level -> integer codes
grouped_data['Price'] = grouped_data['Price'].map({'Berbayar': 1, 'Gratis': 0})  # 1: paid (Berbayar), 0: free (Gratis)

# Merge the text features into a single column.
# Missing cells are replaced with empty strings first: in pandas `str + NaN`
# is NaN, so one missing field would otherwise discard the whole combined
# text of that row (and `str(NaN)` would later turn it into the word "nan").
text_columns = ['Learning Path', 'Learning Path Summary', 'Course Name_x', 'Judul Modul/Tutorial']
grouped_data['Combined Summary'] = (
    grouped_data[text_columns].fillna('').astype(str).agg(' '.join, axis=1)
)

# Remove stopwords.
# The list is kept under its original name; the frozenset gives constant-time
# membership tests instead of scanning the list once per token.
indonesian_stopwords = stopwords.words('indonesian')
indonesian_stopword_set = frozenset(indonesian_stopwords)
grouped_data['Combined Summary'] = grouped_data['Combined Summary'].apply(
    lambda text: ' '.join(
        token for token in word_tokenize(text)
        # Compare in lower case so capitalised stopwords are dropped too;
        # the tokens that are kept retain their original casing.
        if token.lower() not in indonesian_stopword_set
    )
)

# --- 3. Build the TensorFlow dataset ---
# Dataset consumed by TFRS.
tf_grouped_data = tf.data.Dataset.from_tensor_slices({
    "course_id": grouped_data.index.astype(str),  # row index used as the course identifier
    "combined_summary": grouped_data['Combined Summary']
}).shuffle(1000).batch(32)


# --- 4. Model TFRS ---
class RecommenderModel(tfrs.models.Model):
    """Retrieval model with separate query and candidate embedding towers.

    Each tower is a ``StringLookup`` over ``data_vocab`` followed by a
    64-dimensional ``Embedding``. The embedding table has ``len(data_vocab) + 1``
    rows; the extra row serves the out-of-vocabulary index that ``StringLookup``
    reserves by default.

    Args:
        data_vocab: list of strings used as the lookup vocabulary of both towers.
    """

    def __init__(self, data_vocab):
        super().__init__()
        self.query_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=data_vocab, mask_token=None),
            tf.keras.layers.Embedding(len(data_vocab) + 1, 64)
        ])
        self.candidate_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=data_vocab, mask_token=None),
            tf.keras.layers.Embedding(len(data_vocab) + 1, 64)
        ])
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                # FactorizedTopK scores queries against candidate *embeddings*,
                # so the raw summary strings must go through the candidate
                # tower; passing the strings themselves cannot be scored.
                candidates=tf_grouped_data.map(
                    lambda x: self.candidate_model(x["combined_summary"])
                )
            )
        )

    def compute_loss(self, features, training=False):
        """Return the retrieval loss for one batch.

        Args:
            features: batch dictionary; only ``features["combined_summary"]`` is read.
            training: accepted for the Keras/TFRS call signature; not used here.
        """
        # Both towers embed the same feature, so every row is its own positive pair.
        query_embeddings = self.query_model(features["combined_summary"])
        candidate_embeddings = self.candidate_model(features["combined_summary"])
        return self.task(query_embeddings, candidate_embeddings)

# Extract the vocabulary from the data: every distinct combined summary is one entry.
grouped_vocab = grouped_data['Combined Summary'].dropna().astype(str).unique().tolist()
print(type(grouped_vocab), grouped_vocab[:5])  # sanity-check the type and a few entries


# --- 5. Train the model ---
# The model is built exactly once; the original built it twice and threw the
# first instance away.
model = RecommenderModel(grouped_vocab)
model.compile(optimizer=tf.keras.optimizers.Adam(0.01))
model.fit(tf_grouped_data, epochs=10)
for example in tf_grouped_data.take(1):  # inspect one batch
    print(example)

# --- 6. Rekomendasi ---
def recommend_courses(user_skill, user_language, github_link, top_k=10):
    """Print the course names whose embeddings score highest for a user profile.

    The three inputs are joined with spaces into one query string, embedded with
    ``model.query_model`` and scored against the candidate embedding of every
    row of ``grouped_data`` using a brute-force top-k index.

    Args:
        user_skill: skill keyword of the user.
        user_language: programming language of the user.
        github_link: GitHub identifier of the user.
        top_k: maximum number of courses to print (capped at the number of rows).

    NOTE(review): the query tower is a ``StringLookup`` over whole combined
    summaries, so a free-text query that does not equal a known summary verbatim
    falls into the out-of-vocabulary bucket and every such query ranks the
    courses identically. Confirm whether token-level text features are intended.
    """
    user_input = f"{user_skill} {user_language} {github_link}"

    # Embed the candidates in DataFrame row order (no shuffling) so the
    # positions returned by the index can be used directly with `.iloc`.
    summaries = grouped_data['Combined Summary'].astype(str).to_numpy()
    candidate_embeddings = (
        tf.data.Dataset.from_tensor_slices(summaries)
        .batch(128)
        .map(model.candidate_model)
    )

    # `model.task.metrics` is the Keras list of metric objects and cannot be
    # called; a BruteForce index is the TFRS layer meant for serving top-k.
    # The index is rebuilt on every call, which is fine for small catalogues.
    index = tfrs.layers.factorized_top_k.BruteForce(
        model.query_model, k=min(top_k, len(summaries))
    )
    index.index_from_dataset(candidate_embeddings)
    # Without explicit identifiers the index returns row positions.
    _, indices = index(tf.constant([user_input]))

    recommended_indices = indices[0].numpy()
    recommendations = grouped_data.iloc[recommended_indices]['Course Name_x'].values
    print("\nRecommended Courses:")
    for rec in recommendations:
        print(rec)


# --- 7. Using the function ---
# Simulated user input, assigned in one statement.
user_skill, user_language, github_link = "database", "sql", "ilovesqlsomuch"

# Print the recommendations for that profile.
recommend_courses(
    user_skill=user_skill,
    user_language=user_language,
    github_link=github_link,
)


<class 'list'> ['14|Android Developer Kurikulum didesain persetujuan T...', '32|non learning path non learning path Belajar Memban...', '51|Android Developer Kurikulum didesain persetujuan T...', '80|Android Developer Kurikulum didesain persetujuan T...', '86|Data Scientist Kurikulum lengkap didesain tim expe...']


ValueError: Cannot convert '('c', 'o', 'u', 'n', 't', 'e', 'r')' to a shape. Found invalid entry 'c' of type '<class 'str'>'. 

In [2]:
pip install tensorflow tensorflow-recommenders tensorflow-datasets


Collecting tensorflow-recommenders
  Downloading tensorflow_recommenders-0.7.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tensorflow-datasets
  Downloading tensorflow_datasets-4.9.7-py3-none-any.whl.metadata (9.6 kB)
Collecting dm-tree (from tensorflow-datasets)
  Downloading dm_tree-0.1.8-cp310-cp310-win_amd64.whl.metadata (2.0 kB)
Collecting immutabledict (from tensorflow-datasets)
  Downloading immutabledict-4.2.1-py3-none-any.whl.metadata (3.5 kB)
Collecting promise (from tensorflow-datasets)
  Downloading promise-2.3.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pyarrow (from tensorflow-datasets)
  Downloading pyarrow-18.0.0-cp310-cp310-win_amd64.whl.metadata (3.4 kB)
Collecting simple-parsing (from tensorflow-datasets)
  Downloading simple_parsing-0.1.6-py3-none-any.whl.metadata (7.3 kB)
Collecting tensorflow-metadata (from tensorflow-datasets)
  Downloading tensorflow_metadata-1.16.1-py3-none-an

  You can safely remove it manually.


In [10]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- 1. Load data ---
data = pd.read_csv("data/updatemergedata.csv")

# --- 2. Preprocess data ---
# Encode categorical variables.
label_encoder = LabelEncoder()
data['Level'] = label_encoder.fit_transform(data['Level'])  # Level -> integer codes
data['Price'] = data['Price'].map({'Berbayar': 1, 'Gratis': 0})  # 1: paid (Berbayar), 0: free (Gratis)

# Merge the text features into a single column.
# Missing cells are replaced with empty strings first: in pandas `str + NaN`
# is NaN, so one missing field would otherwise discard the whole combined
# text of that row (and `str(NaN)` would later turn it into the word "nan").
text_columns = ['Learning Path', 'Learning Path Summary', 'Course Name_x', 'Course Summary']
data['Combined Summary'] = (
    data[text_columns].fillna('').astype(str).agg(' '.join, axis=1)
)

# Remove stopwords.
# The list is kept under its original name; the frozenset gives constant-time
# membership tests instead of scanning the list once per token.
indonesian_stopwords = stopwords.words('indonesian')
indonesian_stopword_set = frozenset(indonesian_stopwords)
data['Combined Summary'] = data['Combined Summary'].apply(
    lambda text: ' '.join(
        token for token in word_tokenize(text)
        # Compare in lower case so capitalised stopwords are dropped too;
        # the tokens that are kept retain their original casing.
        if token.lower() not in indonesian_stopword_set
    )
)

# --- 3. Build the TensorFlow dataset ---
# Dataset consumed by TFRS.
tf_data = tf.data.Dataset.from_tensor_slices({
    "course_id": data.index.astype(str),  # row index used as the course identifier
    "combined_summary": data['Combined Summary']
}).shuffle(1000).batch(32)

# --- 4. Model TFRS ---
class RecommenderModel(tfrs.models.Model):
    """Retrieval model with separate query and candidate embedding towers.

    Each tower is a ``StringLookup`` over ``data_vocab`` followed by a
    64-dimensional ``Embedding``. The embedding table has ``len(data_vocab) + 1``
    rows; the extra row serves the out-of-vocabulary index that ``StringLookup``
    reserves by default.

    Args:
        data_vocab: list of strings used as the lookup vocabulary of both towers.
    """

    def __init__(self, data_vocab):
        super().__init__()
        self.query_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=data_vocab, mask_token=None),
            tf.keras.layers.Embedding(len(data_vocab) + 1, 64)
        ])
        self.candidate_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=data_vocab, mask_token=None),
            tf.keras.layers.Embedding(len(data_vocab) + 1, 64)
        ])
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                # FactorizedTopK scores queries against candidate *embeddings*,
                # so the raw summary strings must go through the candidate
                # tower; passing the strings themselves cannot be scored.
                candidates=tf_data.map(
                    lambda x: self.candidate_model(x["combined_summary"])
                )
            )
        )

    def compute_loss(self, features, training=False):
        """Return the retrieval loss for one batch.

        Args:
            features: batch dictionary; only ``features["combined_summary"]`` is read.
            training: accepted for the Keras/TFRS call signature; not used here.
        """
        # Both towers embed the same feature, so every row is its own positive pair.
        query_embeddings = self.query_model(features["combined_summary"])
        candidate_embeddings = self.candidate_model(features["combined_summary"])
        return self.task(query_embeddings, candidate_embeddings)

# Vocabulary: the distinct combined summaries, in order of first appearance.
data_vocab = data['Combined Summary'].drop_duplicates().tolist()

# Instantiate the model on that vocabulary.
model = RecommenderModel(data_vocab)

# --- 5. Train the model ---
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01))
model.fit(x=tf_data, epochs=10)

# --- 6. Rekomendasi ---
def recommend_courses(user_skill, user_language, github_link, top_k=10):
    """Print the course names whose embeddings score highest for a user profile.

    The three inputs are joined with spaces into one query string, embedded with
    ``model.query_model`` and scored against the candidate embedding of every
    row of ``data`` using a brute-force top-k index.

    Args:
        user_skill: skill keyword of the user.
        user_language: programming language of the user.
        github_link: GitHub identifier of the user.
        top_k: maximum number of courses to print (capped at the number of rows).

    NOTE(review): the query tower is a ``StringLookup`` over whole combined
    summaries, so a free-text query that does not equal a known summary verbatim
    falls into the out-of-vocabulary bucket and every such query ranks the
    courses identically. Confirm whether token-level text features are intended.
    """
    user_input = f"{user_skill} {user_language} {github_link}"

    # Embed the candidates in DataFrame row order (no shuffling) so the
    # positions returned by the index can be used directly with `.iloc`.
    summaries = data['Combined Summary'].astype(str).to_numpy()
    candidate_embeddings = (
        tf.data.Dataset.from_tensor_slices(summaries)
        .batch(128)
        .map(model.candidate_model)
    )

    # The Retrieval task has no `factorized_top_k` attribute; a BruteForce
    # index is the TFRS layer meant for serving top-k results.
    # The index is rebuilt on every call, which is fine for small catalogues.
    index = tfrs.layers.factorized_top_k.BruteForce(
        model.query_model, k=min(top_k, len(summaries))
    )
    index.index_from_dataset(candidate_embeddings)
    # Without explicit identifiers the index returns row positions.
    _, indices = index(tf.constant([user_input]))

    recommended_indices = indices[0].numpy()
    recommendations = data.iloc[recommended_indices]['Course Name_x'].values
    print("\nRecommended Courses:")
    for rec in recommendations:
        print(rec)

# --- 7. Using the function ---
# Simulated user input, assigned in one statement.
user_skill, user_language, github_link = "database", "sql", "ilovesqlsomuch"

# Print the recommendations for that profile.
recommend_courses(
    user_skill=user_skill,
    user_language=user_language,
    github_link=github_link,
)


ValueError: Cannot convert '('c', 'o', 'u', 'n', 't', 'e', 'r')' to a shape. Found invalid entry 'c' of type '<class 'str'>'. 