In [5]:
import zipfile, os

zip_path = "/content/archive (1).zip"   # ← GANTI sesuai nama file zip Anda
extract_dir = "/content/ml_data"

with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall(extract_dir)

os.listdir(extract_dir)


['ml-100k']

In [6]:
import pandas as pd

base_path = "/content/ml_data/ml-100k"   # ← sesuaikan dgn folder Anda

ratings = pd.read_csv(
    f"{base_path}/u.data",
    sep="\t",
    names=["user_id", "movie_id", "rating", "timestamp"]
)

movies = pd.read_csv(
    f"{base_path}/u.item",
    sep="|",
    header=None,
    encoding="latin-1",
    names=[
        "movie_id", "title", "release_date", "video_release", "imdb_url",
        "unknown","Action","Adventure","Animation","Children","Comedy","Crime",
        "Documentary","Drama","Fantasy","Film-Noir","Horror","Musical",
        "Mystery","Romance","Sci-Fi","Thriller","War","Western"
    ]
)[["movie_id","title"]]


In [7]:
ratings_pivot = ratings.pivot_table(
    index="user_id",
    columns="movie_id",
    values="rating",
    fill_value=0
)

ratings_matrix = ratings_pivot.values
num_users, num_items = ratings_matrix.shape
print("Users :", num_users)
print("Movies:", num_items)


Users : 943
Movies: 1682


In [8]:
import torch
from torch.utils.data import Dataset, DataLoader

class RatingDataset(Dataset):
    def __init__(self, matrix):
        self.matrix = torch.tensor(matrix, dtype=torch.float32)

    def __len__(self):
        return self.matrix.shape[0]

    def __getitem__(self, idx):
        return self.matrix[idx]

dataset = RatingDataset(ratings_matrix)
loader = DataLoader(dataset, batch_size=64, shuffle=True)


In [9]:
import torch.nn as nn

class AutoEncoder(nn.Module):
    def __init__(self, num_items):
        super().__init__()
        self.encoder = nn.Linear(num_items, 256)
        self.decoder = nn.Linear(256, num_items)
        self.relu = nn.ReLU()

    def forward(self, x):
        h = self.relu(self.encoder(x))
        out = self.decoder(h)
        return out


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoEncoder(num_items).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

for epoch in range(10):
    model.train()
    epoch_loss = 0

    for batch in loader:
        batch = batch.to(device)
        recon = model(batch)

        mask = batch != 0
        loss = criterion(recon[mask], batch[mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss = {epoch_loss:.4f}")


Epoch 1, Loss = 112.3603
Epoch 2, Loss = 58.0676
Epoch 3, Loss = 35.7555
Epoch 4, Loss = 25.3991
Epoch 5, Loss = 21.3403
Epoch 6, Loss = 17.7116
Epoch 7, Loss = 15.2857
Epoch 8, Loss = 13.3668
Epoch 9, Loss = 11.9447
Epoch 10, Loss = 10.9270


In [11]:
import numpy as np

def recommend_for_user(user_id, top_n=10):
    model.eval()
    with torch.no_grad():
        user_vector = ratings_pivot.loc[user_id].values
        user_tensor = torch.tensor(user_vector, dtype=torch.float32).to(device)

        recon = model(user_tensor).cpu().numpy()

        unseen = np.where(user_vector == 0)[0]
        scores = recon[unseen]

        top_idx = unseen[np.argsort(scores)[-top_n:]]
        movie_ids = ratings_pivot.columns[top_idx]

        return movies[movies.movie_id.isin(movie_ids)][["movie_id","title"]]


In [12]:
print("\n=== Rekomendasi Film untuk User 1 ===")
recommend_for_user(1, top_n=10)



=== Rekomendasi Film untuk User 1 ===


Unnamed: 0,movie_id,title
1236,1237,Twisted (1996)
1292,1293,Star Kid (1997)
1416,1417,"Turning, The (1992)"
1430,1431,Legal Deceit (1997)
1462,1463,"Boys, Les (1997)"
1499,1500,Santa with Muscles (1996)
1624,1625,Nightwatch (1997)
1641,1642,Some Mother's Son (1996)
1652,1653,Entertaining Angels: The Dorothy Day Story (1996)
1661,1662,Rough Magic (1995)


In [13]:
# Neural Content Filtering - MovieLens (UBASE / ml-100k)
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


In [14]:
# 1) Path dataset (ubah jika perlu)
# ------------------------------
base_path = "/content/ml_data/ml-100k"   # <-- ganti folder Anda yang berisi u.data dan u.item

ratings_path = os.path.join(base_path, "u.data")
movies_path  = os.path.join(base_path, "u.item")


In [15]:
# ------------------------------
# 2) Load data (UBASE / ml-100k)
# ------------------------------
ratings = pd.read_csv(ratings_path, sep="\t", names=["user_id","movie_id","rating","timestamp"])
movies_raw = pd.read_csv(movies_path, sep="|", header=None, encoding="latin-1",
                         names=[
                             "movie_id","title","release_date","video_release","imdb_url",
                             "unknown","Action","Adventure","Animation","Children","Comedy","Crime",
                             "Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery",
                             "Romance","Sci-Fi","Thriller","War","Western"
                         ])

# Keep only movie_id and title + genres
genre_cols = ["unknown","Action","Adventure","Animation","Children","Comedy","Crime",
              "Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery",
              "Romance","Sci-Fi","Thriller","War","Western"]
movies = movies_raw[["movie_id","title"] + genre_cols].copy()

# Build genre lists per movie
def genres_from_row(row):
    gs = []
    for g in genre_cols:
        if row[g] == 1:
            gs.append(g)
    return gs

movies["genres_list"] = movies.apply(genres_from_row, axis=1)

In [16]:
# ------------------------------
# 3) Item content vector: TF-IDF(title) + one-hot genres
# ------------------------------
# TF-IDF on title
tfv = TfidfVectorizer(min_df=1,ngram_range=(1,2))
title_tfidf = tfv.fit_transform(movies["title"].fillna(""))

# One-hot genres
mlb = MultiLabelBinarizer()
genre_onehot = mlb.fit_transform(movies["genres_list"])

# Concatenate (as dense)
from scipy.sparse import hstack
item_content_sparse = hstack([title_tfidf, genre_onehot])  # sparse
item_content = item_content_sparse.tocsr().toarray()       # dense ndarray

# Optional: standardize content vectors
scaler = StandardScaler(with_mean=True, with_std=True)
item_content = scaler.fit_transform(item_content)

# Map movie_id -> content index
movie_ids = movies["movie_id"].values
movieid_to_idx = {mid: idx for idx, mid in enumerate(movie_ids)}
idx_to_movieid = {v:k for k,v in movieid_to_idx.items()}

In [17]:
# 4) Prepare ratings matrix / data for training
# ------------------------------
num_users = ratings["user_id"].max()
num_items = len(movie_ids)
print(f"Num users: {num_users}, Num items (in u.item): {num_items}")

# Create training triples (user_idx, item_idx, rating)
train_rows = []
for r in ratings.itertuples(index=False):
    user = int(r.user_id)-1
    item = movieid_to_idx.get(int(r.movie_id), None)
    if item is None:
        continue
    rating = float(r.rating)
    train_rows.append((user, item, rating))

train_df = pd.DataFrame(train_rows, columns=["user","item","rating"])

Num users: 943, Num items (in u.item): 1682


In [18]:
# ------------------------------
# 5) Dataset & DataLoader
# ------------------------------
class RatingDataset(Dataset):
    def __init__(self, df):
        self.user = torch.LongTensor(df["user"].values)
        self.item = torch.LongTensor(df["item"].values)
        self.rating = torch.FloatTensor(df["rating"].values)
    def __len__(self):
        return len(self.rating)
    def __getitem__(self, idx):
        return self.user[idx], self.item[idx], self.rating[idx]

dataset = RatingDataset(train_df)
loader = DataLoader(dataset, batch_size=512, shuffle=True, drop_last=False)


In [19]:
# ------------------------------
# 6) Model: user embedding + item content MLP -> predict rating
# ------------------------------
class NeuralContentFilter(nn.Module):
    def __init__(self, n_users, item_content_dim, user_emb_dim=64, hidden_dims=[128,64]):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, user_emb_dim)
        # Project item content to a compact embedding
        self.item_fc = nn.Sequential(
            nn.Linear(item_content_dim, hidden_dims[0]),
            nn.ReLU()
        )
        # Combined MLP
        in_dim = user_emb_dim + hidden_dims[0]
        layers = []
        last = in_dim
        for h in hidden_dims[1:]:
            layers.append(nn.Linear(last, h))
            layers.append(nn.ReLU())
            last = h
        layers.append(nn.Linear(last, 1))  # output rating
        self.mlp = nn.Sequential(*layers)
    def forward(self, user_idx, item_content_batch):
        u = self.user_emb(user_idx)                # (B, user_emb_dim)
        i = self.item_fc(item_content_batch)       # (B, proj_dim)
        x = torch.cat([u, i], dim=1)
        out = self.mlp(x).squeeze(1)
        return out

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NeuralContentFilter(n_users=num_users, item_content_dim=item_content.shape[1]).to(device)

In [20]:
# ------------------------------
# 7) Training setup
# ------------------------------
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = nn.MSELoss()

# Convert item_content to torch tensor on CPU (we will index into it)
item_content_tensor = torch.FloatTensor(item_content)  # shape (num_items, dim)


In [21]:
# ------------------------------
# 8) Training loop
# ------------------------------
EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    for (user_batch, item_batch, rating_batch) in loader:
        user_batch = user_batch.to(device)
        item_batch = item_batch.to(device)
        rating_batch = rating_batch.to(device)

        # gather item content vectors for this batch (on CPU then to device)
        item_vecs = item_content_tensor[item_batch.cpu().numpy()].to(device)

        preds = model(user_batch, item_vecs)
        loss = criterion(preds, rating_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * user_batch.size(0)

    rmse = np.sqrt(total_loss / len(dataset))
    print(f"Epoch {epoch+1}/{EPOCHS}  RMSE(train) = {rmse:.4f}")

Epoch 1/10  RMSE(train) = 1.2397
Epoch 2/10  RMSE(train) = 1.0075
Epoch 3/10  RMSE(train) = 0.9712
Epoch 4/10  RMSE(train) = 0.9504
Epoch 5/10  RMSE(train) = 0.9364
Epoch 6/10  RMSE(train) = 0.9268
Epoch 7/10  RMSE(train) = 0.9188
Epoch 8/10  RMSE(train) = 0.9096
Epoch 9/10  RMSE(train) = 0.9007
Epoch 10/10  RMSE(train) = 0.8934


In [22]:
# 9) Recommendation function (return movieId, predicted_score, title)
# ------------------------------
def recommend_for_user_nn(user_id, top_n=10):
    """
    user_id: original user_id (1-based as in u.data)
    """
    model.eval()
    uid = int(user_id) - 1
    # get items user has rated
    rated_items = set(train_df[train_df.user == uid].item.values.tolist())

    # prepare user index repeated for all items
    user_idx = torch.LongTensor([uid]*num_items).to(device)
    # all item content vectors
    all_item_vecs = item_content_tensor.to(device)

    with torch.no_grad():
        preds_all = model(user_idx, all_item_vecs).cpu().numpy()  # shape (num_items,)

    # mask already rated
    candidates = [(i, preds_all[i]) for i in range(num_items) if i not in rated_items]
    # sort by predicted score desc
    candidates.sort(key=lambda x: x[1], reverse=True)
    top = candidates[:top_n]

    out_rows = []
    for idx, score in top:
        mid = idx_to_movieid[idx]
        title = movies.loc[movies.movie_id==mid, "title"].values[0]
        out_rows.append((int(mid), float(score), title))
    return pd.DataFrame(out_rows, columns=["movie_id","predicted_rating","title"])

# ------------------------------
# 10) Contoh: rekomendasi untuk user 1
# ------------------------------
print(recommend_for_user_nn(1, top_n=10))

   movie_id  predicted_rating  \
0      1467          4.960351   
1       814          4.893803   
2      1449          4.839243   
3      1398          4.838151   
4       408          4.810671   
5       603          4.761233   
6       641          4.743670   
7      1599          4.739166   
8       513          4.716499   
9      1005          4.700873   

                                               title  
0               Saint of Fort Washington, The (1993)  
1                      Great Day in Harlem, A (1994)  
2                             Pather Panchali (1955)  
3                                        Anna (1996)  
4                              Close Shave, A (1995)  
5                                 Rear Window (1954)  
6                              Paths of Glory (1957)  
7                      Someone Else's America (1995)  
8                              Third Man, The (1949)  
9  Double vie de Véronique, La (Double Life of Ve...  


In [23]:
!pip install torch
!pip install gradio




In [25]:
import pandas as pd

df = pd.read_csv("/content/sample_movielens_merged.csv")

df.head(10)


Unnamed: 0,user,item,label,time,sex,age,occupation,genre1,genre2,genre3
0,4617,296,2,964138229,F,25,6,crime,drama,missing
1,1298,208,4,974849526,M,35,6,action,adventure,missing
2,4585,1769,4,964322774,M,35,7,action,thriller,missing
3,3706,1136,5,966376465,M,25,12,comedy,missing,missing
4,2137,1215,3,974640099,F,1,10,action,adventure,comedy
5,2461,1257,4,974170662,M,18,4,comedy,missing,missing
6,242,3148,3,977854274,F,18,4,drama,missing,missing
7,2211,932,4,974607346,M,45,6,romance,missing,missing
8,263,2115,2,976651827,F,25,7,action,adventure,missing
9,5184,866,5,961735308,M,18,20,crime,drama,romance


In [26]:
from sklearn.preprocessing import LabelEncoder

# encode user dan item
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

df['user_enc'] = user_encoder.fit_transform(df['user'])
df['item_enc'] = item_encoder.fit_transform(df['item'])

# encode genre
genre_encoder = LabelEncoder()
df['genre1_enc'] = genre_encoder.fit_transform(df['genre1'])
df['genre2_enc'] = genre_encoder.fit_transform(df['genre2'])
df['genre3_enc'] = genre_encoder.fit_transform(df['genre3'])


In [27]:
from sklearn.model_selection import train_test_split

feature_cols = ['user_enc', 'item_enc', 'genre1_enc', 'genre2_enc', 'genre3_enc']
target_col = 'label'

X = df[feature_cols]
y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [28]:
import torch
from torch.utils.data import Dataset, DataLoader

class RatingDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X.values, dtype=torch.long)
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = RatingDataset(X_train, y_train)
test_ds  = RatingDataset(X_test,  y_test)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=256)


In [29]:
import torch.nn as nn
import torch.nn.functional as F

class DeepFM(nn.Module):
    def __init__(self, n_users, n_items, n_genre):
        super().__init__()

        emb_dim = 16

        self.user_emb  = nn.Embedding(n_users, emb_dim)
        self.item_emb  = nn.Embedding(n_items, emb_dim)
        self.genre_emb = nn.Embedding(n_genre, emb_dim)

        # FM second-order
        self.linear = nn.Linear(5, 1)

        # Deep part
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim * 5, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        u  = self.user_emb(x[:,0])
        i  = self.item_emb(x[:,1])
        g1 = self.genre_emb(x[:,2])
        g2 = self.genre_emb(x[:,3])
        g3 = self.genre_emb(x[:,4])

        embeddings = torch.cat([u,i,g1,g2,g3], dim=1)

        deep_out = self.mlp(embeddings)
        linear_out = self.linear(x.float())

        out = deep_out + linear_out
        return out.squeeze()


In [30]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = DeepFM(
    n_users = df['user_enc'].nunique(),
    n_items = df['item_enc'].nunique(),
    n_genre = df[['genre1_enc','genre2_enc','genre3_enc']].nunique().max()
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

EPOCHS = 5

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for Xb, yb in train_loader:
        Xb, yb = Xb.to(device), yb.to(device)

        pred = model(Xb)
        loss = criterion(pred, yb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss = {total_loss/len(train_loader):.4f}")


Epoch 1, Loss = 12192.0130
Epoch 2, Loss = 101.2101
Epoch 3, Loss = 33.3079
Epoch 4, Loss = 19.3111
Epoch 5, Loss = 12.6897


In [31]:
def predict_rating(user, item):
    if user not in df['user'].unique():
        return "User tidak ada di dataset."
    if item not in df['item'].unique():
        return "Item tidak ada di dataset."

    u = user_encoder.transform([user])[0]
    i = item_encoder.transform([item])[0]

    film = df[df['item'] == item].iloc[0]
    g1, g2, g3 = film['genre1_enc'], film['genre2_enc'], film['genre3_enc']

    X = torch.tensor([[u, i, g1, g2, g3]], dtype=torch.long).to(device)

    model.eval()
    with torch.no_grad():
        pred = model(X).cpu().numpy().item()   # <- FIX DI SINI

    return float(pred)


In [32]:
predict_rating(4617, 296)


2.5791378021240234

In [33]:
import gradio as gr

def gradio_predict(user, item):
    return predict_rating(int(user), int(item))

ui = gr.Interface(
    fn=gradio_predict,
    inputs=[gr.Number(label="User ID"), gr.Number(label="Item ID")],
    outputs=gr.Textbox(label="Predicted Rating"),
)

ui.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3bd7e5fa338981eb57.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


