In [1]:
!pip install pandas scikit-learn torch torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.3.0+cu121.html

Looking in links: https://data.pyg.org/whl/torch-2.3.0+cu121.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/torch_scatter-2.1.2%2Bpt23cu121-cp311-cp311-linux_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m114.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/torch_sparse-0.6.18%2Bpt23cu121-cp311-cp311-linux_x86_64.whl (5.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m106.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/torch_cluster-1.6.3%2Bpt23cu121-cp311-cp311-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m103.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-spline-conv
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/torch

In [19]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

users_df = pd.read_csv('users.csv')
repos_df = pd.read_csv('repos.csv')
stars_df = pd.read_csv('stars.csv')

print(f'Loaded {len(users_df)} user, {len(repos_df)} repos, and {len(stars_df)} relationships')

Loaded 2617 user, 100 repos, and 2693 relationships


In [20]:
def preprocess_data(users_df, repos_df):

  # Missing value
  users_df['bio'].fillna('', inplace=True)
  repos_df['description'].fillna('', inplace=True)
  repos_df['language'].fillna('', inplace=True)
  repos_df['stargazers_count'].fillna(0, inplace=True)

  # TF-IDF
  all_text = pd.concat([users_df['bio'], repos_df['description'], repos_df['language']], ignore_index=True)
  vectorizer = TfidfVectorizer(max_features=128)
  vectorizer.fit(all_text)

  user_bio_features = vectorizer.transform(users_df['bio']).toarray()
  repo_desc_features = vectorizer.transform(repos_df['description']).toarray()
  repo_lang_features = vectorizer.transform(repos_df['language']).toarray()

  repo_stars = repos_df[['stargazers_count']].to_numpy(dtype=np.float32)
  repo_features = np.concatenate([repo_desc_features, repo_lang_features, repo_stars], axis=1)

  return user_bio_features, repo_features, vectorizer

In [21]:
print(f'Preprocessing data...')
user_features, repos_features, vectorizer = preprocess_data(users_df, repos_df)

Preprocessing data...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  users_df['bio'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repos_df['description'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

In [22]:
def create_hetero_object(users_df, repos_df, stars_df, user_features, repo_features):
  data = HeteroData()

  data['user'].x = torch.tensor(user_features, dtype=torch.float32)
  data['repo'].x = torch.tensor(repo_features, dtype=torch.float32)

  # Mapping from unique IDs (login/fullname)
  user_map = {login: i for i, login in enumerate(users_df['login'])}
  repo_map = {name: i for i, name in enumerate(repos_df['full_name'])}

  # Ubah source index dri edge ke index integer
  source_indices = torch.tensor([user_map[login] for login in stars_df['source']])
  target_indices = torch.tensor([repo_map[name] for name in stars_df['target']])

  edge_index = torch.stack([source_indices, target_indices], dim=0)

  # Add informasi edge ke objek graf
  # format (source_node_type, edge_type, target_node_type)
  data['user', 'stars', 'repo'].edge_index = edge_index

  return data, user_map, repo_map

In [23]:
print(f'Creating HeteroData object...')
graphdata, user_map, repo_map = create_hetero_object(users_df, repos_df, stars_df, user_features, repos_features)

print("Pytorch Geometric Graph Object succesfully created...")
print("Berikut adalah struktur graf:")
print(graphdata)

Creating HeteroData object...
Pytorch Geometric Graph Object succesfully created...
Berikut adalah struktur graf:
HeteroData(
  user={ x=[2617, 128] },
  repo={ x=[100, 257] },
  (user, stars, repo)={ edge_index=[2, 2693] }
)


In [24]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, HeteroConv

class HeteroGNN(torch.nn.Module):
  def __init__(self, hidden_channels, out_channels):
    super().__init__()

    self.conv1 = HeteroConv({
        ('user', 'stars', 'repo'): SAGEConv((-1, -1), hidden_channels),
        ('repo', 'rev_stars', 'user'): SAGEConv((-1, -1), hidden_channels),
    }, aggr='sum')

    self.conv2 = HeteroConv({
        ('user', 'stars', 'repo'): SAGEConv((-1, -1), out_channels),
        ('repo', 'rev_stars', 'user'): SAGEConv((-1, -1), out_channels),
    }, aggr='sum')

  def forward(self, x_dict, edge_index_dict):
    """
    Mendefinisikan feed forward model
    x_dict: Dictionary berisi fitur-fitur node
    edge_index_dict: Dictionary berisi konektvitas edge
    """

    hidden_embeds = self.conv1(x_dict, edge_index_dict)
    final_embeds = {key: x.relu() for key, x in hidden_embeds.items()}

    final_embeds = self.conv2(hidden_embeds, edge_index_dict)

    return final_embeds



# --- Verifikasi Model ---

import torch_geometric.transforms as T

transform = T.ToUndirected()
graph_data_undirected = transform(graphdata)

print('Struktur Graph setelah ditambahkan undirected edge:')
print(graph_data_undirected)


model = HeteroGNN(hidden_channels=64, out_channels=32)

print("\nArsitektur Model GNN:")
print(model)

with torch.no_grad():
  output_embeddings = model(graph_data_undirected.x_dict, graph_data_undirected.edge_index_dict)

print("\nOutput model (dictionary of embeddings):")
for node_type, embeddings in output_embeddings.items():
  print(f"  - Tipe Node: '{node_type}', Ukuran Embedding: {embeddings.shape}")

Struktur Graph setelah ditambahkan undirected edge:
HeteroData(
  user={ x=[2617, 128] },
  repo={ x=[100, 257] },
  (user, stars, repo)={ edge_index=[2, 2693] },
  (repo, rev_stars, user)={ edge_index=[2, 2693] }
)

Arsitektur Model GNN:
HeteroGNN(
  (conv1): HeteroConv(num_relations=2)
  (conv2): HeteroConv(num_relations=2)
)

Output model (dictionary of embeddings):
  - Tipe Node: 'repo', Ukuran Embedding: torch.Size([100, 32])
  - Tipe Node: 'user', Ukuran Embedding: torch.Size([2617, 32])


In [25]:
from operator import neg
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
import torch_geometric.transforms as T


# --- 1. Mempersiapkan Data untuk Link Prediction ---
# Kita perlu membagi edge kita menjadi set train/validation/test.
# PyG menyediakan transform yang sangat berguna untuk ini.
# T.RandomLinkSplit akan:
#  - Membagi edge 'stars' menjadi 3 set.
#  - Membuat sampel edge negatif (link yang tidak ada) secara otomatis untuk setiap set.
#  - Menghapus edge validation & test dari graf utama agar model tidak "curang".

link_split_transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    is_undirected=True,
    split_labels=True,
    add_negative_train_samples=True,
    edge_types=[('user', 'stars', 'repo')],
    rev_edge_types=[('repo', 'rev_stars', 'user')],
)

train_data, val_data, test_data = link_split_transform(graph_data_undirected)

print("\n--- Data setelah di-split untuk Link Prediction ---")
print("Data Latih (Train):", train_data)
print("Data Validasi (Val):", val_data)
print("Data Uji (Test):", test_data)


# --- 2. Mendefinisikan Model Akhir dengan Decoder ---
# Model ini akan membungkus GNN (encoder) dan menambahkan logika decoder.

class Model(torch.nn.Module):
  def __init__(self, hidden_channels, out_channels):
    super().__init__()
    self.encoder = HeteroGNN(hidden_channels, out_channels)

  def forward(self, data):
    embeddings = self.encoder(data.x_dict, data.edge_index_dict)
    return embeddings

  def decode(self, embeddings, edge_label_index):
    user_embeds = embeddings['user'][edge_label_index[0]]
    repo_embeds = embeddings['repo'][edge_label_index[1]]

    preds = (user_embeds * repo_embeds).sum(dim=-1)
    return preds

  def decode_all(self, embeddings):
    preds = torch.matmul(embeddings['user'], embeddings['repo'].t())
    return preds


# --- 3. Training Loop ---

model = Model(hidden_channels=64, out_channels=32)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def train():
  model.train()
  optimizer.zero_grad()

  # Dapetin embedding dri GNN
  embeddings =  model.encoder(train_data.x_dict, train_data.edge_index_dict)

  # Gabungin edge (+/-) manual
  edge_store = train_data['user', 'stars', 'repo']
  pos_index = edge_store.pos_edge_label_index
  neg_index = edge_store.neg_edge_label_index

  # Gabungin index dari edge (+/-)
  edge_label_index = torch.cat([pos_index, neg_index], dim=1)

  # Gabungin label (1 buat positif, 0 buat negatif)
  pos_label = edge_store.pos_edge_label
  neg_label = edge_store.neg_edge_label
  edge_label = torch.cat([pos_label, neg_label], dim=0)

  # Dapetin prediksi dari decoder
  preds = model.decode(embeddings, edge_label_index)

  # Loss
  loss = F.binary_cross_entropy_with_logits(preds, edge_label)

  # backpropagation
  loss.backward()
  optimizer.step()

  return float(loss)


# --- 4. Evaluation Loop ---
@torch.no_grad()
def test(data):
  model.eval()
  embeddings = model.encoder(data.x_dict, data.edge_index_dict)

  edge_store = data['user', 'stars', 'repo']
  pos_index = edge_store.pos_edge_label_index
  neg_index = edge_store.neg_edge_label_index
  edge_label_index = torch.cat([pos_index, neg_index], dim=1)

  pos_label = edge_store.pos_edge_label
  neg_label = edge_store.neg_edge_label
  edge_label = torch.cat([pos_label, neg_label], dim=0)

  preds = model.decode(embeddings, edge_label_index)

  # AUC
  return roc_auc_score(edge_label.cpu().numpy(), preds.cpu().numpy())


# --- 5. Jalankan Pelatihan ---
print("\n--- Training Model ---")
for epoch in range(1, 201):
  loss = train()

  if epoch % 10 == 0:
    train_auc = test(train_data)
    val_auc = test(val_data)
    test_auc = test(test_data)
    print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}, Train AUC: {train_auc:.4f}, Val AUC {val_auc:.4f}, Test AUC: {test_auc:.4f}")

print("\n--- Training Done ---")
final_test_auc = test(test_data)
print(f"Skor AUC final pada data uji: {final_test_auc:.4f}")



--- Data setelah di-split untuk Link Prediction ---
Data Latih (Train): HeteroData(
  user={ x=[2617, 128] },
  repo={ x=[100, 257] },
  (user, stars, repo)={
    edge_index=[2, 2155],
    pos_edge_label=[2155],
    pos_edge_label_index=[2, 2155],
    neg_edge_label=[2155],
    neg_edge_label_index=[2, 2155],
  },
  (repo, rev_stars, user)={ edge_index=[2, 2155] }
)
Data Validasi (Val): HeteroData(
  user={ x=[2617, 128] },
  repo={ x=[100, 257] },
  (user, stars, repo)={
    edge_index=[2, 2155],
    pos_edge_label=[269],
    pos_edge_label_index=[2, 269],
    neg_edge_label=[269],
    neg_edge_label_index=[2, 269],
  },
  (repo, rev_stars, user)={ edge_index=[2, 2155] }
)
Data Uji (Test): HeteroData(
  user={ x=[2617, 128] },
  repo={ x=[100, 257] },
  (user, stars, repo)={
    edge_index=[2, 2424],
    pos_edge_label=[269],
    pos_edge_label_index=[2, 269],
    neg_edge_label=[269],
    neg_edge_label_index=[2, 269],
  },
  (repo, rev_stars, user)={ edge_index=[2, 2424] }
)

--- T

In [26]:
import pickle
import os

SAVE_PATH = os.path.join('./', "saved_model_assets")
os.makedirs(SAVE_PATH, exist_ok=True)

print(f"Aset akan disimpan di: {SAVE_PATH}")

model_save_path = os.path.join(SAVE_PATH, "gnn_model_state.pth")
torch.save(model.state_dict(), model_save_path)
print(f"Model state dictionary berhasil disimpan ke: {model_save_path}")

vectorizer_save_path = os.path.join(SAVE_PATH, "tfidf_vectorizer.pkl")
with open(vectorizer_save_path, 'wb') as f:
  pickle.dump(vectorizer, f)
print(f'TF-IDF Vectorizer berhasil diisimpan ke: {vectorizer_save_path}')

user_map_path = os.path.join(SAVE_PATH, "user_map.pkl")
with open(user_map_path, 'wb') as f:
    pickle.dump(user_map, f)
print(f"User map berhasil disimpan ke: {user_map_path}")

repo_map_path = os.path.join(SAVE_PATH, "repo_map.pkl")
with open(repo_map_path, 'wb') as f:
    pickle.dump(repo_map, f)
print(f"Repo map berhasil disimpan ke: {repo_map_path}")

print("\n--- Semua aset model berhasil disimpan! ---")

Aset akan disimpan di: ./saved_model_assets
Model state dictionary berhasil disimpan ke: ./saved_model_assets/gnn_model_state.pth
TF-IDF Vectorizer berhasil diisimpan ke: ./saved_model_assets/tfidf_vectorizer.pkl
User map berhasil disimpan ke: ./saved_model_assets/user_map.pkl
Repo map berhasil disimpan ke: ./saved_model_assets/repo_map.pkl

--- Semua aset model berhasil disimpan! ---
