# Run MSBEGCL on Kaggle

This notebook sets up the environment, compiles the necessary C++ mining tools, prepares the data, runs the mining algorithm to generate bicliques, and finally trains the MSBEGCL recommender system.

In [None]:
import os, sys, subprocess, time, shutil

# --- Configuration ---
# Repository to fetch, plus the model and dataset names used by later steps.
repo_url = 'https://github.com/yangzeha/MSBEGCL.git'
repo_dir = 'MSBEGCL'
model_name = 'MSBEGCL'
dataset_name = 'yelp2018'

# 1. Clone Repository
# A checkout left behind by a previous run is reused instead of cloned again.
if os.path.exists(repo_dir):
    print(f'{repo_dir} already exists.')
else:
    print(f'Cloning {repo_dir}...')
    subprocess.run(['git', 'clone', repo_url], check=True)

# 2. Setup Directories
# Step into the checkout unless the working directory already carries its name.
already_in_repo = os.path.basename(os.getcwd()) == repo_dir
if not already_in_repo:
    os.chdir(repo_dir)
print(f'Current working directory: {os.getcwd()}')

# Sub-project directories, given relative to the current working directory.
# The repository root is expected to contain both of them.
selfrec_path = 'SELFRec'
msbe_path = 'Similar-Biclique-Idx'

# 3. Install Dependencies
print('\n--- Installing Python Dependencies ---')
# The pinned core packages are mandatory: a failing pip aborts the run.
subprocess.run([sys.executable, '-m', 'pip', 'install', 'PyYAML==6.0.2', 'scipy==1.14.1', '-q'], check=True)
# faiss-cpu is best-effort. Only pip's non-zero exit is tolerated here, so a
# KeyboardInterrupt or SystemExit is no longer swallowed by a bare `except:`.
try:
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'faiss-cpu', '-q'], check=True)
except subprocess.CalledProcessError:
    print("faiss-cpu install failed, continuing...")

# 4. Compile C++ Tools (MSBE Mining)
print('\n--- Compiling C++ Mining Tools ---')

# edgelist2binary: built from a single translation unit.
# NOTE(review): assumes edgelist2binary.cpp needs no extra include paths.
edgelist2binary_src = os.path.join(msbe_path, 'datasets', 'edgelist2binary.cpp')
edgelist2binary_exe = './edgelist2binary'
build_converter_cmd = ['g++', '-O3', edgelist2binary_src, '-o', edgelist2binary_exe]
subprocess.run(build_converter_cmd, check=True)
print('edgelist2binary compiled.')

# msbe: built with -D_PrintResults_; per the original author's note this macro
# is required for the miner to print its results to stdout.
msbe_src = os.path.join(msbe_path, 'main.cpp')
msbe_exe = './msbe'
build_msbe_cmd = ['g++', '-O3', msbe_src, '-o', msbe_exe, '-I', msbe_path, '-D_PrintResults_']
subprocess.run(build_msbe_cmd, check=True)
print('msbe compiled.')

# 5. Data Preprocessing (Text -> Binary for Mining)
print(f'\n--- Preprocessing {dataset_name} for Mining ---')
train_file = os.path.join(selfrec_path, 'dataset', dataset_name, 'train.txt')
mining_graph_txt = 'graph.txt'

# Read the training interactions. Only the first two whitespace-separated
# fields of each line (user, item) are used; shorter lines are skipped.
users = set()
items = set()
edges = []
with open(train_file, 'r') as f:
    for line in f:
        parts = line.split()
        if len(parts) < 2:
            continue
        u, i = parts[0], parts[1]
        users.add(u)
        items.add(i)
        edges.append((u, i))

# Assign dense 0-based indices in sorted (string) order. The sorted lists
# double as the reverse mapping: index -> original id.
sorted_users = sorted(users)
sorted_items = sorted(items)
u_map = {u: idx for idx, u in enumerate(sorted_users)}
i_map = {i: idx for idx, i in enumerate(sorted_items)}

n1 = len(users)
n2 = len(items)

# Edge-list layout: a header line "n1 n2 m", then one "user item" pair per line.
# Item indices are shifted by n1 so both partitions share one id space
# (users in [0, n1), items in [n1, n1 + n2)).
# NOTE(review): the header format follows the original author's reading of
# edgelist2binary.cpp -- confirm against the tool.
with open(mining_graph_txt, 'w') as f:
    f.write(f"{n1} {n2} {len(edges)}\n")
    f.writelines(f"{u_map[u]} {i_map[i] + n1}\n" for u, i in edges)

print(f'Generated {mining_graph_txt} with {n1} users, {n2} items, {len(edges)} edges.')

# Convert to Binary
subprocess.run([edgelist2binary_exe, mining_graph_txt], check=True)
# Expected outputs (per the original note): graph_b_degree.bin, graph_b_adj.bin

# 6. Run Mining
print('\n--- Mining Bicliques ---')
# Thresholds passed to the enumeration run (original note: "Parameters from
# yaml or paper defaults").
sim_threshold = 0.8  # epsilon
size_threshold = 2   # tau

# A. Build Index
print('Building Index...')
# Equivalent shell command: ./msbe graph.txt 1 1 0.3 GRL3
build_index_cmd = [msbe_exe, mining_graph_txt, '1', '1', '0.3', 'GRL3']
subprocess.run(build_index_cmd, check=True)

# B. Enumerate
print('Enumerating...')
raw_bicliques_file = 'bicliques_raw.txt'
# Equivalent shell command:
#   ./msbe graph.txt 0 1 0.3 GRL3 1 GRL3 0 0 heu 4 epsilon tau 2
enumerate_cmd = [msbe_exe, mining_graph_txt]
enumerate_cmd += ['0', '1', '0.3', 'GRL3']
enumerate_cmd += ['1', 'GRL3']
enumerate_cmd += ['0', '0', 'heu']
enumerate_cmd += ['4', str(sim_threshold), str(size_threshold), '2']
# The miner's stdout is captured straight into the raw biclique file.
with open(raw_bicliques_file, 'w') as outfile:
    subprocess.run(enumerate_cmd, stdout=outfile, check=True)

# 7. Process Bicliques -> Model Format
print('\n--- Formatting Bicliques for Model ---')
final_biclique_path = os.path.join(selfrec_path, 'dataset', dataset_name, 'bicliques.txt')
count = 0

# The exact output format of the miner is not known here, so parsing is
# heuristic: '|', ':' and ',' are treated as separators and every purely
# numeric token is taken to be a vertex id.
# NOTE(review): if the miner also prints counts or sizes on lines that start
# with a digit, those numbers will be misread as vertex ids -- verify against
# real output.
separators_to_space = str.maketrans('|:,', '   ')

with open(raw_bicliques_file, 'r') as fr, open(final_biclique_path, 'w') as fw:
    for line in fr:
        line = line.strip()
        # Only lines that begin with a digit are considered biclique rows;
        # anything else (blank lines, textual log output) is dropped.
        if not line or not line[0].isdigit():
            continue

        current_users = []
        current_items = []
        for t in line.translate(separators_to_space).split():
            if not t.isdigit():
                continue  # Skip labels like "Size:"
            nid = int(t)
            if nid < n1:
                # Ids in [0, n1) are users. isdigit() already rules out
                # negatives, so nid is a valid index. The former check
                # `nid in sorted_users` compared an int against a list of
                # strings, was always False, and dropped every user.
                current_users.append(sorted_users[nid])
            else:
                # Ids in [n1, n1 + n2) are items; larger values are ignored.
                iid = nid - n1
                if iid < n2:
                    current_items.append(sorted_items[iid])

        # Keep only rows that have at least one vertex on each side.
        if current_users and current_items:
            # Write in standard format for MSBEGCL
            u_str = " ".join(current_users)
            i_str = " ".join(current_items)
            fw.write(f"{u_str} | {i_str}\n")
            count += 1

print(f"Processed {count} bicliques into {final_biclique_path}")

# 8. Update Configuration
import re

conf_path = os.path.join(selfrec_path, 'conf', 'MSBEGCL.yaml')
with open(conf_path, 'r') as f:
    conf_content = f.read()

# Update path to be local to SELFRec execution
new_path = f'./dataset/{dataset_name}/bicliques.txt'
# A callable replacement inserts the text literally, so backslashes or group
# references in the path are never interpreted as a regex template.
# subn() reports how many lines were rewritten, so a config without the key
# is detected instead of being reported as updated.
conf_content, replaced_count = re.subn(
    r'biclique\.file:.*',
    lambda _match: f'biclique.file: {new_path}',
    conf_content,
)

if replaced_count:
    with open(conf_path, 'w') as f:
        f.write(conf_content)
    print("Updated MSBEGCL.yaml with correct biclique path.")
else:
    # Nothing matched: the file content is unchanged, so it is not rewritten.
    print(f"Warning: no 'biclique.file:' entry found in {conf_path}; configuration left unchanged.")

# 9. Run MSBEGCL
print('\n--- Starting Traning ---')
os.chdir(selfrec_path)

# main.py is started unbuffered (-u) with stderr merged into stdout so that
# all of its output can be streamed through a single pipe.
process = subprocess.Popen(
    [sys.executable, '-u', 'main.py'],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1
)

# Send the model name on stdin, then close stdin so the child sees EOF
# afterwards. A child that already exited surfaces here as an OSError
# (e.g. BrokenPipeError); it is reported and the output is still streamed.
try:
    process.stdin.write(f'{model_name}\n')
    process.stdin.flush()
    process.stdin.close()
except OSError as e:
    print(f"Error writing to stdin: {e}")

# Streaming Output
# Iterating the pipe blocks until the next line or EOF, so there is no
# busy-polling while waiting for the child to be reaped.
for line in process.stdout:
    print(line.strip())
process.stdout.close()

# wait() reaps the child and returns its definitive exit status.
return_code = process.wait()
if return_code != 0:
    print("Training failed.")
else:
    print("Training finished successfully.")

# 在 Kaggle 上运行 MSBEGCL (SELFRec)

本 notebook 会：
- 克隆仓库 https://github.com/yangzeha/MSBEGCL.git
- 安装所需的依赖（调整 faiss 到 faiss-cpu 以便在 Kaggle 上安装）
- 进入仓库的 `SELFRec` 子目录并运行 `MSBEGCL` 模型（非交互式）

使用方法：将此 notebook 上传到 Kaggle，运行全部单元格。运行时间与 Kaggle 资源有关。

In [None]:
import os, sys, subprocess, time

# Core configuration (adapted to the MSBEGCL repository)
repo_dir = 'MSBEGCL'  # directory created by the clone (same as the repository name)
model_name = 'MSBEGCL'  # name of the model to run

# 1) Clone the repository (skipped when it already exists)
if os.path.exists(repo_dir):
    print(f'{repo_dir} already exists')
else:
    print('Cloning MSBEGCL repository...')
    subprocess.run(['git','clone','https://github.com/yangzeha/MSBEGCL.git'], check=True)

# 2) Enter the repository directory
cwd_name = os.path.basename(os.getcwd())
if cwd_name != repo_dir:
    os.chdir(repo_dir)

# [FIX]: detect and enter the SELFRec subdirectory when it exists,
# because the project layout is MSBEGCL/SELFRec/main.py
if os.path.isdir('SELFRec'):
    print('Entering subdirectory: SELFRec (detected project structure)')
    os.chdir('SELFRec')

print('Current dir:', os.getcwd())
# List the files in the current directory to help with debugging
print('Files in current dir:', os.listdir('.'))

# 3) Show requirements.txt (for a manual check)
# Purely informational: any failure to read the file is reported, not raised.
try:
    if os.path.exists('requirements.txt'):
        print('\n--- requirements.txt ---')
        # The context manager closes the handle; the former
        # open(...).read() left that to the garbage collector.
        with open('requirements.txt', 'r', encoding='utf-8') as req_file:
            print(req_file.read())
    else:
        print('\nWarning: requirements.txt not found in current directory.')
except (OSError, UnicodeError) as e:
    print('Failed to read requirements.txt:', e)

# 4) Install dependencies (keeps the original Kaggle adaptation logic)
print('\nStart installing dependencies...')
pip_install = [sys.executable, '-m', 'pip', 'install']
commands = [
    pip_install + ['--upgrade', 'pip', '-q'],
    pip_install + ['PyYAML==6.0.2', 'scipy==1.14.1', '-q'],
]
# Each command is attempted in turn; a failure is only reported as a warning.
for pip_cmd in commands:
    print('Running:', ' '.join(pip_cmd))
    try:
        subprocess.run(pip_cmd, check=True)
    except subprocess.CalledProcessError as err:
        print('Command failed (warning):', err)

# Install faiss-cpu separately: try the pinned version first, then fall back
# to the latest release; a failure of both is non-fatal.
try:
    print('Installing faiss-cpu...')
    subprocess.run(pip_install + ['faiss-cpu==1.13.1', '-q'], check=True)
except subprocess.CalledProcessError:
    print('faiss-cpu==1.13.1 failed, attempting to install latest faiss-cpu (non-fatal)')
    try:
        subprocess.run(pip_install + ['faiss-cpu', '-q'], check=True)
    except subprocess.CalledProcessError as fallback_err:
        print('faiss-cpu install failed (continuing):', fallback_err)
print('Dependency installation step finished.')

# 5) Check the torch installation
# Diagnostic only: an import or CUDA-query failure is printed, not raised.
try:
    import torch
    torch_version = torch.__version__
    print('torch version:', torch_version)
    cuda_ok = torch.cuda.is_available()
    print('cuda available:', cuda_ok)
except Exception as err:
    print('torch not available or import failed:', err)

# 6) Run the MSBEGCL model with live output
print(f'\nStart running {model_name} via main.py (streaming output)...')
start = time.time()

# Top-level boundary: any failure to launch or stream the child is reported
# so that the total run time below is always printed.
try:
    # main.py is started unbuffered (-u) with stderr merged into stdout so
    # that all of its output can be streamed through a single pipe.
    process = subprocess.Popen(
        [sys.executable, '-u', 'main.py'],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1
    )

    # Write the model name followed by a newline, then close stdin so the
    # child sees EOF afterwards. A child that already exited surfaces here
    # as an OSError (e.g. BrokenPipeError).
    try:
        process.stdin.write(f'{model_name}\n')
        process.stdin.flush()
        process.stdin.close()
    except OSError as e:
        print(f"Error writing to stdin: {e}")

    # Stream the output. Iterating the pipe blocks until the next line or
    # EOF, so there is no busy-polling while waiting for the child to exit.
    print("\n--- Model Output Start ---\n")
    for line in process.stdout:
        print(line.strip())
    process.stdout.close()

    # wait() reaps the child and returns its definitive exit status.
    rc = process.wait()
    print(f"\n--- Model Output End (Exit Code: {rc}) ---")

    if rc != 0:
        print(f"{model_name} execution failed.")

except Exception as e:
    print('Running main.py failed with exception:', e)

print(f'\nRun finished. Total time: {time.time() - start:.2f}s')