# Run MSBEGCL on Kaggle

This notebook sets up the environment, compiles the necessary C++ mining tools, prepares the data, runs the mining algorithm to generate bicliques, and finally trains the MSBEGCL recommender system.

In [None]:
import os, sys, subprocess, time, shutil

# --- Configuration ---
repo_url = 'https://github.com/yangzeha/MSBEGCL.git'
repo_dir = 'MSBEGCL'
model_name = 'MSBEGCL'
dataset_name = 'yelp2018'

# 1. Clean and Clone Repository
# A checkout left behind by an earlier (possibly failed) run is deleted first
# so that every run starts from a fresh clone.
if os.path.exists(repo_dir):
    print(f"Removing existing '{repo_dir}' to ensure a fresh clone...")
    try:
        shutil.rmtree(repo_dir)
        print("Removal successful.")
    except OSError as e:
        print(f"Error removing directory: {e}")
        # Fall back to the shell. Its exit status is deliberately not checked:
        # if the directory survives, the clone below fails and stops the run.
        subprocess.run(['rm', '-rf', repo_dir])

print(f'Cloning {repo_dir} from {repo_url} (branch: master)...')
try:
    # Name the destination explicitly so the checkout always lands in
    # `repo_dir` -- the same directory that was removed above -- instead of
    # whatever name git derives from the URL.
    subprocess.run(['git', 'clone', '-b', 'master', repo_url, repo_dir], check=True)
    print("Clone successful.")
except subprocess.CalledProcessError as e:
    print(f"Git clone failed: {e}")
    sys.exit(1)

# 2. Setup Directories
# Enter the checkout unless the current directory already carries its name.
current_dir_name = os.path.basename(os.getcwd())
if current_dir_name != repo_dir:
    os.chdir(repo_dir)
print(f'Current working directory: {os.getcwd()}')

# Both sub-projects are expected directly under the repository root.
selfrec_path = 'SELFRec'
msbe_path = 'Similar-Biclique-Idx'

# Debug: list what is actually on disk so a missing folder shows up in the log.
print('\n--- Directory Structure Check ---')
print(f"Root contents: {os.listdir('.')}")
if not os.path.exists(msbe_path):
    print(f"Error: {msbe_path} does not exist.")
else:
    print(f"Contents of {msbe_path}: {os.listdir(msbe_path)}")
    datasets_path = os.path.join(msbe_path, 'datasets')
    if not os.path.exists(datasets_path):
        print(f"Error: {datasets_path} does not exist.")
    else:
        print(f"Contents of {datasets_path}: {os.listdir(datasets_path)}")


# 3. Install Dependencies
print('\n--- Installing Python Dependencies ---')
# The pinned packages are required: a failed install raises and stops the run.
subprocess.run([sys.executable, '-m', 'pip', 'install', 'PyYAML==6.0.2', 'scipy==1.14.1', '-q'], check=True)
# faiss-cpu is best-effort. Only a failing pip exit status is tolerated;
# anything else (e.g. KeyboardInterrupt) must still propagate.
try:
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'faiss-cpu', '-q'], check=True)
except subprocess.CalledProcessError:
    print("faiss-cpu install failed, continuing...")

# 4. Compile C++ Tools (MSBE Mining)
print('\n--- Compiling C++ Mining Tools ---')

# Source locations inside the repository and the binaries built from them.
edgelist2binary_src = os.path.join(msbe_path, 'datasets', 'edgelist2binary.cpp')
edgelist2binary_exe = './edgelist2binary'
msbe_src = os.path.join(msbe_path, 'main.cpp')
msbe_exe = './msbe'

# One entry per tool: (source file, g++ command line, success message).
# The tools are built in the listed order; a failing compiler raises.
build_steps = [
    (
        edgelist2binary_src,
        ['g++', '-O3', edgelist2binary_src, '-o', edgelist2binary_exe],
        'edgelist2binary compiled.',
    ),
    (
        msbe_src,
        ['g++', '-O3', msbe_src, '-o', msbe_exe, '-I', msbe_path, '-D_PrintResults_'],
        'msbe compiled.',
    ),
]

for src_file, compile_cmd, done_msg in build_steps:
    if os.path.exists(src_file):
        subprocess.run(compile_cmd, check=True)
        print(done_msg)
    else:
        # A missing source is reported but does not abort the script.
        print(f"CRITICAL ERROR: Source file {src_file} not found!")

# 5. Data Preprocessing (Text -> Binary for Mining)
print(f'\n--- Preprocessing {dataset_name} for Mining ---')
train_file = os.path.join(selfrec_path, 'dataset', dataset_name, 'train.txt')
mining_graph_txt = 'graph.txt'

# ID tables and sizes that the biclique-formatting step reads back later.
# They are defined up front so they exist (empty) even when the training file
# is missing, instead of surfacing later as a NameError.
sorted_users = []
sorted_items = []
n1 = 0
n2 = 0

if not os.path.exists(train_file):
    print(f"CRITICAL ERROR: Data file {train_file} not found!")
else:
    # Read Train Data: the first two whitespace-separated fields of each line
    # are taken as (user, item); lines with fewer than two fields are skipped.
    users = set()
    items = set()
    edges = []
    with open(train_file, 'r') as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 2:
                u, i = parts[0], parts[1]
                users.add(u)
                items.add(i)
                edges.append((u, i))

    # Map IDs: raw IDs stay strings (so the ordering is lexicographic) and are
    # assigned dense 0-based indices per side.
    sorted_users = sorted(users)
    sorted_items = sorted(items)
    u_map = {u: idx for idx, u in enumerate(sorted_users)}
    i_map = {i: idx for idx, i in enumerate(sorted_items)}

    n1 = len(sorted_users)
    n2 = len(sorted_items)

    with open(mining_graph_txt, 'w') as f:
        # Header line: user count, item count, edge count.
        f.write(f"{n1} {n2} {len(edges)}\n")
        # Item indices are offset by n1 so they never collide with user indices.
        f.writelines(f"{u_map[u]} {i_map[i] + n1}\n" for u, i in edges)

    print(f'Generated {mining_graph_txt} with {n1} users, {n2} items, {len(edges)} edges.')

    # Convert to Binary
    if os.path.exists(edgelist2binary_exe):
        subprocess.run([edgelist2binary_exe, mining_graph_txt], check=True)
        # Expected outputs: graph_b_degree.bin, graph_b_adj.bin
    else:
        print("Skipping binary conversion due to compilation failure.")

# 6. Run Mining
print('\n--- Mining Bicliques ---')
# Parameters from yaml or paper defaults
sim_threshold = 0.8  # epsilon
size_threshold = 2   # tau

# Mining needs both the compiled binary and the generated edge list.
mining_ready = os.path.exists(msbe_exe) and os.path.exists(mining_graph_txt)
if not mining_ready:
    print("Skipping mining due to compliation or data failure.")
else:
    # A. Build Index
    # Shell form: ./msbe graph.txt 1 1 0.3 GRL3
    print('Building Index...')
    index_cmd = [msbe_exe, mining_graph_txt, '1', '1', '0.3', 'GRL3']
    subprocess.run(index_cmd, check=True)

    # B. Enumerate
    # Shell form: ./msbe graph.txt 0 1 0.3 GRL3 1 GRL3 0 0 heu 4 epsilon tau 2
    print('Enumerating...')
    raw_bicliques_file = 'bicliques_raw.txt'
    enumerate_cmd = [
        msbe_exe, mining_graph_txt,
        '0', '1', '0.3', 'GRL3',
        '1', 'GRL3',
        '0', '0', 'heu',
        '4', str(sim_threshold), str(size_threshold), '2',
    ]
    # The tool's stdout is captured in the raw file for the formatting step.
    with open(raw_bicliques_file, 'w') as outfile:
        subprocess.run(enumerate_cmd, stdout=outfile, check=True)

# 7. Process Bicliques -> Model Format
print('\n--- Formatting Bicliques for Model ---')
final_biclique_path = os.path.join(selfrec_path, 'dataset', dataset_name, 'bicliques.txt')
count = 0

if os.path.exists('bicliques_raw.txt'):
    # '|', ':' and ',' are treated as plain separators between node IDs.
    separators = str.maketrans('|:,', '   ')
    with open('bicliques_raw.txt', 'r') as fr, open(final_biclique_path, 'w') as fw:
        for line in fr:
            line = line.strip()
            # Only lines that start with a digit are parsed as biclique rows.
            if not line or not line[0].isdigit():
                continue

            current_users = []
            current_items = []

            for t in line.translate(separators).split():
                if not t.isdigit():
                    continue
                nid = int(t)
                if nid < n1:
                    # A mining ID below n1 is a position in sorted_users (a
                    # list of the original string IDs), so index it directly.
                    # A membership test (`nid in sorted_users`) would compare
                    # an int against strings, never match, and drop every user.
                    current_users.append(sorted_users[nid])
                else:
                    # Items were written to the graph file as index + n1.
                    iid = nid - n1
                    if iid < n2:
                        current_items.append(sorted_items[iid])

            # Keep only rows that have at least one user and one item.
            if current_users and current_items:
                # Write in standard format for MSBEGCL
                u_str = " ".join(current_users)
                i_str = " ".join(current_items)
                fw.write(f"{u_str} | {i_str}\n")
                count += 1
    print(f"Processed {count} bicliques into {final_biclique_path}")
else:
    print("Warning: bicliques_raw.txt not found. Using empty or previous file?")

# 8. Update Configuration
import re

conf_path = os.path.join(selfrec_path, 'conf', 'MSBEGCL.yaml')
with open(conf_path, 'r') as f:
    conf_content = f.read()

# Update path to be local to SELFRec execution
new_path = f'./dataset/{dataset_name}/bicliques.txt'
# A callable replacement inserts the path literally (no backslash or group
# expansion), and subn reports whether the key was present at all.
conf_content, n_replaced = re.subn(
    r'biclique\.file:.*',
    lambda _match: f'biclique.file: {new_path}',
    conf_content,
)

if n_replaced:
    with open(conf_path, 'w') as f:
        f.write(conf_content)
    print("Updated MSBEGCL.yaml with correct biclique path.")
else:
    # Nothing matched, so the file content is unchanged; say so rather than
    # reporting a successful update.
    print(f"Warning: no 'biclique.file:' entry found in {conf_path}; configuration left unchanged.")

# 9. Run MSBEGCL
print('\n--- Starting Traning ---')
os.chdir(selfrec_path)

# main.py runs unbuffered (-u) with stderr merged into stdout so that its
# output can be echoed line by line while it is still running.
process = subprocess.Popen(
    [sys.executable, '-u', 'main.py'],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1
)

# The model name is sent on stdin (presumably answering a prompt in main.py --
# TODO confirm). stdin is closed in `finally` so the child always sees EOF,
# even after a failed write, instead of blocking on input forever.
try:
    try:
        process.stdin.write(f'{model_name}\n')
        process.stdin.flush()
    finally:
        process.stdin.close()
except Exception as e:
    # Best-effort: report the problem and still stream whatever the child prints.
    print(f"Error writing to stdin: {e}")

# Streaming Output: iteration ends when the child closes its stdout.
for line in process.stdout:
    print(line.strip())

# wait() blocks until the child has exited and returns its exit status,
# avoiding a busy poll between end-of-output and process termination.
if process.wait() != 0:
    print("Training failed.")
else:
    print("Training finished successfully.")