# Setup

In [None]:
# List the contents of the current working directory (IPython `%ls` line magic).
%ls

In [None]:
# Google Drive folder that download_dataset() fetches.
DRIVE_URL = "https://drive.google.com/drive/folders/1ARBY9cIGj_jigi5Y88CtUy-GMj2clrXj"

# Local folder that receives the downloaded files.
RAW_DATA_PATH = "./raw"

# NOTE(review): the experiment cells read ./data/<dataset>_binary.npy directly
# rather than through this constant — presumably convert.py writes there; confirm.
PROCESSED_DATA_PATH = "./data"

In [4]:
import gdown
import zipfile
import os

Loading matrix from ./data/bbc_binary.npy...
  Matrix Shape: (2225, 9635)
Calculating Ground Truth (jaccard_similarity) for 2474200 pairs...

Processing Threshold 0.1
  Valid Pairs: 258072
  > Running BinSketch...
  > Running BCS...
  > Running MinHash...
  Saved: .\result_jaccard_similarity_minus_log_mse_t0_1.png

Processing Threshold 0.2
  Valid Pairs: 2196
  > Running BinSketch...
  > Running BCS...
  > Running MinHash...
  Saved: .\result_jaccard_similarity_minus_log_mse_t0_2.png

Processing Threshold 0.3
  Valid Pairs: 376
  > Running BinSketch...
  > Running BCS...
  > Running MinHash...
  Saved: .\result_jaccard_similarity_minus_log_mse_t0_3.png

Processing Threshold 0.4
  Valid Pairs: 230
  > Running BinSketch...
  > Running BCS...
  > Running MinHash...
  Saved: .\result_jaccard_similarity_minus_log_mse_t0_4.png

Processing Threshold 0.5
  Valid Pairs: 191
  > Running BinSketch...
  > Running BCS...
  > Running MinHash...
  Saved: .\result_jaccard_similarity_minus_log_mse_t0_5

# Prepare dataset

In [None]:
def download_dataset(drive_url, target_folder):
    """Download a Google Drive folder, or a zipped Drive file, into ``target_folder``.

    A URL containing ``"folder"`` is treated as a Drive folder and fetched with
    ``gdown.download_folder``. Any other URL is treated as a single zip
    archive: it is downloaded to ``temp_dataset.zip`` inside ``target_folder``,
    extracted there, and the archive is deleted after a successful extraction.

    Progress and failures are reported with ``print``; the function always
    returns ``None``.

    Args:
        drive_url: Google Drive sharing URL of a folder or of a .zip file.
        target_folder: Directory that receives the files; created if missing.
    """
    print(f"Processing: {drive_url}")

    # exist_ok avoids the race between a separate exists() check and makedirs().
    os.makedirs(target_folder, exist_ok=True)

    # "drive/folders" already contains "folder", so a single substring test
    # matches exactly the same URLs as the two-part condition it replaces.
    if "folder" in drive_url:
        print("Downloading individual files directly...")
        downloaded = gdown.download_folder(
            drive_url, output=target_folder, quiet=False, use_cookies=False
        )
        # download_folder returns a falsy value when it fails, so success must
        # not be reported unconditionally.
        if not downloaded:
            print("[ERROR] Folder download failed or returned no files.")
            return
        print(f"\n[SUCCESS] Folder contents downloaded to: {target_folder}")
        return

    print("\n[INFO] Detected a Drive FILE link.")
    zip_path = os.path.join(target_folder, "temp_dataset.zip")
    output = gdown.download(drive_url, zip_path, quiet=False, fuzzy=True)

    if not output:
        print("[ERROR] Download failed.")
        return

    print(f"\nUnzipping {output}...")
    try:
        with zipfile.ZipFile(output, 'r') as zip_ref:
            zip_ref.extractall(target_folder)
    except zipfile.BadZipFile:
        # The download is left on disk so the user can inspect what was fetched.
        print("[ERROR] The downloaded file was not a valid zip file.")
        print("Check if the file on Drive is actually a .zip archive.")
        return

    print(f"[SUCCESS] Extracted to: {target_folder}")

    # Clean up the zip file once its contents have been extracted.
    os.remove(output)

In [None]:
# Fetch the shared Drive folder into the raw-data directory.
download_dataset(drive_url=DRIVE_URL, target_folder=RAW_DATA_PATH)

In [None]:
# Run the project's convert.py script in a subprocess.
# NOTE(review): presumably this turns the downloaded raw files into the
# ./data/<dataset>_binary.npy matrices the experiments load — confirm.
!python convert.py

### Experiment 1: Accuracy of Estimation

#### Experiment on NYTimes to calculate $\mathrm{MSE}$ using Inner Product

In [None]:
THRESHOLD = [120, 150, 180, 200, 220, 250, 270, 300]
threshold_str = ' '.join([str(t) for t in THRESHOLD])
DATASET = 'nytimes'
data_path = f'./data/{DATASET}_binary.npy'

!python main.py --seed 42 --data_path {data_path} --algo BinSketch BCS --threshold {threshold_str} --similarity_score inner_product --eval_metric mse

#### Experiment on ENRON to calculate $-\log(\mathrm{MSE})$ using Cosine Similarity

In [None]:
THRESHOLD = [.1, .2, .3, .4, .5, .7, .8, .9]
threshold_str = ' '.join([str(t) for t in THRESHOLD])
DATASET = 'enron'
data_path = f'./data/{DATASET}_binary.npy'

!python main.py --seed 42 --data_path {data_path} --algo BinSketch SimHash MinHash --threshold {threshold_str} --similarity_score cosine_similarity --eval_metric minus_log_mse

#### Experiment on NYTimes to calculate $-\log(\mathrm{MSE})$ using Jaccard Similarity

In [1]:
THRESHOLD = [.1, .2, .3, .4, .5, .7, .8, .9]
threshold_str = ' '.join([str(t) for t in THRESHOLD])
DATASET = 'nytimes'
data_path = f'./data/{DATASET}_binary.npy'

!python main.py --seed 42 --data_path {data_path} --algo BinSketch BCS MinHash --threshold {threshold_str} --similarity_score jaccard_similarity --eval_metric minus_log_mse

^C


#### Experiment on BBC to calculate $-\log(\mathrm{MSE})$ using Jaccard Similarity

In [None]:
THRESHOLD = [.1, .2, .3, .4, .5, .7, .8, .9]
threshold_str = ' '.join([str(t) for t in THRESHOLD])
DATASET = 'bbc'
data_path = f'./data/{DATASET}_binary.npy'

!python main.py --seed 42 --data_path {data_path} --algo BinSketch BCS MinHash --threshold {threshold_str} --similarity_score jaccard_similarity --eval_metric minus_log_mse

### Experiment 2: Ranking

# GPU-Accelerated Experiments

Enable GPU acceleration by adding the `--use_gpu` flag to your experiments. This provides a 5-20x speedup on large datasets.

**Requirements**: NVIDIA GPU with CUDA + CuPy installed (`pip install -e ".[gpu]"`)

In [None]:
# Example: Run the same experiment with GPU acceleration
THRESHOLD = [.1, .2, .3, .4, .5, .7, .8, .9]
threshold_str = ' '.join([str(t) for t in THRESHOLD])
DATASET = 'nytimes'
data_path = f'./data/{DATASET}_binary.npy'

# Add --use_gpu flag for GPU acceleration
!python main.py --seed 42 --data_path {data_path} --algo BinSketch BCS MinHash --threshold {threshold_str} --similarity_score jaccard_similarity --eval_metric minus_log_mse --use_gpu