# Setup

In [1]:
# Read the GitHub access token from the notebook's Kaggle secrets store
# instead of hard-coding it in a cell.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# The token is interpolated into the `git clone` URL below; never print or display it.
GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")

In [2]:
BRANCH_NAME = 'feature/gpu-acceleration'
!git clone -b {BRANCH_NAME} https://{GITHUB_TOKEN}@github.com/tadtd/binsketch-algorithm

Cloning into 'binsketch-algorithm'...
remote: Enumerating objects: 222, done.[K
remote: Counting objects: 100% (222/222), done.[K
remote: Compressing objects: 100% (146/146), done.[K
remote: Total 222 (delta 128), reused 163 (delta 70), pack-reused 0 (from 0)[K
Receiving objects: 100% (222/222), 129.51 KiB | 2.49 MiB/s, done.
Resolving deltas: 100% (128/128), done.


In [3]:
# Work from the repository root: later cells call convert.py, save_ground_truth.py
# and main.py, and read ./raw and ./data, by relative path.
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel looks for a nested binsketch-algorithm directory.
%cd binsketch-algorithm

/kaggle/working/binsketch-algorithm


# Configuration

In [4]:
import os
import gdown
import zipfile

# Process data

In [5]:
# Google Drive folder that is passed to download_dataset().
DRIVE_URL = "https://drive.google.com/drive/folders/1ARBY9cIGj_jigi5Y88CtUy-GMj2clrXj"
# Directory the Drive download is written into.
RAW_DATA_PATH = "./raw"
# Directory for the processed datasets.
PROCESSED_DATA_PATH = "./data"

In [6]:
def download_dataset(drive_url, target_folder):
    """Download a Google Drive folder, or a zipped Drive file, into ``target_folder``.

    A link containing ``"folder"`` is fetched file-by-file with
    ``gdown.download_folder``. Any other link is treated as a single zip
    archive: it is downloaded to ``<target_folder>/temp_dataset.zip``,
    extracted into ``target_folder`` and the archive is then deleted.

    Progress and outcomes are reported with ``print``. A download that gdown
    reports as failed, or an archive that is not a valid zip, is reported as
    ``[ERROR]`` rather than raised.

    Args:
        drive_url: Google Drive folder or file URL.
        target_folder: Directory to populate; created if it does not exist.

    Returns:
        None.
    """
    print(f"Processing: {drive_url}")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(target_folder, exist_ok=True)

    # "drive/folders" links also contain "folder", so one substring test
    # covers every URL the original two-clause check accepted.
    if "folder" in drive_url:
        print("Downloading individual files directly...")
        downloaded = gdown.download_folder(
            drive_url, output=target_folder, quiet=False, use_cookies=False
        )
        # download_folder returns None on failure; do not claim success then.
        if downloaded is None:
            print("[ERROR] Folder download failed.")
            return
        print(f"\n[SUCCESS] Folder contents downloaded to: {target_folder}")
        return

    print("\n[INFO] Detected a Drive FILE link.")
    zip_path = os.path.join(target_folder, "temp_dataset.zip")
    output = gdown.download(drive_url, zip_path, quiet=False, fuzzy=True)

    if not output:
        print("[ERROR] Download failed.")
        return

    print(f"\nUnzipping {output}...")
    try:
        with zipfile.ZipFile(output, 'r') as zip_ref:
            zip_ref.extractall(target_folder)
        print(f"[SUCCESS] Extracted to: {target_folder}")
    except zipfile.BadZipFile:
        print("[ERROR] The downloaded file was not a valid zip file.")
        print("Check if the file on Drive is actually a .zip archive.")
    finally:
        # Remove the temporary archive on every path, including a failed
        # extraction, so no stray temp_dataset.zip is left in target_folder.
        if os.path.exists(output):
            os.remove(output)

In [7]:
# Fetch the Drive folder at DRIVE_URL into RAW_DATA_PATH (./raw).
download_dataset(DRIVE_URL, RAW_DATA_PATH)

Processing: https://drive.google.com/drive/folders/1ARBY9cIGj_jigi5Y88CtUy-GMj2clrXj
Downloading individual files directly...


Retrieving folder contents


Retrieving folder 10Y_7o78v8HhztcE7bzgV3M94N0vXjt_U bbc
Processing file 1lgysq7G_lc_zc71dGGqWoHejDCixcOVr docword.bbc.txt.gz
Processing file 1kO9AOWWuACtNsgmA9yyp9C6pSoB8KW3U vocab.bbc.txt
Retrieving folder 1D9mMF6ealOinLAsmwXsZs9IMNPBqzuM0 enron
Processing file 1JuUxpaQRAl1yZGqb3xcSP8nfK3_1NZ8y docword.enron.txt.gz
Processing file 16Rn70xrTnYOIkVm2mz4ICR2bEbyzyvGQ vocab.enron.txt
Retrieving folder 1YMxNXk2-7Ok1_3gnIguPwWt365C5X7Et kos
Processing file 1c1bJ-eX5Rp729zGSeGGXqwyor4DfWsrx docword.kos.txt.gz
Processing file 1YL0wnFKLJz-h6emVWHYDfBET5cAZhdKi vocab.kos.txt
Retrieving folder 17JU-ouMBLAUilZiaxE1xzeKfpFvDy9PA nytimes
Processing file 1gsmnfyNEAtA_3kdU5GMhwnUX-vOlc_9A docword.nytimes.txt.gz
Processing file 1jAnAFekn8u-e_FO1tsElhr-gP_dvSXkx vocab.nytimes.txt


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1lgysq7G_lc_zc71dGGqWoHejDCixcOVr
To: /kaggle/working/binsketch-algorithm/raw/bbc/docword.bbc.txt.gz
100%|██████████| 490k/490k [00:00<00:00, 96.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1kO9AOWWuACtNsgmA9yyp9C6pSoB8KW3U
To: /kaggle/working/binsketch-algorithm/raw/bbc/vocab.bbc.txt
100%|██████████| 77.2k/77.2k [00:00<00:00, 64.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JuUxpaQRAl1yZGqb3xcSP8nfK3_1NZ8y
To: /kaggle/working/binsketch-algorithm/raw/enron/docword.enron.txt.gz
100%|██████████| 12.3M/12.3M [00:00<00:00, 45.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=16Rn70xrTnYOIkVm2mz4ICR2bEbyzyvGQ
To: /kaggle/working/binsketch-algorithm/raw/enron/vocab.enron.txt
100%|██████████| 236k/236k [00:00<00:00, 79.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1c1bJ-eX5Rp729zGSeGGXqwyo


[SUCCESS] Folder contents downloaded to: ./raw



Download completed


In [8]:
# Convert the downloaded raw files. The log shows it reading
# raw/<dataset>/vocab.<dataset>.txt and raw/<dataset>/docword.<dataset>.txt.gz.
# Presumably it writes the ./data/<dataset>_binary.npy files that the
# experiment cells load; confirm in convert.py.
!python convert.py


Processing BBC dataset
Loading vocabulary from raw/bbc/vocab.bbc.txt...
Loading document-word data from raw/bbc/docword.bbc.txt.gz...
  Documents: 2225, Words: 9635, Non-zeros: 158079
Creating sparse matrix...
Reading sparse data: 100%|████| 158079/158079 [00:00<00:00, 336348.69 entries/s]
Converting to DataFrame in batches of 500 documents...
Processing batches: 100%|█████████████████████| 5/5 [00:00<00:00, 90.74 batch/s]
Concatenating batches...
DataFrame shape: (2225, 9635)
Sparsity: 99.26%

BBC DataFrame Preview:
             ad  sale  boost  time  ...  quarterli  media  giant  jump
document_id                         ...                               
0             1     1      1     1  ...          0      1      1     0
1             0     0      1     1  ...          0      0      0     0
2             0     1      0     0  ...          0      0      1     0
3             0     0      0     0  ...          0      0      0     0
4             0     0      0  

# Experiment

## Experiment 1: Accuracy of Estimation

### Experiments on NYTimes to calculate $MSE$ using Inner Product

In [9]:
# Experiment parameters: dataset, similarity measure and error metric.
DATASET = "nytimes"
SIMILARITY_SCORE = "inner_product"
METRIC = "mse"
# Path of the dataset's *_binary.npy matrix file.
data_path = "./data/" + DATASET + "_binary.npy"
# Threshold grid, flattened to one space-separated string for the --threshold flag.
THRESHOLD = [120, 150, 180, 200, 220, 250, 270, 300]
threshold_str = " ".join(map(str, THRESHOLD))

In [10]:
# Compute the ground-truth similarity of every document pair with the selected
# measure. Per the log it writes ground_truth_<dataset>_binary_<similarity>.json
# to the working directory, which the main.py cell reads.
!python save_ground_truth.py --data_path {data_path} \
                             --similarity_score {SIMILARITY_SCORE} \
                             --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/nytimes_binary.npy...
  Matrix Shape: (5000, 102660)
Calculating Ground Truth (inner_product) for 12497500 pairs...
  Using GPU-accelerated batch computation...
Computing ground truth: 100%|████████████| 1250/1250 [01:55<00:00, 10.79batch/s]

Saving ground truth to ground_truth_nytimes_binary_inner_product.json...
✓ Saved 12497500 ground truth values
  Min similarity: 0.000000
  Max similarity: 775.000000
  Mean similarity: 12.607236


In [11]:


# Evaluate BinSketch and BCS against the saved ground truth, once per threshold,
# with a fixed seed (42) for repeatability.
# NOTE(review): the log's "Valid Pairs" count presumably means pairs whose
# ground-truth similarity passes the threshold; confirm in main.py.
!python main.py --algo BinSketch BCS \
                --data_path {data_path} \
                --ground_truth_path ground_truth_{DATASET}_binary_{SIMILARITY_SCORE}.json \
                --seed 42 \
                --threshold {threshold_str} \
                --similarity_score {SIMILARITY_SCORE} \
                --eval_metric {METRIC} \
                --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/nytimes_binary.npy...
  Matrix Shape: (5000, 102660)
Loading ground truth from ground_truth_nytimes_binary_inner_product.json...
✓ Loaded 12497500 ground truth values
  Dataset: ./data/nytimes_binary.npy
  Similarity: inner_product
  Shape: (5000, 102660)
Loaded 12497500 ground truth pairs

Processing Threshold 120.0
  Valid Pairs: 1734
  > Running BinSketch...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  > Running BCS...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] C

### Experiments on ENRON to calculate $-\log(MSE)$ using Cosine Similarity

In [12]:
# Experiment parameters: dataset, similarity measure and error metric.
DATASET = "enron"
SIMILARITY_SCORE = "cosine_similarity"
METRIC = "minus_log_mse"
# Path of the dataset's *_binary.npy matrix file.
data_path = "./data/" + DATASET + "_binary.npy"
# Threshold grid, flattened to one space-separated string for the --threshold flag.
# NOTE(review): 0.6 is absent from the grid; confirm this is intentional.
THRESHOLD = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9]
threshold_str = " ".join(map(str, THRESHOLD))

In [13]:
# Compute the ground-truth similarity of every document pair with the selected
# measure. Per the log it writes ground_truth_<dataset>_binary_<similarity>.json
# to the working directory, which the main.py cell reads.
!python save_ground_truth.py --data_path {data_path} \
                             --similarity_score {SIMILARITY_SCORE} \
                             --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/enron_binary.npy...
  Matrix Shape: (5000, 28102)
Calculating Ground Truth (cosine_similarity) for 12497500 pairs...
  Using GPU-accelerated batch computation...
Computing ground truth: 100%|████████████| 1250/1250 [00:54<00:00, 22.92batch/s]

Saving ground truth to ground_truth_enron_binary_cosine_similarity.json...
✓ Saved 12497500 ground truth values
  Min similarity: 0.000000
  Max similarity: 1.000000
  Mean similarity: 0.026620


In [14]:
# Evaluate BinSketch, SimHash and MinHash against the saved ground truth, once
# per threshold, with a fixed seed (42) for repeatability.
# NOTE(review): the log's "Valid Pairs" count presumably means pairs whose
# ground-truth similarity passes the threshold; confirm in main.py.
!python main.py --algo BinSketch SimHash MinHash \
                --data_path {data_path} \
                --ground_truth_path ground_truth_{DATASET}_binary_{SIMILARITY_SCORE}.json \
                --seed 42 \
                --threshold {threshold_str} \
                --similarity_score {SIMILARITY_SCORE} \
                --eval_metric {METRIC} \
                --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/enron_binary.npy...
  Matrix Shape: (5000, 28102)
Loading ground truth from ground_truth_enron_binary_cosine_similarity.json...
✓ Loaded 12497500 ground truth values
  Dataset: ./data/enron_binary.npy
  Similarity: cosine_similarity
  Shape: (5000, 28102)
Loaded 12497500 ground truth pairs

Processing Threshold 0.1
  Valid Pairs: 331970
  > Running BinSketch...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  > Running SimHash...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GP

### Experiments on NYTimes to calculate $-\log(MSE)$ using Jaccard Similarity

In [15]:
# Experiment parameters: dataset, similarity measure and error metric.
DATASET = "nytimes"
SIMILARITY_SCORE = "jaccard_similarity"
METRIC = "minus_log_mse"
# Path of the dataset's *_binary.npy matrix file.
data_path = "./data/" + DATASET + "_binary.npy"
# Threshold grid, flattened to one space-separated string for the --threshold flag.
# NOTE(review): 0.6 is absent from the grid; confirm this is intentional.
THRESHOLD = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9]
threshold_str = " ".join(map(str, THRESHOLD))

In [16]:
# Compute the ground-truth similarity of every document pair with the selected
# measure. Per the log it writes ground_truth_<dataset>_binary_<similarity>.json
# to the working directory, which the main.py cell reads.
!python save_ground_truth.py --data_path {data_path} \
                             --similarity_score {SIMILARITY_SCORE} \
                             --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/nytimes_binary.npy...
  Matrix Shape: (5000, 102660)
Calculating Ground Truth (jaccard_similarity) for 12497500 pairs...
  Using GPU-accelerated batch computation...
Computing ground truth: 100%|████████████| 1250/1250 [02:39<00:00,  7.83batch/s]

Saving ground truth to ground_truth_nytimes_binary_jaccard_similarity.json...
✓ Saved 12497500 ground truth values
  Min similarity: 0.000000
  Max similarity: 1.000000
  Mean similarity: 0.027982


In [17]:
# Evaluate BinSketch, BCS and MinHash against the saved ground truth, once per
# threshold, with a fixed seed (42) for repeatability.
# NOTE(review): the log's "Valid Pairs" count presumably means pairs whose
# ground-truth similarity passes the threshold; confirm in main.py.
!python main.py --algo BinSketch BCS MinHash \
                --data_path {data_path} \
                --ground_truth_path ground_truth_{DATASET}_binary_{SIMILARITY_SCORE}.json \
                --seed 42 \
                --threshold {threshold_str} \
                --similarity_score {SIMILARITY_SCORE} \
                --eval_metric {METRIC} \
                --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/nytimes_binary.npy...
  Matrix Shape: (5000, 102660)
Loading ground truth from ground_truth_nytimes_binary_jaccard_similarity.json...
✓ Loaded 12497500 ground truth values
  Dataset: ./data/nytimes_binary.npy
  Similarity: jaccard_similarity
  Shape: (5000, 102660)
Loaded 12497500 ground truth pairs

Processing Threshold 0.1
  Valid Pairs: 30908
  > Running BinSketch...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  > Running BCS...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32


### Experiments on BBC to calculate $-\log(MSE)$ using Jaccard Similarity

In [18]:
# Experiment parameters: dataset, similarity measure and error metric.
DATASET = "bbc"
SIMILARITY_SCORE = "jaccard_similarity"
METRIC = "minus_log_mse"
# Path of the dataset's *_binary.npy matrix file.
data_path = "./data/" + DATASET + "_binary.npy"
# Threshold grid, flattened to one space-separated string for the --threshold flag.
# NOTE(review): 0.6 is absent from the grid; confirm this is intentional.
THRESHOLD = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9]
threshold_str = " ".join(map(str, THRESHOLD))

In [19]:
# Compute the ground-truth similarity of every document pair with the selected
# measure. Per the log it writes ground_truth_<dataset>_binary_<similarity>.json
# to the working directory, which the main.py cell reads.
!python save_ground_truth.py --data_path {data_path} \
                             --similarity_score {SIMILARITY_SCORE} \
                             --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/bbc_binary.npy...
  Matrix Shape: (2225, 9635)
Calculating Ground Truth (jaccard_similarity) for 2474200 pairs...
  Using GPU-accelerated batch computation...
Computing ground truth: 100%|██████████████| 248/248 [00:03<00:00, 70.58batch/s]

Saving ground truth to ground_truth_bbc_binary_jaccard_similarity.json...
✓ Saved 2474200 ground truth values
  Min similarity: 0.000000
  Max similarity: 1.000000
  Mean similarity: 0.065442


In [20]:
# Evaluate BinSketch, BCS and MinHash against the saved ground truth, once per
# threshold, with a fixed seed (42) for repeatability.
# NOTE(review): the log's "Valid Pairs" count presumably means pairs whose
# ground-truth similarity passes the threshold; confirm in main.py.
!python main.py --algo BinSketch BCS MinHash \
                --data_path {data_path} \
                --ground_truth_path ground_truth_{DATASET}_binary_{SIMILARITY_SCORE}.json \
                --seed 42 \
                --threshold {threshold_str} \
                --similarity_score {SIMILARITY_SCORE} \
                --eval_metric {METRIC} \
                --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/bbc_binary.npy...
  Matrix Shape: (2225, 9635)
Loading ground truth from ground_truth_bbc_binary_jaccard_similarity.json...
✓ Loaded 2474200 ground truth values
  Dataset: ./data/bbc_binary.npy
  Similarity: jaccard_similarity
  Shape: (2225, 9635)
Loaded 2474200 ground truth pairs

Processing Threshold 0.1
  Valid Pairs: 258072
  > Running BinSketch...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  > Running BCS...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Convertin

## Experiment 2: Ranking