# Set up

In [1]:
# Read the GitHub token from the Kaggle secrets client so it is not
# hard-coded in the notebook. GITHUB_TOKEN is interpolated into the
# `git clone` URL in the next cell.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")

In [2]:
# Clone the experiment code from the branch named in BRANCH_NAME.
# NOTE(review): the token is embedded in the clone URL, so git will keep it in
# the clone's remote URL and may echo it in error messages — consider a
# credential helper, and clear cell output before sharing if the clone fails.
# NOTE(review): re-running this cell without a fresh working directory will
# fail because the target directory already exists.
BRANCH_NAME = 'feature/gpu-acceleration'
!git clone -b {BRANCH_NAME} https://{GITHUB_TOKEN}@github.com/tadtd/binsketch-algorithm

Cloning into 'binsketch-algorithm'...
remote: Enumerating objects: 295, done.[K
remote: Counting objects: 100% (295/295), done.[K
remote: Compressing objects: 100% (202/202), done.[K
remote: Total 295 (delta 177), reused 201 (delta 87), pack-reused 0 (from 0)[K
Receiving objects: 100% (295/295), 166.63 KiB | 5.95 MiB/s, done.
Resolving deltas: 100% (177/177), done.


In [3]:
# Work from the cloned repository root: every later cell uses paths relative
# to it (./raw, ./data, convert.py, save_ground_truth.py, main.py).
# NOTE(review): the path is relative, so this cell is only correct when run
# once from the original working directory.
%cd binsketch-algorithm

/kaggle/working/binsketch-algorithm


# Configuration

In [4]:
import os
import gdown
import zipfile

# Process data

In [5]:
# Google Drive folder that holds the raw datasets; passed to download_dataset().
DRIVE_URL = 'https://drive.google.com/drive/folders/1ARBY9cIGj_jigi5Y88CtUy-GMj2clrXj'
# Local directory the raw Drive contents are downloaded into.
RAW_DATA_PATH = './raw'
# NOTE(review): not referenced by any visible cell — the experiment cells
# hard-code './data/...' instead; presumably this is the same directory.
PROCESSED_DATA_PATH = './data'

In [6]:
def download_dataset(drive_url, target_folder):
    """Download a dataset from Google Drive into ``target_folder``.

    A URL containing "folder" is treated as a Drive folder and mirrored with
    ``gdown.download_folder``. Any other URL is treated as a single zip
    archive: it is downloaded to ``<target_folder>/temp_dataset.zip``,
    extracted into ``target_folder``, and the temporary archive is deleted
    afterwards whether or not extraction succeeded.

    Progress and failures are reported with ``print``. A download that gdown
    reports as failed (``None``/falsy return value) or an invalid archive
    prints an ``[ERROR]`` line instead of raising.

    Args:
        drive_url: Google Drive folder or file URL.
        target_folder: Directory to download into; created if missing.

    Returns:
        None in every case.
    """
    print(f"Processing: {drive_url}")

    # Create the folder if it doesn't exist
    os.makedirs(target_folder, exist_ok=True)

    # "drive/folders" links also contain "folder", so a single check covers both.
    if "folder" in drive_url:
        print("Downloading individual files directly...")
        downloaded = gdown.download_folder(drive_url, output=target_folder, quiet=False, use_cookies=False)
        # gdown signals a failed folder download by returning None; do not
        # report success in that case.
        if downloaded is None:
            print("[ERROR] Folder download failed.")
            return
        print(f"\n[SUCCESS] Folder contents downloaded to: {target_folder}")
        return

    print("\n[INFO] Detected a Drive FILE link.")
    zip_path = os.path.join(target_folder, "temp_dataset.zip")
    output = gdown.download(drive_url, zip_path, quiet=False, fuzzy=True)

    if not output:
        print("[ERROR] Download failed.")
        return

    print(f"\nUnzipping {output}...")
    try:
        with zipfile.ZipFile(output, 'r') as zip_ref:
            zip_ref.extractall(target_folder)
        print(f"[SUCCESS] Extracted to: {target_folder}")
    except zipfile.BadZipFile:
        print("[ERROR] The downloaded file was not a valid zip file.")
        print("Check if the file on Drive is actually a .zip archive.")
    finally:
        # Clean up the temporary archive even when extraction failed, so a
        # bad download is not left behind in the data folder.
        if os.path.exists(output):
            os.remove(output)

In [7]:
# Fetch the raw datasets from the shared Drive folder into RAW_DATA_PATH.
download_dataset(DRIVE_URL, RAW_DATA_PATH)

Processing: https://drive.google.com/drive/folders/1ARBY9cIGj_jigi5Y88CtUy-GMj2clrXj
Downloading individual files directly...


Retrieving folder contents


Retrieving folder 10Y_7o78v8HhztcE7bzgV3M94N0vXjt_U bbc
Processing file 1lgysq7G_lc_zc71dGGqWoHejDCixcOVr docword.bbc.txt.gz
Processing file 1kO9AOWWuACtNsgmA9yyp9C6pSoB8KW3U vocab.bbc.txt
Retrieving folder 1D9mMF6ealOinLAsmwXsZs9IMNPBqzuM0 enron
Processing file 1JuUxpaQRAl1yZGqb3xcSP8nfK3_1NZ8y docword.enron.txt.gz
Processing file 16Rn70xrTnYOIkVm2mz4ICR2bEbyzyvGQ vocab.enron.txt
Retrieving folder 1YMxNXk2-7Ok1_3gnIguPwWt365C5X7Et kos
Processing file 1c1bJ-eX5Rp729zGSeGGXqwyor4DfWsrx docword.kos.txt.gz
Processing file 1YL0wnFKLJz-h6emVWHYDfBET5cAZhdKi vocab.kos.txt
Retrieving folder 17JU-ouMBLAUilZiaxE1xzeKfpFvDy9PA nytimes
Processing file 1gsmnfyNEAtA_3kdU5GMhwnUX-vOlc_9A docword.nytimes.txt.gz
Processing file 1jAnAFekn8u-e_FO1tsElhr-gP_dvSXkx vocab.nytimes.txt


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1lgysq7G_lc_zc71dGGqWoHejDCixcOVr
To: /kaggle/working/binsketch-algorithm/raw/bbc/docword.bbc.txt.gz
100%|██████████| 490k/490k [00:00<00:00, 115MB/s]
Downloading...
From: https://drive.google.com/uc?id=1kO9AOWWuACtNsgmA9yyp9C6pSoB8KW3U
To: /kaggle/working/binsketch-algorithm/raw/bbc/vocab.bbc.txt
100%|██████████| 77.2k/77.2k [00:00<00:00, 53.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JuUxpaQRAl1yZGqb3xcSP8nfK3_1NZ8y
To: /kaggle/working/binsketch-algorithm/raw/enron/docword.enron.txt.gz
100%|██████████| 12.3M/12.3M [00:00<00:00, 13.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=16Rn70xrTnYOIkVm2mz4ICR2bEbyzyvGQ
To: /kaggle/working/binsketch-algorithm/raw/enron/vocab.enron.txt
100%|██████████| 236k/236k [00:00<00:00, 83.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1c1bJ-eX5Rp729zGSeGGXqwyor


[SUCCESS] Folder contents downloaded to: ./raw



Download completed


In [8]:
# Run the repository's conversion script over the raw downloads
# (raw/<dataset>/docword.*.txt.gz and vocab.*.txt).
# NOTE(review): the experiment cells load ./data/<dataset>_binary.npy —
# presumably written by this script; confirm.
!python convert.py


Processing BBC dataset
Loading vocabulary from raw/bbc/vocab.bbc.txt...
Loading document-word data from raw/bbc/docword.bbc.txt.gz...
  Documents: 2225, Words: 9635, Non-zeros: 158079
Creating sparse matrix...
Reading sparse data: 100%|████| 158079/158079 [00:00<00:00, 331618.66 entries/s]
Converting to DataFrame in batches of 500 documents...
Processing batches: 100%|█████████████████████| 5/5 [00:00<00:00, 89.16 batch/s]
Concatenating batches...
DataFrame shape: (2225, 9635)
Sparsity: 99.26%

BBC DataFrame Preview:
             ad  sale  boost  time  ...  quarterli  media  giant  jump
document_id                         ...                               
0             1     1      1     1  ...          0      1      1     0
1             0     0      1     1  ...          0      0      0     0
2             0     1      0     0  ...          0      0      1     0
3             0     0      0     0  ...          0      0      0     0
4             0     0      0  

# Experiment

## Experiment 1: Accuracy of Estimation

### Experiments on NYTimes to calculate $MSE$ using Inner Product

In [9]:
# Experiment 1 settings: NYTimes, inner product, scored with MSE.
DATASET = 'nytimes'
data_path = f'./data/{DATASET}_binary.npy'
SIMILARITY_SCORE = 'inner_product'
METRIC = 'mse'

# Thresholds, plus a space-separated rendering for the --threshold CLI flag.
THRESHOLD = [120, 150, 180, 200, 220, 250, 270, 300]
threshold_str = ' '.join(map(str, THRESHOLD))

In [10]:
# Precompute the exact pairwise similarities for DATASET on the GPU.
# NOTE(review): the recorded output shows this writes
# ground_truth_<dataset>_<score>.json, while the following main.py cell asks
# for ground_truth_exp1_<dataset>_<score>.json and recomputes the ground truth;
# the file written here looks unused — confirm the expected output name/location.
!python save_ground_truth.py --experiment 1 \
                             --data_path {data_path} \
                             --similarity_score {SIMILARITY_SCORE} \
                             --seed 42 \
                             --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/nytimes_binary.npy...
  Matrix Shape: (5000, 102660)
  Data type: int8
  Value range: [0, 1]
Transferring data to GPU...
✓ Data transferred to GPU (using cupy)
Calculating Ground Truth (inner_product) for 12497500 pairs...
  Using GPU-accelerated batch computation...
Computing ground truth: 100%|████████████████| 10/10 [00:00<00:00, 23.48batch/s]
Extracting pair similarities...

Saving ground truth to ground_truth_nytimes_inner_product.json...
✓ Saved 12497500 ground truth values
  Min similarity: 0.000000
  Max similarity: 775.000000
  Mean similarity: 12.607225


In [11]:
# Run Experiment 1 (estimation accuracy) for BinSketch and BCS, scoring with
# METRIC at each value in THRESHOLD.
# NOTE(review): the recorded output shows main.py recalculating the ground
# truth ("this may take a while") instead of loading a cached file, so the
# --ground_truth_path name apparently does not match what
# save_ground_truth.py wrote — confirm.
!python main.py --experiment 1 \
                --algo BinSketch BCS \
                --data_path {data_path} \
                --ground_truth_path ground_truth_exp1_{DATASET}_{SIMILARITY_SCORE}.json \
                --seed 42 \
                --threshold {threshold_str} \
                --similarity_score {SIMILARITY_SCORE} \
                --eval_metric {METRIC} \
                --use_gpu

✓ GPU acceleration enabled
Running Experiment 1: Accuracy of Similarity Estimation
Loading matrix from ./data/nytimes_binary.npy...
  Matrix Shape: (5000, 102660)
Calculating ground truth (this may take a while)...
Calculating Ground Truth (inner_product) for 12497500 pairs...
Computing ground truth: 100%|████████████████| 10/10 [00:27<00:00,  2.72s/batch]
Extracting pair similarities...
Saving ground truth to experiment/ground_truth/ground_truth_exp1_nytimes_inner_product.json...

Saving ground truth to experiment/ground_truth/ground_truth_exp1_nytimes_inner_product.json...
✓ Saved 12497500 ground truth values
  Min similarity: 0.000000
  Max similarity: 775.000000
  Mean similarity: 12.607225
Saved 12497500 ground truth pairs

Processing Threshold 120.0
  Valid Pairs: 1734
  > Running BinSketch...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 t

### Experiments on ENRON to calculate $-\log(MSE)$ using Cosine Similarity

In [12]:
# Experiment 1 settings: ENRON, cosine similarity, scored with -log(MSE).
DATASET = 'enron'
data_path = f'./data/{DATASET}_binary.npy'
SIMILARITY_SCORE = 'cosine_similarity'
METRIC = 'minus_log_mse'

# Thresholds, plus a space-separated rendering for the --threshold CLI flag.
THRESHOLD = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9]
threshold_str = ' '.join(map(str, THRESHOLD))

In [13]:
# Precompute the exact pairwise similarities for DATASET on the GPU.
# NOTE(review): the recorded output shows this writes
# ground_truth_<dataset>_<score>.json, while the following main.py cell asks
# for ground_truth_exp1_<dataset>_<score>.json and recomputes the ground truth;
# the file written here looks unused — confirm the expected output name/location.
!python save_ground_truth.py --experiment 1 \
                             --data_path {data_path} \
                             --similarity_score {SIMILARITY_SCORE} \
                             --seed 42 \
                             --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/enron_binary.npy...
  Matrix Shape: (5000, 28102)
  Data type: int8
  Value range: [0, 1]
Transferring data to GPU...
✓ Data transferred to GPU (using cupy)
Calculating Ground Truth (cosine_similarity) for 12497500 pairs...
  Using GPU-accelerated batch computation...
Computing ground truth: 100%|████████████████| 10/10 [00:00<00:00, 17.72batch/s]
Extracting pair similarities...

Saving ground truth to ground_truth_enron_cosine_similarity.json...
✓ Saved 12497500 ground truth values
  Min similarity: 0.000000
  Max similarity: 1.000005
  Mean similarity: 0.026620


In [14]:
# Run Experiment 1 (estimation accuracy) for BinSketch, SimHash and MinHash,
# scoring with METRIC at each value in THRESHOLD.
# NOTE(review): the recorded output shows main.py recalculating the ground
# truth instead of loading a cached file, so the --ground_truth_path name
# apparently does not match what save_ground_truth.py wrote — confirm.
!python main.py --experiment 1 \
                --algo BinSketch SimHash MinHash \
                --data_path {data_path} \
                --ground_truth_path ground_truth_exp1_{DATASET}_{SIMILARITY_SCORE}.json \
                --seed 42 \
                --threshold {threshold_str} \
                --similarity_score {SIMILARITY_SCORE} \
                --eval_metric {METRIC} \
                --use_gpu

✓ GPU acceleration enabled
Running Experiment 1: Accuracy of Similarity Estimation
Loading matrix from ./data/enron_binary.npy...
  Matrix Shape: (5000, 28102)
Calculating ground truth (this may take a while)...
Calculating Ground Truth (cosine_similarity) for 12497500 pairs...
Computing ground truth: 100%|████████████████| 10/10 [00:12<00:00,  1.21s/batch]
Extracting pair similarities...
Saving ground truth to experiment/ground_truth/ground_truth_exp1_enron_cosine_similarity.json...

Saving ground truth to experiment/ground_truth/ground_truth_exp1_enron_cosine_similarity.json...
✓ Saved 12497500 ground truth values
  Min similarity: 0.000000
  Max similarity: 1.000000
  Mean similarity: 0.026620
Saved 12497500 ground truth pairs

Processing Threshold 0.1
  Valid Pairs: 332313
  > Running BinSketch...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8

### Experiments on NYTimes to calculate $-\log(MSE)$ using Jaccard Similarity

In [15]:
# Experiment 1 settings: NYTimes, Jaccard similarity, scored with -log(MSE).
DATASET = 'nytimes'
data_path = f'./data/{DATASET}_binary.npy'
SIMILARITY_SCORE = 'jaccard_similarity'
METRIC = 'minus_log_mse'

# Thresholds, plus a space-separated rendering for the --threshold CLI flag.
THRESHOLD = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9]
threshold_str = ' '.join(map(str, THRESHOLD))

In [16]:
# Precompute the exact pairwise similarities for DATASET on the GPU.
# NOTE(review): the recorded output shows this writes
# ground_truth_<dataset>_<score>.json, while the following main.py cell asks
# for ground_truth_exp1_<dataset>_<score>.json and recomputes the ground truth;
# the file written here looks unused — confirm the expected output name/location.
!python save_ground_truth.py --experiment 1 \
                             --data_path {data_path} \
                             --similarity_score {SIMILARITY_SCORE} \
                             --seed 42 \
                             --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/nytimes_binary.npy...
  Matrix Shape: (5000, 102660)
  Data type: int8
  Value range: [0, 1]
Transferring data to GPU...
✓ Data transferred to GPU (using cupy)
Calculating Ground Truth (jaccard_similarity) for 12497500 pairs...
  Using GPU-accelerated batch computation...
Computing ground truth: 100%|████████████████| 10/10 [00:00<00:00, 19.37batch/s]
Extracting pair similarities...

Saving ground truth to ground_truth_nytimes_jaccard_similarity.json...
✓ Saved 12497500 ground truth values
  Min similarity: 0.000000
  Max similarity: 1.000000
  Mean similarity: 0.027982


In [17]:
# Run Experiment 1 (estimation accuracy) for BinSketch, BCS and MinHash,
# scoring with METRIC at each value in THRESHOLD.
# NOTE(review): the recorded output shows main.py recalculating the ground
# truth instead of loading a cached file, so the --ground_truth_path name
# apparently does not match what save_ground_truth.py wrote — confirm.
!python main.py --experiment 1 \
                --algo BinSketch BCS MinHash \
                --data_path {data_path} \
                --ground_truth_path ground_truth_exp1_{DATASET}_{SIMILARITY_SCORE}.json \
                --seed 42 \
                --threshold {threshold_str} \
                --similarity_score {SIMILARITY_SCORE} \
                --eval_metric {METRIC} \
                --use_gpu

✓ GPU acceleration enabled
Running Experiment 1: Accuracy of Similarity Estimation
Loading matrix from ./data/nytimes_binary.npy...
  Matrix Shape: (5000, 102660)
Calculating ground truth (this may take a while)...
Calculating Ground Truth (jaccard_similarity) for 12497500 pairs...
Computing ground truth: 100%|████████████████| 10/10 [00:30<00:00,  3.00s/batch]
Extracting pair similarities...
Saving ground truth to experiment/ground_truth/ground_truth_exp1_nytimes_jaccard_similarity.json...

Saving ground truth to experiment/ground_truth/ground_truth_exp1_nytimes_jaccard_similarity.json...
✓ Saved 12497500 ground truth values
  Min similarity: 0.000000
  Max similarity: 1.000000
  Mean similarity: 0.027982
Saved 12497500 ground truth pairs

Processing Threshold 0.1
  Valid Pairs: 30908
  > Running BinSketch...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data 

### Experiments on BBC to calculate $-\log(MSE)$ using Jaccard Similarity

In [18]:
# Experiment 1 settings: BBC, Jaccard similarity, scored with -log(MSE).
DATASET = 'bbc'
data_path = f'./data/{DATASET}_binary.npy'
SIMILARITY_SCORE = 'jaccard_similarity'
METRIC = 'minus_log_mse'

# Thresholds, plus a space-separated rendering for the --threshold CLI flag.
THRESHOLD = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9]
threshold_str = ' '.join(map(str, THRESHOLD))

In [19]:
# Precompute the exact pairwise similarities for DATASET on the GPU.
# NOTE(review): the recorded output shows this writes
# ground_truth_<dataset>_<score>.json, while the following main.py cell asks
# for ground_truth_exp1_<dataset>_<score>.json and recomputes the ground truth;
# the file written here looks unused — confirm the expected output name/location.
!python save_ground_truth.py --experiment 1 \
                             --data_path {data_path} \
                             --similarity_score {SIMILARITY_SCORE} \
                             --seed 42 \
                             --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/bbc_binary.npy...
  Matrix Shape: (2225, 9635)
  Data type: int8
  Value range: [0, 1]
Transferring data to GPU...
✓ Data transferred to GPU (using cupy)
Calculating Ground Truth (jaccard_similarity) for 2474200 pairs...
  Using GPU-accelerated batch computation...
Computing ground truth: 100%|██████████████████| 5/5 [00:00<00:00, 40.40batch/s]
Extracting pair similarities...

Saving ground truth to ground_truth_bbc_jaccard_similarity.json...
✓ Saved 2474200 ground truth values
  Min similarity: 0.000000
  Max similarity: 1.000000
  Mean similarity: 0.065442


In [20]:
# Run Experiment 1 (estimation accuracy) for BinSketch, BCS and MinHash,
# scoring with METRIC at each value in THRESHOLD.
# NOTE(review): the recorded output shows main.py recalculating the ground
# truth instead of loading a cached file, so the --ground_truth_path name
# apparently does not match what save_ground_truth.py wrote — confirm.
!python main.py --experiment 1 \
                --algo BinSketch BCS MinHash \
                --data_path {data_path} \
                --ground_truth_path ground_truth_exp1_{DATASET}_{SIMILARITY_SCORE}.json \
                --seed 42 \
                --threshold {threshold_str} \
                --similarity_score {SIMILARITY_SCORE} \
                --eval_metric {METRIC} \
                --use_gpu

✓ GPU acceleration enabled
Running Experiment 1: Accuracy of Similarity Estimation
Loading matrix from ./data/bbc_binary.npy...
  Matrix Shape: (2225, 9635)
Calculating ground truth (this may take a while)...
Calculating Ground Truth (jaccard_similarity) for 2474200 pairs...
Computing ground truth: 100%|██████████████████| 5/5 [00:00<00:00,  7.84batch/s]
Extracting pair similarities...
Saving ground truth to experiment/ground_truth/ground_truth_exp1_bbc_jaccard_similarity.json...

Saving ground truth to experiment/ground_truth/ground_truth_exp1_bbc_jaccard_similarity.json...
✓ Saved 2474200 ground truth values
  Min similarity: 0.000000
  Max similarity: 1.000000
  Mean similarity: 0.065442
Saved 2474200 ground truth pairs

Processing Threshold 0.1
  Valid Pairs: 258072
  > Running BinSketch...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to flo

## Experiment 2: Ranking

### Experiments on ENRON to calculate Accuracy using Jaccard Similarity

In [21]:
# Experiment 2 settings: ENRON, Jaccard similarity, retrieval accuracy.
DATASET = 'enron'
data_path = f'./data/{DATASET}_binary.npy'
SIMILARITY_SCORE = 'jaccard_similarity'
RETRIEVAL_METRIC = 'accuracy'

# Thresholds, plus a space-separated rendering for the --threshold CLI flag.
THRESHOLD = [0.1, 0.2, 0.4, 0.5, 0.6, 0.7, 0.85, 0.95]
threshold_str = ' '.join(map(str, THRESHOLD))

In [22]:
# Precompute the query-vs-training ground-truth similarity matrix on the GPU.
# Keep --train_ratio and --seed identical to the main.py cell that follows —
# presumably required so the cached matrix lines up with the same split.
!python save_ground_truth.py --experiment 2 \
                             --data_path {data_path} \
                             --similarity_score {SIMILARITY_SCORE} \
                             --train_ratio .9 \
                             --seed 42 \
                             --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/enron_binary.npy...
  Matrix Shape: (5000, 28102)
  Data type: int8
  Value range: [0, 1]
Dataset split: 4500 training, 500 query
Transferring data to GPU...
✓ Data transferred to GPU (using cupy)
Calculating Experiment 2 Ground Truth (jaccard_similarity)...
  Training samples: 4500
  Query samples: 500
  Using GPU-accelerated batch computation...
Computing similarities: 100%|██████████████████| 5/5 [00:00<00:00, 40.39batch/s]

  Min similarity: 0.000000
  Max similarity: 1.000000
  Mean similarity: 0.011788

Saving ground truth to experiment/ground_truth/ground_truth_exp2_enron_jaccard_similarity.json...
✓ Saved similarity matrix (500, 4500)


In [23]:
# Run Experiment 2 (retrieval) for BinSketch, BCS and MinHash, reporting
# RETRIEVAL_METRIC at each value in THRESHOLD. The recorded output shows the
# ground truth being loaded from the cache written by the previous cell.
!python main.py --experiment 2 \
                --algo BinSketch BCS MinHash \
                --data_path {data_path} \
                --ground_truth_path ground_truth_exp2_{DATASET}_{SIMILARITY_SCORE}.json \
                --train_ratio .9 \
                --seed 42 \
                --threshold {threshold_str} \
                --similarity_score {SIMILARITY_SCORE} \
                --retrieval_metric {RETRIEVAL_METRIC} \
                --use_gpu

✓ GPU acceleration enabled
Running Experiment 2: Retrieval Performance Evaluation
GPU acceleration enabled
Loading matrix from ./data/enron_binary.npy...
  Matrix Shape: (5000, 28102)
Dataset split: 4500 training, 500 query samples

Threshold: 0.1
Loading ground truth from experiment/ground_truth/ground_truth_exp2_enron_jaccard_similarity.json...
  Loaded similarity matrix (500, 4500)
Using cached ground truth similarity matrix

BinSketch:
  Compression length k=100
  Compressing data...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  Retrieving neighbors using GPU...
    Accuracy: 0.0056 ± 0.0120
  Compression length k=500
  Compressing data...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  Retrieving neighbors using GPU...
    Accuracy: 0.0988 ± 0.1690
  Compression length k=1000
  Compressing data...
  [GPU] Conve

### Experiments on NYTimes to calculate F1 Score using Jaccard Similarity

In [24]:
# Experiment 2 settings: NYTimes, Jaccard similarity, retrieval F1.
DATASET = 'nytimes'
data_path = f'./data/{DATASET}_binary.npy'
SIMILARITY_SCORE = 'jaccard_similarity'
RETRIEVAL_METRIC = 'f1'

# Thresholds, plus a space-separated rendering for the --threshold CLI flag.
THRESHOLD = [0.1, 0.2, 0.4, 0.5, 0.6, 0.7, 0.85, 0.95]
threshold_str = ' '.join(map(str, THRESHOLD))

In [25]:
# Precompute the query-vs-training ground-truth similarity matrix on the GPU.
# Keep --train_ratio and --seed identical to the main.py cell that follows —
# presumably required so the cached matrix lines up with the same split.
!python save_ground_truth.py --experiment 2 \
                             --data_path {data_path} \
                             --similarity_score {SIMILARITY_SCORE} \
                             --train_ratio .9 \
                             --seed 42 \
                             --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/nytimes_binary.npy...
  Matrix Shape: (5000, 102660)
  Data type: int8
  Value range: [0, 1]
Dataset split: 4500 training, 500 query
Transferring data to GPU...
✓ Data transferred to GPU (using cupy)
Calculating Experiment 2 Ground Truth (jaccard_similarity)...
  Training samples: 4500
  Query samples: 500
  Using GPU-accelerated batch computation...
Computing similarities: 100%|██████████████████| 5/5 [00:00<00:00, 36.18batch/s]

  Min similarity: 0.000000
  Max similarity: 1.000000
  Mean similarity: 0.027929

Saving ground truth to experiment/ground_truth/ground_truth_exp2_nytimes_jaccard_similarity.json...
✓ Saved similarity matrix (500, 4500)


In [26]:
# Run Experiment 2 (retrieval) for BinSketch, BCS and MinHash, reporting
# RETRIEVAL_METRIC at each value in THRESHOLD. The recorded output shows the
# ground truth being loaded from the cache written by the previous cell.
!python main.py --experiment 2 \
                --algo BinSketch BCS MinHash \
                --data_path {data_path} \
                --ground_truth_path ground_truth_exp2_{DATASET}_{SIMILARITY_SCORE}.json \
                --train_ratio .9 \
                --seed 42 \
                --threshold {threshold_str} \
                --similarity_score {SIMILARITY_SCORE} \
                --retrieval_metric {RETRIEVAL_METRIC} \
                --use_gpu

✓ GPU acceleration enabled
Running Experiment 2: Retrieval Performance Evaluation
GPU acceleration enabled
Loading matrix from ./data/nytimes_binary.npy...
  Matrix Shape: (5000, 102660)
Dataset split: 4500 training, 500 query samples

Threshold: 0.1
Loading ground truth from experiment/ground_truth/ground_truth_exp2_nytimes_jaccard_similarity.json...
  Loaded similarity matrix (500, 4500)
Using cached ground truth similarity matrix

BinSketch:
  Compression length k=100
  Compressing data...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  Retrieving neighbors using GPU...
    F1: 0.0053 ± 0.0110
  Compression length k=500
  Compressing data...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  Retrieving neighbors using GPU...
    F1: 0.0124 ± 0.0797
  Compression length k=1000
  Compressing data...
  [GPU] Converting s

### Experiments on KOS to calculate F1 Score using Cosine Similarity

In [27]:
# Experiment 2 settings: KOS, cosine similarity, retrieval F1.
DATASET = 'kos'
data_path = f'./data/{DATASET}_binary.npy'
SIMILARITY_SCORE = 'cosine_similarity'
RETRIEVAL_METRIC = 'f1'

# Thresholds, plus a space-separated rendering for the --threshold CLI flag.
THRESHOLD = [0.1, 0.2, 0.4, 0.5, 0.6, 0.7, 0.85, 0.95]
threshold_str = ' '.join(map(str, THRESHOLD))

In [28]:
# Precompute the query-vs-training ground-truth similarity matrix on the GPU.
# Keep --train_ratio and --seed identical to the main.py cell that follows —
# presumably required so the cached matrix lines up with the same split.
!python save_ground_truth.py --experiment 2 \
                             --data_path {data_path} \
                             --similarity_score {SIMILARITY_SCORE} \
                             --train_ratio .9 \
                             --seed 42 \
                             --use_gpu

✓ GPU acceleration enabled
Loading matrix from ./data/kos_binary.npy...
  Matrix Shape: (3430, 6906)
  Data type: int8
  Value range: [0, 1]
Dataset split: 3087 training, 343 query
Transferring data to GPU...
✓ Data transferred to GPU (using cupy)
Calculating Experiment 2 Ground Truth (cosine_similarity)...
  Training samples: 3087
  Query samples: 343
  Using GPU-accelerated batch computation...
Computing similarities: 100%|██████████████████| 4/4 [00:00<00:00, 29.52batch/s]

  Min similarity: 0.000000
  Max similarity: 0.977639
  Mean similarity: 0.070053

Saving ground truth to experiment/ground_truth/ground_truth_exp2_kos_cosine_similarity.json...
✓ Saved similarity matrix (343, 3087)


In [29]:
# Run Experiment 2 (retrieval) for BinSketch, BCS and MinHash, reporting
# RETRIEVAL_METRIC at each value in THRESHOLD. The recorded output shows the
# ground truth being loaded from the cache written by the previous cell.
!python main.py --experiment 2 \
                --algo BinSketch BCS MinHash \
                --data_path {data_path} \
                --ground_truth_path ground_truth_exp2_{DATASET}_{SIMILARITY_SCORE}.json \
                --train_ratio .9 \
                --seed 42 \
                --threshold {threshold_str} \
                --similarity_score {SIMILARITY_SCORE} \
                --retrieval_metric {RETRIEVAL_METRIC} \
                --use_gpu

✓ GPU acceleration enabled
Running Experiment 2: Retrieval Performance Evaluation
GPU acceleration enabled
Loading matrix from ./data/kos_binary.npy...
  Matrix Shape: (3430, 6906)
Dataset split: 3087 training, 343 query samples

Threshold: 0.1
Loading ground truth from experiment/ground_truth/ground_truth_exp2_kos_cosine_similarity.json...
  Loaded similarity matrix (343, 3087)
Using cached ground truth similarity matrix

BinSketch:
  Compression length k=100
  Compressing data...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  Retrieving neighbors using GPU...
    F1: 0.2679 ± 0.1489
  Compression length k=500
  Compressing data...
  [GPU] Converting sparse matrix data from int8 to float32
  [GPU] Converting sparse matrix data from int8 to float32
  Retrieving neighbors using GPU...
    F1: 0.2846 ± 0.1482
  Compression length k=1000
  Compressing data...
  [GPU] Converting sparse matri