# 1. Get Similarity Matrix

Run the following cell to specify which data to use.

In [None]:
import os
import numpy as np

# Root folder of the data set. Inputs are read from "<folder_name>/Input/Judge_Folder"
# and "<folder_name>/Input/Venture_Folder"; results are written to "<folder_name>/Output".
folder_name = "example"
judge_folder = f"{folder_name}/Input/Judge_Folder"
venture_folder = f"{folder_name}/Input/Venture_Folder"

# Create the output folder; exist_ok=True makes re-running this cell harmless.
os.makedirs(f"{folder_name}/Output", exist_ok=True)

def print_max_pair(matrix: np.ndarray, judge_to_ind: dict, venture_to_ind: dict,
                   judge_dir=None, venture_dir=None):
    '''Print the judge-venture pair that has the highest similarity score.

    The score is printed first, followed by the file name, matrix index and
    full text of the best-matching judge and then of the best-matching venture.

    Args:
        matrix: 2-D similarity scores, rows = judges, columns = ventures.
            Any array-like is accepted (it is converted with np.asarray).
        judge_to_ind: maps a judge file stem to its row index in `matrix`.
        venture_to_ind: maps a venture file stem to its column index in `matrix`.
        judge_dir: folder holding the "<judge>.txt" files. Defaults to
            "<folder_name>/Input/Judge_Folder" using the notebook-level `folder_name`.
        venture_dir: folder holding the "<venture>.txt" files. Defaults to
            "<folder_name>/Input/Venture_Folder" using the notebook-level `folder_name`.

    Raises:
        ValueError: if `matrix` is empty (raised by np.argmax).
        KeyError: if the winning index is missing from a mapping.
        OSError: if one of the text files cannot be opened.
    '''
    matrix = np.asarray(matrix)
    # Locate the maximum once and read the score from that position.
    judge_i, venture_i = np.unravel_index(np.argmax(matrix), matrix.shape)
    max_sim_score = matrix[judge_i, venture_i]

    # Fall back to the notebook-level data folder only when no folder is given.
    if judge_dir is None:
        judge_dir = f'{folder_name}/Input/Judge_Folder'
    if venture_dir is None:
        venture_dir = f'{folder_name}/Input/Venture_Folder'

    # Invert the name -> index mappings so the winning indices can be named.
    ind_to_judge = {ind: judge for judge, ind in judge_to_ind.items()}
    ind_to_venture = {ind: venture for venture, ind in venture_to_ind.items()}

    print('Similarity score:', max_sim_score)
    to_show = (
        ('Judge', judge_dir, ind_to_judge[judge_i], judge_i),
        ('Venture', venture_dir, ind_to_venture[venture_i], venture_i),
    )
    for label, directory, name, ind in to_show:
        # Explicit UTF-8 keeps non-ASCII characters readable regardless of the
        # platform default; errors='replace' avoids crashing a display-only helper.
        with open(f'{directory}/{name}.txt', 'r', encoding='utf-8', errors='replace') as f:
            print(f'{label} {name}.txt, {ind}')
            print(f.read())

## Using TFIDF
### 1. Use augmented TFIDF

In [None]:
from similarity.tfidf_sim import get_aug_tfidf, tfidf_sim_aug
from preprocess_data import get_parsed_data

def get_tfidf_sim_aug(judge_folder, venture_folder, wiki_folder, stem, sanitize, lines, keep_ui):
    '''Build the judge-by-venture similarity matrix using augmented TF-IDF.

    Args:
        judge_folder: folder with the judge documents.
        venture_folder: folder with the venture documents.
        wiki_folder: passed to get_aug_tfidf; None or a folder used to supplement the idf.
        stem, sanitize, lines, keep_ui: preprocessing flags forwarded unchanged to
            get_parsed_data and get_aug_tfidf.

    Returns:
        (similarity_matrix, judge_to_ind, venture_to_ind) where similarity_matrix
        has shape (number of judges, number of ventures) and the two dicts map
        each judge / venture key to its row / column index. Entries stay at -1
        for pairs in which either document is empty.
    '''
    judges, ventures = get_parsed_data(judge_folder, venture_folder, stem, sanitize, lines, keep_ui)
    judge_to_ind = {judge: ind for ind, judge in enumerate(judges)}
    venture_to_ind = {venture: ind for ind, venture in enumerate(ventures)}
    idf, _ = get_aug_tfidf([judges, ventures], wiki_folder, stem, sanitize, lines, keep_ui)

    # -1 marks pairs that are never scored because one side has no content.
    similarity_matrix = -np.ones((len(judges), len(ventures)))

    for judge, j_ind in judge_to_ind.items():
        j_counter = judges[judge]
        if len(j_counter) == 0:
            continue
        for venture, v_ind in venture_to_ind.items():
            # Look the venture up by its own key (no str() cast) so the lookup
            # matches the key the index mapping was built from.
            v_counter = ventures[venture]
            if len(v_counter) == 0:
                continue
            similarity_matrix[j_ind, v_ind] = tfidf_sim_aug(j_counter, v_counter, idf)
    return similarity_matrix, judge_to_ind, venture_to_ind

# Compute the augmented TF-IDF similarity matrix together with the judge and venture index mappings
similarity_matrix_tfidf_aug, judge_to_ind, venture_to_ind = get_tfidf_sim_aug(judge_folder=judge_folder, 
                                                                venture_folder=venture_folder, 
                                                                wiki_folder=None, # None, or the wikipedia folder used to supplement the idf
                                                                stem=True, # whether to stem when preprocessing the data
                                                                sanitize=True, # whether to sanitize when preprocessing the data
                                                                lines=False, # input format: a str passage (lines=True) or a list of str words (lines=False)
                                                                keep_ui=False) # whether to keep the uninformative words
# Save the similarity matrix as plain text in the output folder
np.savetxt(f"{folder_name}/Output/tfidf_aug_similarities.txt", similarity_matrix_tfidf_aug)



In [7]:
# Show the highest-scoring judge-venture pair of the augmented TF-IDF matrix
print_max_pair(similarity_matrix_tfidf_aug, judge_to_ind, venture_to_ind)

Similarity score: 0.08026348218648928
Judge 2.txt, 4
Dr. Kenji serves as CTO-in-Residence at FutureHealth Capital, where he advises portfolio companies at the intersection of life sciences and machine learning. A former practicing physician and biomedical researcher, Kenji has founded and exited two medtech startups, bringing a wealth of experience in product development, FDA navigation, and scientific validation. In judging settings, he is both compassionate and deeply analytical, often challenging founders on their clinical claims, data robustness, and regulatory readiness.
Venture 4.txt, 1
Neuron is a neurotechnology startup founded in 2025 and based in Toronto. The company is developing a wearable EEG headband that integrates with a personalized neuro-coaching platform, helping users improve focus, sleep, and cognitive performance. Backed by $2.4 million in early-stage funding, Neuron combines machine learning with neural signal processing, positioning itself as a category-defining

### 2. Use vanilla TFIDF with IDF smoothing

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel
from preprocess_data import parse_info, get_parsed_data

# Preprocessing settings for the vanilla TF-IDF run.
# NOTE(review): the get_tfidf_sim call further down passes the same values as
# literals and does not read these names - confirm whether they are still needed.
wiki_folder = None
stem = True
sanitize = True
lines = True
keep_ui = False

def get_tfidf_sim(judge_folder, venture_folder, wiki_folder, stem, sanitize, lines, keep_ui):
    '''Build the judge-by-venture similarity matrix using vanilla TF-IDF with IDF smoothing.

    The vocabulary and IDF statistics are fitted on the judge and venture
    documents, plus the documents parsed from `wiki_folder` when it is not None.
    Only the judge and venture documents are then vectorised and compared.

    Returns:
        (similarity_matrix, judge_to_ind, venture_to_ind): the judges-by-ventures
        block of the linear kernel between the TF-IDF vectors, and the dicts
        mapping each judge / venture key to its row / column index.
    '''
    judges, ventures = get_parsed_data(judge_folder, venture_folder, stem, sanitize, lines, keep_ui)
    judge_to_ind = {name: pos for pos, name in enumerate(judges)}
    venture_to_ind = {name: pos for pos, name in enumerate(ventures)}

    judge_docs = list(judges.values())
    venture_docs = list(ventures.values())
    corpus = judge_docs + venture_docs

    # Optionally widen the fitting corpus with the wiki documents.
    fit_corpus = corpus
    if wiki_folder is not None:
        wiki_docs = list(parse_info(wiki_folder, stem, sanitize, lines, keep_ui).values())
        fit_corpus = corpus + wiki_docs

    vectorizer = CountVectorizer(stop_words="english")
    transformer = TfidfTransformer(smooth_idf=True, use_idf=True, norm="l2", sublinear_tf=False)

    # Fit on the (possibly augmented) corpus, then weight judges + ventures only.
    transformer.fit(vectorizer.fit_transform(fit_corpus))
    weighted = transformer.transform(vectorizer.transform(corpus))

    # Rows of the full kernel are judges followed by ventures; keep the
    # judge rows against the venture columns.
    n_judges = len(judge_docs)
    full_kernel = linear_kernel(weighted, weighted)
    similarity_matrix = full_kernel[:n_judges, n_judges:]
    return similarity_matrix, judge_to_ind, venture_to_ind

# Compute the vanilla TF-IDF similarity matrix together with the judge and venture index mappings
similarity_matrix_tfidf, judge_to_ind, venture_to_ind = get_tfidf_sim(judge_folder=judge_folder, 
                                                                venture_folder=venture_folder, 
                                                                wiki_folder=None, # None, or the wikipedia folder used to supplement the idf
                                                                stem=True, # whether to stem when preprocessing the data
                                                                sanitize=True, # whether to sanitize when preprocessing the data
                                                                lines=True, # input format: a str passage (lines=True) or a list of str words (lines=False)
                                                                keep_ui=False) # whether to keep the uninformative words

# Save the similarity matrix as plain text in the output folder
np.savetxt(f"{folder_name}/Output/tfidf_similarities.txt", similarity_matrix_tfidf)


In [9]:
# Show the highest-scoring judge-venture pair of the vanilla TF-IDF matrix
print_max_pair(similarity_matrix_tfidf, judge_to_ind, venture_to_ind)

Similarity score: 0.1067768235868722
Judge 2.txt, 4
Dr. Kenji serves as CTO-in-Residence at FutureHealth Capital, where he advises portfolio companies at the intersection of life sciences and machine learning. A former practicing physician and biomedical researcher, Kenji has founded and exited two medtech startups, bringing a wealth of experience in product development, FDA navigation, and scientific validation. In judging settings, he is both compassionate and deeply analytical, often challenging founders on their clinical claims, data robustness, and regulatory readiness.
Venture 4.txt, 1
Neuron is a neurotechnology startup founded in 2025 and based in Toronto. The company is developing a wearable EEG headband that integrates with a personalized neuro-coaching platform, helping users improve focus, sleep, and cognitive performance. Backed by $2.4 million in early-stage funding, Neuron combines machine learning with neural signal processing, positioning itself as a category-defining 

## Using Embeddings

In [None]:
import time
import pandas as pd
from similarity.embed_sim import get_tokenizer_and_model, tokenize, token_similarity
from preprocess_data import get_parsed_data
from sklearn.metrics.pairwise import cosine_similarity


def get_embedding_sim(judge_folder, venture_folder, model, stem, sanitize, lines, keep_ui, token_level):
    '''Build the judge-by-venture similarity matrix from model embeddings.

    Args:
        judge_folder, venture_folder: folders with the judge / venture documents.
        model: model name handed to get_tokenizer_and_model and tokenize.
        stem, sanitize, lines, keep_ui: preprocessing flags forwarded to get_parsed_data.
        token_level: if true, use token_similarity on the per-token embeddings
            and masks; otherwise use cosine similarity of the mean embeddings.

    Returns:
        (similarity_matrix, judge_to_ind, venture_to_ind), where the dicts map
        each judge / venture key to its index.
    '''
    judges, ventures = get_parsed_data(judge_folder, venture_folder, stem, sanitize, lines, keep_ui)
    judge_to_ind = {name: pos for pos, name in enumerate(judges)}
    venture_to_ind = {name: pos for pos, name in enumerate(ventures)}

    # One (index, text) row per document, in mapping order.
    judge_df = pd.DataFrame(
        [(pos, judges[name]) for name, pos in judge_to_ind.items()],
        columns=["index", "info"],
    )
    venture_df = pd.DataFrame(
        [(pos, ventures[name]) for name, pos in venture_to_ind.items()],
        columns=["index", "info"],
    )

    tokenizer, pretrained_model = get_tokenizer_and_model(model)
    judge_embeddings, mean_judge_embed, judge_mask = tokenize(judge_df, model, tokenizer, pretrained_model)
    venture_embeddings, mean_venture_embed, venture_mask = tokenize(venture_df, model, tokenizer, pretrained_model)

    # Only the similarity computation itself is timed.
    started = time.time()
    if token_level:
        similarity_matrix = token_similarity(judge_embeddings, judge_mask, venture_embeddings, venture_mask)
    else:
        similarity_matrix = cosine_similarity(
            mean_judge_embed.cpu().numpy(),
            mean_venture_embed.cpu().numpy(),
        )
    elapsed = round(time.time() - started, 2)
    print(f'Calculating similarity matrix took: {elapsed} seconds')
    return similarity_matrix, judge_to_ind, venture_to_ind

# Token-level embedding similarity: compute the matrix and the judge and venture index mappings
similarity_matrix_embed_token, judge_to_ind, venture_to_ind = get_embedding_sim(judge_folder=judge_folder, 
                                                                venture_folder=venture_folder, 
                                                                model='allenai/scibert_scivocab_uncased', # the name of the model used 
                                                                stem=True, # whether to stem when preprocessing the data
                                                                sanitize=True, # whether to sanitize when preprocessing the data
                                                                lines=True, # input format: a str passage (lines=True) or a list of str words (lines=False)
                                                                keep_ui=True, # whether to keep the uninformative words
                                                                token_level=True) # whether to use token-level similarity
# Save the token-level similarity matrix
np.savetxt(f"{folder_name}/Output/embed_token_similarities.txt", similarity_matrix_embed_token)
# Same settings, but with token_level=False (cosine similarity of the mean embeddings)
similarity_matrix_embed, judge_to_ind, venture_to_ind = get_embedding_sim(judge_folder=judge_folder, 
                                                                venture_folder=venture_folder, 
                                                                model='allenai/scibert_scivocab_uncased',
                                                                stem=True, 
                                                                sanitize=True, 
                                                                lines=True, 
                                                                keep_ui=True,
                                                                token_level=False)
# Save the mean-embedding similarity matrix
np.savetxt(f"{folder_name}/Output/embed_similarities.txt", similarity_matrix_embed)

  from .autonotebook import tqdm as notebook_tqdm


Tokenization took 0.34 seconds
Tokenization took 0.27 seconds
Calculating similarity matrix took: 0.01 seconds
Tokenization took 0.44 seconds
Tokenization took 0.61 seconds
Calculating similarity matrix took: 0.0 seconds


In [11]:
# Show the highest-scoring pair of the mean-embedding matrix (np.array converts it to an ndarray first)
print_max_pair(np.array(similarity_matrix_embed), judge_to_ind, venture_to_ind)

Similarity score: 0.9153174
Judge 1.txt, 5
E.T is a General Partner at Meridian Ventures, where she leads investments in climate tech and industrial innovation. Prior to her transition into venture capital, Elena spent a decade as a VP at Amazon Logistics, giving her a deep understanding of supply chains and operational scaling. She is known for her sharp diligence and no-nonsense feedback, often honing in on unit economics, go-to-market assumptions, and the scalability of hardware solutions. Her portfolio includes companies like Amply Power, Rivertown Robotics, and TerraMat.
Venture 4.txt, 1
Neuron is a neurotechnology startup founded in 2025 and based in Toronto. The company is developing a wearable EEG headband that integrates with a personalized neuro-coaching platform, helping users improve focus, sleep, and cognitive performance. Backed by $2.4 million in early-stage funding, Neuron combines machine learning with neural signal processing, positioning itself as a category-defining

In [12]:
# Show the highest-scoring pair of the token-level embedding matrix (np.array converts it to an ndarray first)
print_max_pair(np.array(similarity_matrix_embed_token), judge_to_ind, venture_to_ind)

Similarity score: 0.5268329977989197
Judge 4.txt, 1
Dr. Priya is a neuroscientist turned venture capitalist, currently a Partner at Synthesis Capital where she leads investments in neurotech, brain-computer interfaces, and digital mental health. A former professor at Oxford and author of multiple papers on real-time EEG signal decoding, she brings scientific rigor and high expectations to any founder pitching in the brain-health space. She’s particularly drawn to ventures blend deep science with consumer application, but is known for grilling teams hard on clinical validity and IP defensibility.
Venture 4.txt, 1
Neuron is a neurotechnology startup founded in 2025 and based in Toronto. The company is developing a wearable EEG headband that integrates with a personalized neuro-coaching platform, helping users improve focus, sleep, and cognitive performance. Backed by $2.4 million in early-stage funding, Neuron combines machine learning with neural signal processing, positioning itself as

## Using Hybrid Method

In [None]:
import pandas as pd
import numpy as np
from similarity.tfidf_sim import get_aug_tfidf, get_smoothed_tfidf
from similarity.hybrid_sim import tokenize_hybrid, get_hybrid_sim
from similarity.embed_sim import get_tokenizer_and_model
from preprocess_data import get_parsed_data


def get_hybrid_embeddings(judge_folder, venture_folder, wiki_folder, model, augmented_idf, stem, sanitize, lines, keep_ui):
    '''Compute the embeddings, IDF weights and masks used by the hybrid similarity.

    Args:
        judge_folder, venture_folder: folders with the judge / venture documents.
        wiki_folder: None or a folder used to supplement the idf.
        model: model name handed to get_tokenizer_and_model and tokenize_hybrid.
        augmented_idf: if true, take the idf from get_aug_tfidf; otherwise from
            get_smoothed_tfidf.
        stem, sanitize: preprocessing flags applied to every parse in this function.
        lines, keep_ui: preprocessing flags for the text that is embedded.

    Returns:
        (judge_embeddings, judge_idf_weights, judge_mask,
         venture_embeddings, venture_idf_weights, venture_mask,
         judge_to_ind, venture_to_ind)
        The last two dicts map each judge / venture key to its index.
    '''
    # Word-level parse (lines=False): provides the index mappings and, for the
    # augmented variant, the documents the idf is computed from. The caller's
    # stem/sanitize flags are honoured so the idf is built on the same
    # preprocessing as the embedded text.
    # NOTE(review): keep_ui stays fixed at False here while get_aug_tfidf below
    # receives the caller's keep_ui - confirm this asymmetry is intended.
    judges, ventures = get_parsed_data(judge_folder, venture_folder, stem=stem, sanitize=sanitize, lines=False, keep_ui=False)
    judge_to_ind = {judge: ind for ind, judge in enumerate(judges)}
    venture_to_ind = {venture: ind for ind, venture in enumerate(ventures)}

    if augmented_idf:
        idf, _ = get_aug_tfidf([judges, ventures], wiki_folder, stem, sanitize, lines=False, keep_ui=keep_ui)
    else:
        idf = get_smoothed_tfidf(judge_folder, venture_folder, wiki_folder, stem, sanitize, lines=True, keep_ui=keep_ui)

    # Second parse with the caller's flags: this is the text that gets embedded.
    judge_line, venture_line = get_parsed_data(judge_folder, venture_folder, stem, sanitize, lines, keep_ui)
    judge_rows = [(ind, judge_line[judge]) for judge, ind in judge_to_ind.items()]
    judge_df = pd.DataFrame(judge_rows, columns=["index", "info"])
    venture_rows = [(ind, venture_line[venture]) for venture, ind in venture_to_ind.items()]
    venture_df = pd.DataFrame(venture_rows, columns=["index", "info"])

    tokenizer, pretrained_model = get_tokenizer_and_model(model)

    judge_embeddings, judge_idf_weights, judge_mask = tokenize_hybrid(
        judge_df, model, tokenizer, pretrained_model, idf)

    venture_embeddings, venture_idf_weights, venture_mask = tokenize_hybrid(
        venture_df, model, tokenizer, pretrained_model, idf)

    return judge_embeddings, judge_idf_weights, judge_mask, venture_embeddings, venture_idf_weights, venture_mask, judge_to_ind, venture_to_ind


In [None]:
# Get the embeddings, idf weights, masks and index mappings for the judges and ventures
judge_embeddings, judge_idf_weights, judge_mask, venture_embeddings, venture_idf_weights, venture_mask, judge_to_ind, venture_to_ind = get_hybrid_embeddings(
                                                                            judge_folder=judge_folder, 
                                                                            venture_folder=venture_folder, 
                                                                            wiki_folder=None, # None, or the wikipedia folder used to supplement the idf
                                                                            model='bert-base-uncased', # the name of the model used 
                                                                            augmented_idf=False, # whether to use the augmented idf or the vanilla idf
                                                                            stem=True, # whether to stem when preprocessing the data
                                                                            sanitize=True, # whether to sanitize when preprocessing the data
                                                                            lines=True, # input format: a str passage (lines=True) or a list of str words (lines=False)
                                                                            keep_ui=True) # whether to keep the uninformative words

# Compute three hybrid similarity matrices from the same inputs.
# token_level is a bool indicating whether to use token-level similarity.
# linear_kernel is a bool indicating whether to use normalization (linear_kernel=False) or not (linear_kernel=True).
similarity_matrix_token_kernel = get_hybrid_sim(judge_embeddings, judge_idf_weights, judge_mask, venture_embeddings, venture_idf_weights, venture_mask, token_level=True, linear_kernel=True)
similarity_matrix_token = get_hybrid_sim(judge_embeddings, judge_idf_weights, judge_mask, venture_embeddings, venture_idf_weights, venture_mask, token_level=True, linear_kernel=False)
similarity_matrix = get_hybrid_sim(judge_embeddings, judge_idf_weights, judge_mask, venture_embeddings, venture_idf_weights, venture_mask, token_level=False, linear_kernel=False)
# Save each similarity matrix as plain text in the output folder
np.savetxt(f"{folder_name}/Output/hybrid_similarities.txt", similarity_matrix)
np.savetxt(f"{folder_name}/Output/hybrid_token_similarities.txt", similarity_matrix_token)
np.savetxt(f"{folder_name}/Output/hybrid_token_kernel_similarities.txt", similarity_matrix_token_kernel)

Hybrid tokenization took 1.1 seconds
Hybrid tokenization took 0.43 seconds
Calculating similarities took: 0.01 seconds
Calculating similarities took: 0.09 seconds
Calculating similarities took: 0.0 seconds


In [None]:
# Show the highest-scoring pair of the hybrid matrix computed with token_level=True, linear_kernel=True
print_max_pair(similarity_matrix_token_kernel, judge_to_ind, venture_to_ind)

In [None]:
# Show the highest-scoring pair of the hybrid matrix computed with token_level=True, linear_kernel=False
print_max_pair(similarity_matrix_token, judge_to_ind, venture_to_ind)

In [None]:
# Show the highest-scoring pair of the hybrid matrix computed with token_level=False, linear_kernel=False
print_max_pair(similarity_matrix, judge_to_ind, venture_to_ind)

# 2. Get Evaluations
## Example Use Case
Here we randomly generate a manual score dictionary and run the evaluations on it. For actual evaluations, the manual score dictionary contains scores that human experts have manually curated for the match quality of judge-venture pairs.

In [None]:
from pathlib import Path
import numpy as np
def get_random_manual_scores(folder_name):
    '''Generate a random manual score for every (judge, venture) index pair.

    The number of judges and ventures is the number of "*.txt" files found in
    "<folder_name>/Input/Judge_Folder" and "<folder_name>/Input/Venture_Folder".

    Args:
        folder_name: root folder of the data set.

    Returns:
        dict mapping (judge_index, venture_index) to a random integer score
        from 1 to 5 inclusive. Empty if either folder contains no .txt file.
    '''
    judge_folder = f"{folder_name}/Input/Judge_Folder"
    venture_folder = f"{folder_name}/Input/Venture_Folder"

    # Count the files lazily instead of building throwaway lists.
    n_judges = sum(1 for _ in Path(judge_folder).glob("*.txt"))
    n_ventures = sum(1 for _ in Path(venture_folder).glob("*.txt"))

    # np.random.randint excludes its upper bound, so 6 is required for the
    # scores to cover the full 1 - 5 scale.
    return {
        (i, j): np.random.randint(1, 6)
        for i in range(n_judges)
        for j in range(n_ventures)
    }

# Random stand-in for expert-curated scores, generated for the 'example' data folder
manual_scores = get_random_manual_scores('example')

In [None]:
from evaluate import get_percentile, get_scoremat_from_sim, get_ranking

# Load the similarity matrix saved by the embedding step
sim_mat = np.loadtxt("example/Output/embed_similarities.txt")
# Path of the report file that is printed at the end of this cell
output_path = "example/Output/embed_kendall_tau.txt"
# Get the percentile distribution of the manual scores
percentiles = get_percentile(manual_scores)
# Use the similarity matrix and transform it into a score matrix with values of 1 - 5 instead of similarity scores
scoremat_from_sim = get_scoremat_from_sim(sim_mat, percentiles)
# Get the Kendall tau-b rank coefficients between the manual scores and the algorithmic scores
score_ranks = get_ranking(sim_mat, manual_scores, percentiles, scoremat_from_sim, output_path)
# Print the report stored at output_path
with open(output_path, 'r') as f:
    print(f.read())

Ranking based on manually labeled scores vs similarity-based buckets
0.09, p=0.517

Manual scores vs full similarity rank order
0.06, p=0.650

Pair: (0, 0)
Manual: 4, Bucket Rank: 4.0, Full Rank: 4.0, Rank Index: 33, Sim Value: 0.8998345136642456

Pair: (0, 1)
Manual: 4, Bucket Rank: 3.0, Full Rank: 3.0, Rank Index: 22, Sim Value: 0.8861034512519836

Pair: (0, 2)
Manual: 2, Bucket Rank: 2.0, Full Rank: 2.0, Rank Index: 7, Sim Value: 0.8570132851600647

Pair: (0, 3)
Manual: 2, Bucket Rank: 1.0, Full Rank: 2.0, Rank Index: 6, Sim Value: 0.8565527200698853

Pair: (0, 4)
Manual: 2, Bucket Rank: 4.0, Full Rank: 4.0, Rank Index: 32, Sim Value: 0.8997251391410828

Pair: (0, 5)
Manual: 4, Bucket Rank: 1.0, Full Rank: 2.0, Rank Index: 4, Sim Value: 0.8552368879318237

Pair: (1, 0)
Manual: 3, Bucket Rank: 4.0, Full Rank: 3.0, Rank Index: 28, Sim Value: 0.8946714401245117

Pair: (1, 1)
Manual: 4, Bucket Rank: 4.0, Full Rank: 4.0, Rank Index: 34, Sim Value: 0.9126430153846741

Pair: (1, 2)
Manual:

# 3. Ensemble Learning Optimization

In [None]:
from ensemble import optimize_similarity, get_score_difference

from sklearn.model_selection import KFold
folder_name = 'example'
# Similarity matrices (saved by the cells above) that the ensemble combines
filenames = ['example/Output/embed_similarities.txt', 
             'example/Output/hybrid_similarities.txt', 
             'example/Output/tfidf_similarities.txt']
# Get a random set of manual scores
manual_scores = get_random_manual_scores(folder_name)
# 5-fold split; shuffle with a fixed random_state so the folds are reproducible
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# NOTE(review): `avg` is not used anywhere in the visible code
avg = []

# Parallel lists so a fold's integer indices can select (pair, score) entries
score_keys = list(manual_scores.keys())
score_vals = list(manual_scores.values())
all_weights = []
# Iterate through each fold
for i, (train_index, test_index) in enumerate(kfold.split(score_keys)):
    print("\nCross-validation", i)
    # Build the train and test score dictionaries for this fold
    train_score_dic = {
        score_keys[j]: score_vals[j] for j in train_index
    }
    test_score_dic = {
        score_keys[j]: score_vals[j] for j in test_index
    }

    # use MLE to optimize over the algorithmic methods
    weights, X_train, y_train, train_mean_diff = optimize_similarity(filenames, train_score_dic)
    # Score difference on the held-out fold using the weights fitted on the training fold
    test_mean_diff = get_score_difference(filenames, test_score_dic, weights)
    all_weights.append(weights)
    print("Average Train Difference:", train_mean_diff)
    print('Average Test Difference:', test_mean_diff)

# Average the weights over the folds and evaluate them on all manual scores
all_weights = np.array(all_weights)
avg_weights = np.mean(all_weights, axis=0)
diff = get_score_difference(filenames, manual_scores, avg_weights)
print("\nFinal Avg Weight Diff:", diff)


Cross-validation 0
Weights: [1.06058716e-15 5.34692103e-01 4.65307897e-01]
Average Train Difference: 1.0714285714285714
Average Test Difference: 1.0

Cross-validation 1
Weights: [1.59904488e-14 5.26941188e-01 4.73058812e-01]
Average Train Difference: 0.8275862068965517
Average Test Difference: 0.2857142857142857

Cross-validation 2
Weights: [2.10950008e-17 5.55907686e-01 4.44092314e-01]
Average Train Difference: 1.3103448275862069
Average Test Difference: 0.0

Cross-validation 3
Weights: [5.88258238e-15 5.38859245e-01 4.61140755e-01]
Average Train Difference: 1.1724137931034482
Average Test Difference: 1.4285714285714286

Cross-validation 4
Weights: [0.         0.54159909 0.45840091]
Average Train Difference: 1.103448275862069
Average Test Difference: 1.1428571428571428

Final Avg Weight Diff: 1.1666666666666667
