# 0. Setup Data Folder and Helper Function

In [None]:
import os
import numpy as np

# Data locations: the top-level data folder plus the judge and venture
# input folders inside it. Browse these folders to see what they contain.
folder_name = "example"  # CHANGE THIS
judge_folder = "/".join([folder_name, "Input", "Judge_Folder"])  # CHANGE THIS
venture_folder = "/".join([folder_name, "Input", "Venture_Folder"])  # CHANGE THIS

# Make sure the output folder exists (no error if it is already there).
os.makedirs("/".join([folder_name, "Output"]), exist_ok=True)

# HELPER FUNCTION to print the max similarity pair
def print_max_pair(matrix: np.ndarray, ind_to_judge: dict, ind_to_venture: dict,
                   judge_dir=None, venture_dir=None) -> None:
    '''
    Print the highest similarity score in the matrix together with the
    description files of the judge and the venture that produced it.

    args:
        matrix: similarity matrix for judge and ventures (rows are indexed by
            judge, columns by venture)
        ind_to_judge: mapping from judge index to judge code
        ind_to_venture: mapping from venture index to venture code
        judge_dir: folder containing the `<judge code>.txt` files; when None,
            the notebook-level `judge_folder` is used
        venture_dir: folder containing the `<venture code>.txt` files; when
            None, the notebook-level `venture_folder` is used
    '''
    # Read from the same folders the similarity matrices were computed from,
    # so the helper keeps working when judge_folder / venture_folder are
    # pointed somewhere other than the default layout.
    if judge_dir is None:
        judge_dir = judge_folder
    if venture_dir is None:
        venture_dir = venture_folder

    # gets the judge and venture index of the max similarity pair
    judge_i, venture_i = np.unravel_index(np.argmax(matrix), matrix.shape)
    # the entry at the argmax position is the maximum, no second scan needed
    max_sim_score = matrix[judge_i, venture_i]

    # print the judge and venture descriptions and similarity score
    print('Similarity score:', max_sim_score)
    with open(f'{judge_dir}/{ind_to_judge[judge_i]}.txt', 'r') as f:
        print(f'Judge {ind_to_judge[judge_i]}.txt, {judge_i}')
        print(f.read())
    with open(f'{venture_dir}/{ind_to_venture[venture_i]}.txt', 'r') as f:
        print(f'Venture {ind_to_venture[venture_i]}.txt, {venture_i}')
        print(f.read())

# 1. Compute Similarity Matrices from Base Learners

In this section, we find the similarity matrices from the base learners. We have three types of base learners:

* TF-IDF based.
    * Augmented TF-IDF.
    * TF-IDF with IDF smoothing.
* Transformer-embedding based.
    * Token-level.
    * Document-level.
* Hybrid learners that combine TF-IDF with transformer-based embeddings.
    * Token-level.
    * Document-level.

## TF-IDF Base Learners
### A. Augmented TF-IDF
 
Modified from https://github.com/niharshah/TFIDFsimilarity.

In [None]:
from similarity.tfidf_sim import compute_tfidf_sim_aug

# Compute the augmented TF-IDF similarity matrix. The call also returns the
# judge and venture mappings (ind_to_judge, ind_to_venture) used by print_max_pair.
similarity_matrix_tfidf_aug, ind_to_judge, ind_to_venture = compute_tfidf_sim_aug(
    judge_folder=judge_folder,      # judge folder name
    venture_folder=venture_folder,  # venture folder name
    wiki_folder=None,               # None, or the wikipedia folder used to supplement the idf
    stem=True,                      # whether we stem when preprocessing the data
    sanitize=True,                  # whether we sanitize when preprocessing the data
    keep_ui=False,                  # whether to keep the uninformative words when preprocessing the data
)

# write the similarity matrix to the output folder
np.savetxt(f"{folder_name}/Output/tfidf_aug_similarities.txt", similarity_matrix_tfidf_aug)

# show the judge and venture with the highest similarity in this matrix
print_max_pair(similarity_matrix_tfidf_aug, ind_to_judge, ind_to_venture)

### B. TF-IDF with IDF smoothing

In [None]:
from similarity.tfidf_sim import compute_tfidf_sim

# Compute the TF-IDF similarity matrix; the two extra return values
# are not needed here and are discarded.
similarity_matrix_tfidf, _, _ = compute_tfidf_sim(
    judge_folder=judge_folder,      # judge folder name
    venture_folder=venture_folder,  # venture folder name
    wiki_folder=None,               # None, or the wikipedia folder used to supplement the idf
    stem=True,                      # whether we stem when preprocessing the data
    sanitize=True,                  # whether we sanitize when preprocessing the data
    keep_ui=False,                  # whether to keep the uninformative words when preprocessing the data
)

# write the similarity matrix to the output folder
np.savetxt(f"{folder_name}/Output/tfidf_similarities.txt", similarity_matrix_tfidf)

## Transformer-based Embedding Base Learners

### A. Token-level

For a venture and a judge, token-level finds the cosine similarity between individual token embeddings from the judge and the venture, and takes the mean over all the token-level similarities as the final similarity.

In [None]:
from similarity.embed_sim import compute_embedding_sim

# Compute the token-level embedding similarity matrix; the two extra return
# values are not needed here and are discarded.
similarity_matrix_embed_token, _, _ = compute_embedding_sim(
    judge_folder=judge_folder,      # judge folder name
    venture_folder=venture_folder,  # venture folder name
    model='bert-base-uncased',      # the name of the model used from huggingface
    stem=True,                      # whether we stem when preprocessing the data
    sanitize=True,                  # whether we sanitize when preprocessing the data
    keep_ui=True,                   # whether to keep the uninformative words when preprocessing the data
    token_level=True,               # use token-level similarity
)

# write the similarity matrix to the output folder
np.savetxt(f"{folder_name}/Output/embed_token_similarities.txt", similarity_matrix_embed_token)

### B. Document-level

For a venture and a judge, document-level averages the token embeddings of the venture and of the judge separately, and then finds the cosine similarity between the mean venture embedding and the mean judge embedding.

In [None]:
# Compute the document-level embedding similarity matrix: same settings as
# the token-level run, but with token_level switched off.
similarity_matrix_embed, _, _ = compute_embedding_sim(
    judge_folder=judge_folder,      # judge folder name
    venture_folder=venture_folder,  # venture folder name
    model='bert-base-uncased',      # the name of the model used from huggingface
    stem=True,                      # whether we stem when preprocessing the data
    sanitize=True,                  # whether we sanitize when preprocessing the data
    keep_ui=True,                   # whether to keep the uninformative words when preprocessing the data
    token_level=False,              # use document-level similarity
)

# write the similarity matrix to the output folder
np.savetxt(f"{folder_name}/Output/embed_similarities.txt", similarity_matrix_embed)

## Hybrid Base Learners

Get the embeddings, weights, attention masks, and judge/venture mappings necessary to compute the hybrid model similarity matrix.

In [None]:
from similarity.hybrid_sim import get_hybrid_embeddings
# Get the embeddings, IDF weights and masks for the judges and the ventures.
# The last two return values are not needed here and are discarded.
(
    judge_embeddings,
    judge_idf_weights,
    judge_mask,
    venture_embeddings,
    venture_idf_weights,
    venture_mask,
    _,
    _,
) = get_hybrid_embeddings(
    judge_folder=judge_folder,      # judge folder name
    venture_folder=venture_folder,  # venture folder name
    wiki_folder=None,               # None, or the wikipedia folder used to supplement the idf
    model='bert-base-uncased',      # the name of the model used
    augmented_idf=False,            # whether to use the augmented idf or the vanilla idf
    stem=True,                      # whether we stem when preprocessing the data
    sanitize=True,                  # whether we sanitize when preprocessing the data
    keep_ui=True,                   # whether to keep the uninformative words or not
)

### A. Token-level

For a venture and a judge, we weigh the token-level similarity between a token embedding from the judge and a token embedding from the venture by their corresponding IDF weights. Then we take the average across all weighted token similarities. 

In [None]:
from similarity.hybrid_sim import compute_hybrid_sim
# Compute the token-level hybrid similarity matrix from the embeddings,
# IDF weights and masks obtained above.
similarity_matrix_hybrid_token = compute_hybrid_sim(
    judge_embeddings,
    judge_idf_weights,
    judge_mask,
    venture_embeddings,
    venture_idf_weights,
    venture_mask,
    token_level=True,
)

# write the similarity matrix to the output folder
np.savetxt(f"{folder_name}/Output/hybrid_token_similarities.txt", similarity_matrix_hybrid_token)

### B. Document-level
For a venture and a judge, we take a weighted average of the token embeddings of the judge and, separately, of the venture. Then we calculate the cosine similarity between the two weighted-average embeddings. 

In [None]:
from similarity.hybrid_sim import compute_hybrid_sim
# Compute the document-level hybrid similarity matrix: same inputs as the
# token-level run, but with token_level switched off.
similarity_matrix_hybrid = compute_hybrid_sim(
    judge_embeddings,
    judge_idf_weights,
    judge_mask,
    venture_embeddings,
    venture_idf_weights,
    venture_mask,
    token_level=False,
)

# write the similarity matrix to the output folder
np.savetxt(f"{folder_name}/Output/hybrid_similarities.txt", similarity_matrix_hybrid)

# 2. Ensemble Learning

Now that we have the base learner similarity matrices, we can combine them into an ensemble learner through linear regression. First, we generate a random match quality dataset as the ground truth dataset. In actual evaluations, these match quality scores would be curated by human experts, assigning a score of 1 to 5 for each judge-venture pair, where 5 indicates an excellent match. 

In [None]:
from pathlib import Path
import numpy as np

# NOTE(review): this re-assigns folder_name and overrides whatever value was
# chosen in the setup cell (section 0) — presumably so this cell can run on
# its own; keep the two values in sync.
folder_name = "example"

def get_random_manual_scores(folder_name: str) -> dict:
    '''
    Generate a random match quality score (1 to 5) for every judge-venture pair.

    The number of judges and ventures is taken from the number of `.txt`
    files in `<folder_name>/Input/Judge_Folder` and
    `<folder_name>/Input/Venture_Folder`.

    args:
        folder_name: name of the data folder
    return:
        scores: a mapping of the judge-venture pairs, keyed by
            (judge index, venture index), to their randomly generated match
            quality score; empty if either folder has no `.txt` files
    '''
    judge_folder = f"{folder_name}/Input/Judge_Folder"
    venture_folder = f"{folder_name}/Input/Venture_Folder"

    # count the description files without materialising the path lists
    n_judges = sum(1 for _ in Path(judge_folder).glob("*.txt"))
    n_ventures = sum(1 for _ in Path(venture_folder).glob("*.txt"))

    # np.random.randint excludes the upper bound, so 6 is required for the
    # scores to cover the full 1..5 scale
    return {
        (i, j): np.random.randint(1, 6)
        for i in range(n_judges)
        for j in range(n_ventures)
    }

# draw a random match quality score for every judge-venture pair; this stands
# in for the expert-curated scores that a real evaluation would use
manual_scores = get_random_manual_scores(folder_name)

We use a linear regression with convex constraints to combine the similarity matrices from the base learners into one ensemble learner.

In [None]:
from ensemble import optimize_similarity, get_score_difference

from sklearn.model_selection import KFold
# similarity matrices of the base learners that are combined in the ensemble
filenames = [
    f'{folder_name}/Output/embed_similarities.txt',
    f'{folder_name}/Output/hybrid_similarities.txt',
    f'{folder_name}/Output/tfidf_similarities.txt',
]

# draw a fresh random set of manual scores
manual_scores = get_random_manual_scores(folder_name)

# 5-fold cross-validation over the scored pairs, shuffled with a fixed seed
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
avg = []

# parallel lists so a fold's integer indices can select pairs and their scores
score_keys = list(manual_scores)
score_vals = list(manual_scores.values())
all_weights = []

for i, (train_index, test_index) in enumerate(kfold.split(score_keys)):
    print("\nCross-validation", i)

    # rebuild the pair -> score dictionaries for this fold's train and test split
    train_score_dic = {score_keys[j]: score_vals[j] for j in train_index}
    test_score_dic = {score_keys[j]: score_vals[j] for j in test_index}

    # fit the ensemble weights on the training pairs
    weights, X_train, y_train, train_mean_diff = optimize_similarity(
        filenames, train_score_dic
    )
    # evaluate those weights on the held-out pairs
    test_mean_diff = get_score_difference(filenames, test_score_dic, weights)
    all_weights.append(weights)

    print("Average Train Difference:", train_mean_diff)
    print('Average Test Difference:', test_mean_diff)

# average the per-fold weights and evaluate them on the full set of scores
all_weights = np.array(all_weights)
avg_weights = all_weights.mean(axis=0)
diff = get_score_difference(filenames, manual_scores, avg_weights)
print("\nFinal Avg Weight Diff:", diff)

# 3. Kendall Tau-b Evaluation

We use the Kendall tau-b rank correlation to evaluate the predicted similarity scores against the ground-truth scores. This metric enables us to assess how well different models perform relative to each other. (In this tutorial, since we use a randomly generated dataset of match quality scores, the results do not reflect real-world performance. In practice, one would use match quality scores curated by human experts for a valid evaluation.)
* If $\tau=+1$, there's perfect positive monotonic association (as one variable's rank increases, the other's consistently increases).
* If $\tau=-1$, there's perfect negative monotonic association (as one variable's rank increases, the other's consistently decreases).
* If $\tau=0$, there's no monotonic association between the variables.

We can find the Kendall tau-b for a single base learner.

In [None]:
from evaluate import get_percentile, get_ranking
# file the evaluation report is read back from below
# (presumably written there by get_ranking — confirm in evaluate.py)
output_path = f"{folder_name}/Output/embed_kendall_tau.txt"
# similarity matrix saved earlier by the document-level embedding learner
sim_mat = np.loadtxt(f"{folder_name}/Output/embed_similarities.txt")

# percentile distribution of the manual scores
percentiles = get_percentile(manual_scores)
# Kendall tau-b rank coefficients between the manual scores and the algorithmic scores
score_ranks = get_ranking(sim_mat, manual_scores, percentiles, output_path)

# print the contents of the report file
with open(output_path, 'r') as f:
    print(f.read())

We can find the Kendall tau-b for the ensemble learner.

In [None]:
from ensemble import get_input_and_labels
# average the ensemble weights learned across the cross-validation folds
avg_weights = np.mean(all_weights, axis=0)
# similarity matrices of the base learners, in the same order used to fit the weights
filenames = [f'{folder_name}/Output/embed_similarities.txt', 
             f'{folder_name}/Output/hybrid_similarities.txt', 
             f'{folder_name}/Output/tfidf_similarities.txt']

# get the base learner inputs for the scored judge-venture pairs
X, _, _ = get_input_and_labels(filenames, manual_scores)
# Combine the base learners with the fold-averaged weights. `weights` must not
# be used here: it is left over from the cross-validation loop and only holds
# the weights of the last fold.
ensemble_similarity = np.dot(avg_weights, X.T)

# file the evaluation report is read back from below
output_path = f"{folder_name}/Output/ensemble_kendall_tau.txt"
# get the percentile distribution of scores of the manual score
percentiles = get_percentile(manual_scores)
# get the kendall tau-b rank coefficients between the manual scores and algorithmic scores
score_ranks = get_ranking(ensemble_similarity, manual_scores, percentiles, output_path)
# print the contents of the report file
with open(output_path, 'r') as f:
    print(f.read())