# 1. Get Similarity Matrix

Run the following cell to specify which data to use.

In [None]:
import os
import numpy as np

# Root folder of the dataset. Inputs are read from <folder_name>/Input and
# every similarity matrix below is saved under <folder_name>/Output.
folder_name = "example"
judge_folder = f"{folder_name}/Input/Judge_Folder"
venture_folder = f"{folder_name}/Input/Venture_Folder"

# Create the output folder up front so the later np.savetxt calls have a
# destination; exist_ok=True makes re-running this cell harmless.
os.makedirs(f"{folder_name}/Output", exist_ok=True)

def print_max_pair(matrix, judge_to_ind, venture_to_ind):
    """Print the best-scoring judge/venture pair together with both texts.

    Finds the first maximum entry of ``matrix``, translates its row and
    column back into a judge name and a venture name, and prints the score
    followed by the contents of the matching ``.txt`` files. The files are
    looked up under the module-level ``folder_name``.

    Args:
        matrix: 2-D similarity array; rows correspond to the indices in
            ``judge_to_ind`` and columns to those in ``venture_to_ind``.
        judge_to_ind: Mapping from judge name to row index.
        venture_to_ind: Mapping from venture name to column index.
    """
    flat_best = np.argmax(matrix)
    best_row, best_col = np.unravel_index(flat_best, matrix.shape)
    best_score = np.max(matrix)

    # Invert the name -> index mappings so the winning indices can be named.
    judge_names = {pos: name for name, pos in judge_to_ind.items()}
    venture_names = {pos: name for name, pos in venture_to_ind.items()}

    print('Similarity score:', best_score)
    # Judge first, then venture; the label doubles as the input sub-folder
    # prefix ("Judge_Folder" / "Venture_Folder").
    for label, names_by_pos, pos in (
        ('Judge', judge_names, best_row),
        ('Venture', venture_names, best_col),
    ):
        with open(f'{folder_name}/Input/{label}_Folder/{names_by_pos[pos]}.txt', 'r') as f:
            print(f'{label} {names_by_pos[pos]}.txt, {pos}')
            print(f.read())

## Using TFIDF
### 1. Use augmented TFIDF

In [None]:
from similarity.tfidf_sim import get_aug_tfidf, tfidf_sim_aug
from preprocess_data import get_parsed_data

def get_tfidf_sim_aug(judge_folder, venture_folder, wiki_folder, stem, sanitize, lines, keep_ui):
    """Build the judge-by-venture similarity matrix with augmented TF-IDF.

    Judges and ventures are parsed with ``get_parsed_data``, the IDF table
    comes from ``get_aug_tfidf``, and each pair is scored with
    ``tfidf_sim_aug``. A pair keeps the placeholder score ``-1`` when either
    side parsed to an empty container.

    Args:
        judge_folder: Folder holding the judge text files.
        venture_folder: Folder holding the venture text files.
        wiki_folder: Forwarded to ``get_aug_tfidf``.
        stem, sanitize, lines, keep_ui: Parsing options forwarded unchanged
            to ``get_parsed_data`` and ``get_aug_tfidf``.

    Returns:
        Tuple ``(similarity_matrix, judge_to_ind, venture_to_ind)`` where the
        two dicts map each judge/venture key to its row/column index.
    """
    judges, ventures = get_parsed_data(judge_folder, venture_folder, stem, sanitize, lines, keep_ui)
    judge_to_ind = {name: pos for pos, name in enumerate(judges)}
    venture_to_ind = {name: pos for pos, name in enumerate(ventures)}
    idf, _ = get_aug_tfidf([judges, ventures], wiki_folder, stem, sanitize, lines, keep_ui)

    # Start from -1 everywhere; only pairs with content on both sides are scored.
    similarity_matrix = np.full((len(judges), len(ventures)), -1.0)

    for judge, row in judge_to_ind.items():
        j_counter = judges[judge]
        for venture, col in venture_to_ind.items():
            v_counter = ventures[str(venture)]
            if len(j_counter) == 0 or len(v_counter) == 0:
                continue
            similarity_matrix[row, col] = tfidf_sim_aug(j_counter, v_counter, idf)

    return similarity_matrix, judge_to_ind, venture_to_ind

# Score all judge/venture pairs with augmented TF-IDF (no wiki folder,
# lines=False) and save the matrix as plain text.
similarity_matrix_tfidf_aug, judge_to_ind, venture_to_ind = get_tfidf_sim_aug(judge_folder=judge_folder, 
                                                                venture_folder=venture_folder, 
                                                                wiki_folder=None, 
                                                                stem=True, 
                                                                sanitize=True, 
                                                                lines=False, 
                                                                keep_ui=False)
np.savetxt(f"{folder_name}/Output/tfidf_aug_similarities.txt", similarity_matrix_tfidf_aug)



In [None]:
# Show the top-scoring judge/venture pair under augmented TF-IDF.
print_max_pair(similarity_matrix_tfidf_aug, judge_to_ind, venture_to_ind)

### 2. Use vanilla TFIDF with IDF smoothing

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel
from preprocess_data import parse_info, get_parsed_data

# Module-level parsing settings.
# NOTE(review): the get_tfidf_sim call later in this cell passes explicit
# literals (including wiki_folder=None) and does not read these names --
# confirm whether they are meant to be passed instead.
wiki_folder = 'wikipedia_files'
stem = True
sanitize = True
lines = True
keep_ui = False

def get_tfidf_sim(judge_folder, venture_folder, wiki_folder, stem, sanitize, lines, keep_ui):
    """Build the judge-by-venture similarity matrix with smoothed TF-IDF.

    The vocabulary and IDF statistics are fitted on the judge and venture
    documents, plus the documents parsed from ``wiki_folder`` when one is
    given. Only judge and venture documents are then transformed, and the
    judge-rows / venture-columns corner of their linear kernel is returned.

    Args:
        judge_folder: Folder holding the judge text files.
        venture_folder: Folder holding the venture text files.
        wiki_folder: Optional folder of extra documents used only for
            fitting; ``None`` skips it.
        stem, sanitize, lines, keep_ui: Parsing options forwarded unchanged
            to ``get_parsed_data`` and ``parse_info``.

    Returns:
        Tuple ``(similarity_matrix, judge_to_ind, venture_to_ind)`` where the
        two dicts map each judge/venture key to its row/column index.
    """
    judges, ventures = get_parsed_data(judge_folder, venture_folder, stem, sanitize, lines, keep_ui)
    judge_to_ind = {name: pos for pos, name in enumerate(judges)}
    venture_to_ind = {name: pos for pos, name in enumerate(ventures)}

    n_judges = len(judges)
    # Judges first, ventures second: the final slice relies on this order.
    corpus = list(judges.values()) + list(ventures.values())

    # The fitting corpus optionally includes the wiki documents as well.
    fit_corpus = corpus
    if wiki_folder is not None:
        wiki_docs = parse_info(wiki_folder, stem, sanitize, lines, keep_ui)
        fit_corpus = corpus + list(wiki_docs.values())

    vectorizer = CountVectorizer(stop_words="english")
    weighter = TfidfTransformer(smooth_idf=True, use_idf=True, norm="l2", sublinear_tf=False)
    weighter.fit(vectorizer.fit_transform(fit_corpus))

    weighted = weighter.transform(vectorizer.transform(corpus))
    full_kernel = linear_kernel(weighted, weighted)

    # Keep only the judge rows against the venture columns.
    return full_kernel[:n_judges, n_judges:], judge_to_ind, venture_to_ind

# Score all judge/venture pairs with smoothed TF-IDF (no wiki folder,
# lines=True) and save the matrix as plain text.
similarity_matrix_tfidf, judge_to_ind, venture_to_ind = get_tfidf_sim(judge_folder=judge_folder, 
                                                                venture_folder=venture_folder, 
                                                                wiki_folder=None, 
                                                                stem=True, 
                                                                sanitize=True, 
                                                                lines=True, 
                                                                keep_ui=False)
np.savetxt(f"{folder_name}/Output/tfidf_similarities.txt", similarity_matrix_tfidf)


In [None]:
# Show the top-scoring judge/venture pair under smoothed TF-IDF.
print_max_pair(similarity_matrix_tfidf, judge_to_ind, venture_to_ind)

## Using Embeddings

In [None]:
import time
import pandas as pd
from similarity.embed_sim import get_tokenizer_and_model, tokenize, token_similarity
from preprocess_data import get_parsed_data
from sklearn.metrics.pairwise import cosine_similarity


def get_embedding_sim(judge_folder, venture_folder, model, stem, sanitize, lines, keep_ui, token_level):
    """Build the judge-by-venture similarity matrix from model embeddings.

    Judges and ventures are parsed, placed in ``index``/``info`` data frames
    and embedded with ``tokenize``. With ``token_level`` false the matrix is
    the cosine similarity of the mean embeddings; otherwise it is whatever
    ``token_similarity`` computes from the per-token embeddings and masks.
    The time spent on the similarity step alone is printed.

    Args:
        judge_folder: Folder holding the judge text files.
        venture_folder: Folder holding the venture text files.
        model: Model identifier handed to ``get_tokenizer_and_model`` and
            ``tokenize``.
        stem, sanitize, lines, keep_ui: Parsing options forwarded unchanged
            to ``get_parsed_data``.
        token_level: Selects token-level similarity instead of mean-embedding
            cosine similarity.

    Returns:
        Tuple ``(similarity_matrix, judge_to_ind, venture_to_ind)`` where the
        two dicts map each judge/venture key to its row/column index.
    """
    judges, ventures = get_parsed_data(judge_folder, venture_folder, stem, sanitize, lines, keep_ui)
    judge_to_ind = {name: pos for pos, name in enumerate(judges)}
    venture_to_ind = {name: pos for pos, name in enumerate(ventures)}

    frame_columns = ["index", "info"]
    judge_df = pd.DataFrame(
        [(pos, judges[name]) for name, pos in judge_to_ind.items()],
        columns=frame_columns,
    )
    venture_df = pd.DataFrame(
        [(pos, ventures[name]) for name, pos in venture_to_ind.items()],
        columns=frame_columns,
    )

    tokenizer, pretrained_model = get_tokenizer_and_model(model)
    judge_embeddings, mean_judge_embed, judge_mask = tokenize(
        judge_df, model, tokenizer, pretrained_model
    )
    venture_embeddings, mean_venture_embed, venture_mask = tokenize(
        venture_df, model, tokenizer, pretrained_model
    )

    # Time only the similarity computation, not parsing or embedding.
    started = time.time()
    if token_level:
        similarity_matrix = token_similarity(
            judge_embeddings, judge_mask, venture_embeddings, venture_mask
        )
    else:
        similarity_matrix = cosine_similarity(
            mean_judge_embed.numpy(), mean_venture_embed.numpy()
        )
    elapsed = round(time.time() - started, 2)
    print(f'Calculating similarity matrix took: {elapsed} seconds')

    return similarity_matrix, judge_to_ind, venture_to_ind

# Token-level embedding similarities. Each get_embedding_sim call parses the
# folders, fetches the tokenizer/model and embeds the texts itself, so the
# two calls in this cell repeat that work.
similarity_matrix_embed_token, judge_to_ind, venture_to_ind = get_embedding_sim(judge_folder=judge_folder, 
                                                                venture_folder=venture_folder, 
                                                                model='bert-base-uncased',
                                                                stem=True, 
                                                                sanitize=True, 
                                                                lines=True, 
                                                                keep_ui=True,
                                                                token_level=True)
np.savetxt(f"{folder_name}/Output/embed_token_similarities.txt", similarity_matrix_embed_token)
# Mean-embedding (cosine) similarities with otherwise identical settings.
similarity_matrix_embed, judge_to_ind, venture_to_ind = get_embedding_sim(judge_folder=judge_folder, 
                                                                venture_folder=venture_folder, 
                                                                model='bert-base-uncased',
                                                                stem=True, 
                                                                sanitize=True, 
                                                                lines=True, 
                                                                keep_ui=True,
                                                                token_level=False)
np.savetxt(f"{folder_name}/Output/embed_similarities.txt", similarity_matrix_embed)

In [None]:
# Show the top-scoring pair under mean-embedding cosine similarity.
print_max_pair(np.array(similarity_matrix_embed), judge_to_ind, venture_to_ind)

In [None]:
# Show the top-scoring pair under token-level embedding similarity.
print_max_pair(np.array(similarity_matrix_embed_token), judge_to_ind, venture_to_ind)

## Using Hybrid Method

In [None]:
import pandas as pd
import numpy as np
from similarity.tfidf_sim import get_aug_tfidf, get_smoothed_tfidf
from similarity.hybrid_sim import tokenize_hybrid, get_hybrid_sim
from similarity.embed_sim import get_tokenizer_and_model
from preprocess_data import get_parsed_data


def get_hybrid_embeddings(judge_folder, venture_folder, wiki_folder, model, augmented_idf, stem, sanitize, lines, keep_ui):
    """Embed judges and ventures and attach IDF weights for the hybrid method.

    The folders are parsed twice. The first parse uses fixed options and
    supplies the key-to-index mappings (and, when ``augmented_idf`` is true,
    the input to ``get_aug_tfidf``). The second parse uses the caller's
    options and supplies the texts that ``tokenize_hybrid`` embeds.

    NOTE(review): the first parse ignores the ``stem``, ``sanitize``,
    ``lines`` and ``keep_ui`` arguments, and both IDF helpers receive a fixed
    ``lines`` value -- confirm this is intentional.

    Args:
        judge_folder: Folder holding the judge text files.
        venture_folder: Folder holding the venture text files.
        wiki_folder: Forwarded to the IDF helper.
        model: Model identifier handed to ``get_tokenizer_and_model`` and
            ``tokenize_hybrid``.
        augmented_idf: Use ``get_aug_tfidf`` when true, otherwise
            ``get_smoothed_tfidf``.
        stem, sanitize, lines, keep_ui: Parsing options for the second parse;
            ``stem``, ``sanitize`` and ``keep_ui`` also go to the IDF helper.

    Returns:
        Tuple ``(judge_embeddings, judge_idf_weights, judge_mask,
        venture_embeddings, venture_idf_weights, venture_mask, judge_to_ind,
        venture_to_ind)``.
    """
    # Fixed-option parse: defines the index order of judges and ventures.
    judges, ventures = get_parsed_data(
        judge_folder, venture_folder, stem=True, sanitize=True, lines=False, keep_ui=False
    )
    judge_to_ind = {name: pos for pos, name in enumerate(judges)}
    venture_to_ind = {name: pos for pos, name in enumerate(ventures)}

    if not augmented_idf:
        idf = get_smoothed_tfidf(
            judge_folder, venture_folder, wiki_folder, stem, sanitize, lines=True, keep_ui=keep_ui
        )
    else:
        idf, _ = get_aug_tfidf(
            [judges, ventures], wiki_folder, stem, sanitize, lines=False, keep_ui=keep_ui
        )

    # Caller-option parse: the texts that actually get embedded.
    judge_line, venture_line = get_parsed_data(judge_folder, venture_folder, stem, sanitize, lines, keep_ui)
    frame_columns = ["index", "info"]
    judge_df = pd.DataFrame(
        [(pos, judge_line[name]) for name, pos in judge_to_ind.items()],
        columns=frame_columns,
    )
    venture_df = pd.DataFrame(
        [(pos, venture_line[name]) for name, pos in venture_to_ind.items()],
        columns=frame_columns,
    )

    tokenizer, pretrained_model = get_tokenizer_and_model(model)
    judge_embeddings, judge_idf_weights, judge_mask = tokenize_hybrid(
        judge_df, model, tokenizer, pretrained_model, idf
    )
    venture_embeddings, venture_idf_weights, venture_mask = tokenize_hybrid(
        venture_df, model, tokenizer, pretrained_model, idf
    )

    return (
        judge_embeddings,
        judge_idf_weights,
        judge_mask,
        venture_embeddings,
        venture_idf_weights,
        venture_mask,
        judge_to_ind,
        venture_to_ind,
    )


In [None]:
# Compute embeddings and IDF weights once (smoothed IDF, no wiki folder),
# then derive three hybrid similarity matrices from the same inputs.
judge_embeddings, judge_idf_weights, judge_mask, venture_embeddings, venture_idf_weights, venture_mask, judge_to_ind, venture_to_ind = get_hybrid_embeddings(
                                                                            judge_folder=judge_folder, 
                                                                            venture_folder=venture_folder, 
                                                                            wiki_folder=None, 
                                                                            model='bert-base-uncased', 
                                                                            augmented_idf=False, 
                                                                            stem=True, 
                                                                            sanitize=True, 
                                                                            lines=True, 
                                                                            keep_ui=True)
            
# The three variants differ only in the token_level / linear_kernel flags.
similarity_matrix_token_kernel = get_hybrid_sim(judge_embeddings, judge_idf_weights, judge_mask, venture_embeddings, venture_idf_weights, venture_mask, token_level=True, linear_kernel=True)
similarity_matrix_token = get_hybrid_sim(judge_embeddings, judge_idf_weights, judge_mask, venture_embeddings, venture_idf_weights, venture_mask, token_level=True, linear_kernel=False)
similarity_matrix = get_hybrid_sim(judge_embeddings, judge_idf_weights, judge_mask, venture_embeddings, venture_idf_weights, venture_mask, token_level=False, linear_kernel=False)

# Save each variant under its own file name in the output folder.
np.savetxt(f"{folder_name}/Output/hybrid_similarities.txt", similarity_matrix)
np.savetxt(f"{folder_name}/Output/hybrid_token_similarities.txt", similarity_matrix_token)
np.savetxt(f"{folder_name}/Output/hybrid_token_kernel_similarities.txt", similarity_matrix_token_kernel)

In [None]:
# Show the top-scoring pair for the hybrid token-level, linear-kernel variant.
print_max_pair(similarity_matrix_token_kernel, judge_to_ind, venture_to_ind)

In [None]:
# Show the top-scoring pair for the hybrid token-level variant (no linear kernel).
print_max_pair(similarity_matrix_token, judge_to_ind, venture_to_ind)

In [None]:
# Show the top-scoring pair for the hybrid variant with token_level=False.
print_max_pair(similarity_matrix, judge_to_ind, venture_to_ind)

# 2. Get Evaluations
## Example Use Case
Here we randomly generate a dictionary of manual scores and run the evaluations on it. In an actual evaluation, the manual score dictionary contains scores curated by human experts that rate the match quality of each judge–venture pair.

In [None]:
from pathlib import Path
import numpy as np
def get_random_manual_scores(folder_name, seed=None):
    """Generate a random stand-in for the manually curated score dictionary.

    Counts the ``*.txt`` files in ``<folder_name>/Input/Judge_Folder`` and
    ``<folder_name>/Input/Venture_Folder`` and draws one integer score for
    every (judge index, venture index) pair.

    Args:
        folder_name: Dataset root containing ``Input/Judge_Folder`` and
            ``Input/Venture_Folder``.
        seed: Optional seed for reproducible scores. With the default
            ``None`` the scores come from NumPy's global random state,
            exactly as before; otherwise a private ``RandomState(seed)`` is
            used and the global state is left untouched.

    Returns:
        Dict mapping ``(i, j)`` to an integer score in ``1..4``, for ``i``
        over the judge count and ``j`` over the venture count.
    """
    # Use the global generator unless the caller asks for reproducibility.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Count files lazily instead of materialising throwaway lists.
    n_judges = sum(1 for _ in Path(f"{folder_name}/Input/Judge_Folder").glob("*.txt"))
    n_ventures = sum(1 for _ in Path(f"{folder_name}/Input/Venture_Folder").glob("*.txt"))

    # randint's upper bound is exclusive, so scores fall in 1..4. Drawing in
    # row-major order keeps the sequence identical to the original loops.
    return {
        (i, j): rng.randint(1, 5)
        for i in range(n_judges)
        for j in range(n_ventures)
    }

# Random placeholder scores for the 'example' dataset; they change on every run.
manual_scores = get_random_manual_scores('example')

In [None]:
from evaluate import get_percentile, get_scoremat_from_sim, get_ranking

# Evaluate the saved mean-embedding similarities against the manual scores.
sim_mat = np.loadtxt("example/Output/embed_similarities.txt")
output_path = "example/Output/embed_kendall_tau.txt"
percentiles = get_percentile(manual_scores)
scoremat_from_sim = get_scoremat_from_sim(sim_mat, percentiles)
# get_ranking receives output_path; presumably it writes its report there,
# since the file is read back and printed right below -- verify in evaluate.py.
score_ranks = get_ranking(sim_mat, manual_scores, percentiles, scoremat_from_sim, output_path)
with open(output_path, 'r') as f:
    print(f.read())

# 3. Ensemble Learning Optimization

In [None]:
from ensemble import optimize_similarity, get_score_difference

from sklearn.model_selection import KFold
# Dataset root. The similarity files below are derived from it so that
# changing the dataset here cannot silently keep reading another dataset's
# outputs.
folder_name = 'example'
filenames = [f'{folder_name}/Output/embed_similarities.txt', 
             f'{folder_name}/Output/hybrid_similarities.txt', 
             f'{folder_name}/Output/tfidf_similarities.txt']
manual_scores = get_random_manual_scores(folder_name)
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# NOTE(review): `avg` is never read or appended to in the visible code; it
# is kept only in case something outside this view relies on the name.
avg = []

# Parallel lists so the fold indices returned by KFold can address both the
# (judge, venture) keys and their scores.
score_keys = list(manual_scores.keys())
score_vals = list(manual_scores.values())
all_weights = []
for i, (train_index, test_index) in enumerate(kfold.split(score_keys)):
    print("\nCross-validation", i)
    train_score_dic = {
        score_keys[j]: score_vals[j] for j in train_index
    }
    test_score_dic = {
        score_keys[j]: score_vals[j] for j in test_index
    }

    # Fit the ensemble weights on the training pairs, then measure the score
    # difference on the held-out pairs with those same weights.
    weights, X_train, y_train, train_mean_diff = optimize_similarity(filenames, train_score_dic)
    test_mean_diff = get_score_difference(filenames, test_score_dic, weights)
    all_weights.append(weights)
    print("Average Train Difference:", train_mean_diff)
    print('Average Test Difference:', test_mean_diff)

# Average the per-fold weights and report the difference over all pairs.
all_weights = np.array(all_weights)
avg_weights = np.mean(all_weights, axis=0)
diff = get_score_difference(filenames, manual_scores, avg_weights)
print("\nFinal Avg Weight Diff:", diff)