# Imports

In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import visualization as viz
from nltk import FreqDist

# Load data

In [None]:
# Load the cleaned dataset from a path relative to the working directory.
# Later cells read its "Name" column and the keyword columns
# ("Experience", "Skills", "Interests", ...).
clean_data = pd.read_csv("data/clean_data.csv")
# Bare expression: shows the frame when this is the last line of a notebook cell.
clean_data

In [None]:
# Cast every column to str so each cell is plain text: the keyword-extraction
# cell compares cells against the literal "nan" (what a missing value becomes
# after this cast) and parses the remaining cells with literal_eval.
clean_data = clean_data.astype(str)

# Convert to one-hot vector

In [None]:
# Columns whose cells are parsed with literal_eval into a collection of keywords.
cols_to_include = ["Experience","Skills","Interests","Stages","Objectives","Looking for"]
all_keywords = []    # every keyword from every user, duplicates included
user_keywords = {}   # user name -> that user's keywords across all columns
for idx, row in clean_data.iterrows():
    # Start a fresh list for this name (a repeated name replaces the earlier entry).
    row_keywords = []
    user_keywords[row["Name"]] = row_keywords
    for col in cols_to_include:
        # "nan" is the text form of a missing cell after the str cast; skip it.
        if row[col] == "nan":
            continue
        parsed = literal_eval(row[col])
        all_keywords.extend(parsed)
        row_keywords.extend(parsed)
# Sorted, de-duplicated vocabulary used as the one-hot column order.
all_uniq_keywords = np.unique(all_keywords)

In [None]:
# Build the user x keyword indicator matrix: entry (i, j) is 1.0 when
# user i listed keyword j and 0.0 otherwise.
users = list(user_keywords.keys())
onehot_arr = np.zeros((len(user_keywords), len(all_uniq_keywords)))
for i, user in enumerate(users):
    listed = user_keywords[user]
    # Fill the whole row at once; booleans are stored as 0.0 / 1.0.
    onehot_arr[i] = [keyword in listed for keyword in all_uniq_keywords]
onehot_df = pd.DataFrame(onehot_arr, index=users, columns=all_uniq_keywords)
onehot_df

In [None]:
# Pass the one-hot matrix (one row per user) to the project-local
# visualization helper. User names are supplied as labels, but
# include_labels=False is passed as well.
# NOTE(review): presumably this draws a low-dimensional embedding scatter with
# the labels hidden -- confirm against visualization.plot_embedding.
fig = viz.plot_embedding(onehot_df.to_numpy(),labels=users,include_labels=False)

# Measure cosine similarity between users

In [None]:
# Heatmap of the indicator matrix: one row per user, one column per keyword.
kw_fig, kw_ax = plt.subplots(figsize=(15, 10))
sns.heatmap(onehot_df, cmap="Blues", ax=kw_ax)
kw_ax.set_title("Keyword Responses")
plt.show()

In [None]:
# Pairwise cosine similarity between the users' keyword rows; entry (i, j)
# compares users[i] with users[j].
cos_similarity = cosine_similarity(onehot_df.to_numpy())

# Draw the similarity matrix with user names on both axes.
sim_fig, sim_ax = plt.subplots(figsize=(15, 12))
sns.heatmap(
    cos_similarity,
    cmap="Blues",
    xticklabels=users,
    yticklabels=users,
    ax=sim_ax,
)
sim_ax.set_title("Cosine similarity")
plt.show()

# Rank similarity

In [None]:
def similarity_rankings(users, similarity_scores, num_recs: int=5) -> pd.DataFrame:
    """Rank, for every user, the other users by descending similarity.

    Args:
        users: Sequence of user labels; ``users[i]`` labels row and column
            ``i`` of ``similarity_scores``.
        similarity_scores: Square matrix (2-D array or nested sequence) where
            ``similarity_scores[i][j]`` is the similarity between
            ``users[i]`` and ``users[j]``.
        num_recs: Maximum number of recommendations per user. It is capped at
            ``len(users) - 1`` because a user is never recommended to
            themselves; values below zero are treated as zero.

    Returns:
        A long-format DataFrame with one row per (user, recommendation) pair
        and the columns ``user``, ``recommendation``, ``score`` and
        ``ranking`` (1 = most similar). Users with equal scores keep their
        relative order from ``users``; NaN scores, if any, rank last.
    """
    user_labels = np.asarray(users)
    # Never ask for more recommendations than there are *other* users;
    # otherwise the ranking would run out of candidates and fail.
    recs_per_user = max(0, min(num_recs, len(user_labels) - 1))

    user_recs_dict = {"user":[],"recommendation":[],"score":[], "ranking":[]}
    for i, user in enumerate(users):
        # Drop the user's own entry so they cannot be matched with themselves.
        candidate_scores = np.delete(np.asarray(similarity_scores[i]), i)
        candidate_users = np.delete(user_labels, i)
        # A stable sort on the negated scores gives descending order while
        # keeping ties in their original order (the same result as repeatedly
        # taking the first maximum and removing it).
        order = np.argsort(-candidate_scores.astype(float), kind="stable")
        for rank, candidate_idx in enumerate(order[:recs_per_user], start=1):
            user_recs_dict["user"].append(user)
            user_recs_dict["recommendation"].append(candidate_users[candidate_idx])
            user_recs_dict["score"].append(candidate_scores[candidate_idx])
            user_recs_dict["ranking"].append(rank)

    user_recs = pd.DataFrame.from_dict(user_recs_dict)
    return user_recs

In [None]:
# Rank matches for every user, then chart the recommendations of eight
# randomly chosen users on a 4 x 2 grid with a shared y-axis.
cos_user_recs = similarity_rankings(users, cos_similarity)
rand_users = np.random.choice(users, size=8, replace=False)
fig, axes = plt.subplots(4, 2, figsize=(15, 10), sharey=True)
for i, user in enumerate(rand_users):
    ax = axes.flat[i]
    user_rows = cos_user_recs.loc[cos_user_recs["user"] == user]
    ax.bar(user_rows["recommendation"], user_rows["score"])
    ax.set_title(user)
    # Only the left-hand column (even positions) carries the y-axis label.
    ax.set_ylabel("cosine similarity" if i % 2 == 0 else "")
plt.tight_layout()
plt.show()

# Visualize highest ranked matches

In [None]:
# Empty list; it is not filled or read in the cells visible here.
top_pairs = []
# All (user, recommendation) rows, highest similarity score first.
sorted_recs = cos_user_recs.sort_values(by="score", ascending=False)
# Show the 20 strongest rows.
sorted_recs.head(20)

In [None]:
# Columns handed to the comparison helper for each pair.
columns = ["Experience","Skills","Stages","Objectives","Interests"]
# Compare the five highest-scoring (user, recommendation) rows.
# NOTE(review): the ranking table holds a row for each direction of a pair, so
# the same two users may show up twice here -- confirm whether that is intended.
for pair in sorted_recs.head(5).itertuples(index=False):
    fig = viz.compare_users(pair.user, pair.recommendation, columns)