# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from gensim.models import KeyedVectors
import visualization as viz
import metrics as met
from ast import literal_eval
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

# Load data

In [None]:
# Load the processed data table. Later cells read its "Name" column and the
# category columns ("Experience", "Skills", "Interests", "Objectives", "Stages").
proc_data = pd.read_csv("data/processed_data.csv")
# Bare expression: displays the frame as the cell output.
proc_data

In [None]:
# Load the cleaned data table for inspection.
# NOTE(review): clean_data is not referenced by any other cell visible here.
clean_data = pd.read_csv("data/clean_data.csv")
# Bare expression: displays the frame as the cell output.
clean_data

# Remove users with fewer than a minimum number of responses
- each user must have at least 1 response in every category used in the model

In [None]:
# Drop every user (row) that does not have at least `cat_response_thresh`
# responses in each of the category columns used by the model.
indices_to_remove = []
cat_response_thresh = 1
cols_to_include = ["Experience", "Skills", "Interests", "Objectives", "Stages"]
for idx, row in proc_data.iterrows():
    for col in cols_to_include:
        cell = row[col]
        # A cell only counts when it is a string other than the NaN placeholders;
        # non-string cells (e.g. a float NaN for an empty CSV field) always fail.
        if not isinstance(cell, str) or cell in ("NaN", "nan"):
            has_enough = False
        else:
            try:
                has_enough = len(literal_eval(cell)) >= cat_response_thresh
            except (ValueError, SyntaxError, TypeError):
                # A malformed or non-sequence literal is treated as "no
                # responses" instead of aborting the whole filter.
                print(f"error in row {idx}, column {col}")
                has_enough = False
        if not has_enough:
            indices_to_remove.append(idx)
            # One failing category is enough; stop so the index is recorded once.
            break
proc_data = proc_data.drop(index=indices_to_remove).reset_index(drop=True)
proc_data

# Word embedding
- Google News Word2Vec
- trained on about 100 billion words from Google News
- contains 3 million words and phrases

In [None]:
# Load the pretrained word2vec vectors from the binary-format file on disk.
# `model[term]` is used in later cells to look up the vector for a term.
model = KeyedVectors.load_word2vec_format('models/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# Plot the word-level embeddings of all responses given by one user.
user = "Jacob Sheldon"
responses = []
cols_to_include = ["Experience", "Skills", "Stages", "Interests", "Objectives"]
for idx, row in proc_data[proc_data["Name"]==user].iterrows():
    for col in cols_to_include:
        cell = row[col]
        # Skip missing cells (non-strings or the "NaN" placeholder).
        if not isinstance(cell, str) or cell == "NaN":
            continue
        try:
            responses.extend(literal_eval(cell))
        except (ValueError, SyntaxError, TypeError):
            # Malformed literal, or a literal that is not a sequence.
            print(f"error in row {idx}, column {col}, user {user}")

# Keep the vectors and their labels in lockstep: a term that is missing from
# the model must not leave a label without a matching vector.
embedding = []
embedded_terms = []
for word in responses:
    try:
        vector = model[word]
    except KeyError:
        print(f"{word} not found in model")
        continue
    embedding.append(vector)
    embedded_terms.append(word)

fig = viz.plot_embedding(embedding,embedded_terms)

In [None]:
# Build, for every user, the list of word vectors of their responses and the
# mean of those vectors. `users`, `user_embeddings` and `ave_user_embedding`
# are parallel lists: entry i of each belongs to the same user.
terms_not_in_model = []
user_embeddings = []
all_users = proc_data["Name"].tolist()
# Filled while iterating so it stays aligned with the embedding lists even
# when two rows share a name or a row has to be skipped.
users = []
ave_user_embedding = []
cols_to_include = ["Experience", "Skills", "Interests", "Objectives", "Stages"]
for idx, row in proc_data.iterrows():
    name = row["Name"]

    # get all responses
    responses = []
    for col in cols_to_include:
        cell = row[col]
        if not isinstance(cell, str) or cell == "NaN":
            continue
        try:
            responses.extend(literal_eval(cell))
        except (ValueError, SyntaxError, TypeError):
            # Malformed literal, or a literal that is not a sequence.
            print(f"error in row {idx}, column {col}, user {name}")

    if not responses:
        continue

    # embedding for each word, with NaN components replaced by zeros
    embedding = []
    for resp in responses:
        try:
            vector = model[resp]
        except KeyError:
            # Report each out-of-vocabulary term only once.
            if resp not in terms_not_in_model:
                print(f"{resp} not found in model")
                terms_not_in_model.append(resp)
            continue
        embedding.append(np.nan_to_num(vector,copy=True,nan=0.0))

    if not embedding:
        # Averaging an empty list would put a scalar NaN into
        # ave_user_embedding and break its rectangular shape downstream.
        print(f"no terms found in model for user {name}; skipping")
        continue

    users.append(name)
    user_embeddings.append(embedding)

    # average
    ave_user_embedding.append(np.nanmean(np.array(embedding),axis=0))

In [None]:
# Plot the per-user average embeddings, passing `users` as the matching labels
# with include_labels=False.
# NOTE(review): plot_embedding lives in the project's visualization module and
# is not shown here; presumably the labels are not drawn — confirm.
fig = viz.plot_embedding(ave_user_embedding,users,include_labels=False)

In [None]:
# fig = viz.plot_embedding(np.array(ave_user_embedding),users,include_labels=False,dim="3d")

# Similarity rankings

In [None]:
# Compute similarities between users from their average embeddings.
# Later cells draw `cos_similarity` as a users-by-users heatmap and read the
# columns "User1", "User2", "Cosine Similarity" and "Ranking" from
# `similarity_scores`.
cos_similarity, similarity_scores = met.compute_similarity_rankings(ave_user_embedding, users)

In [None]:
# Draw the cosine-similarity matrix as a heatmap with user names on both axes.
heat_fig, heat_ax = plt.subplots(figsize=(15, 12))
sns.heatmap(
    cos_similarity,
    cmap="Blues",
    xticklabels=users,
    yticklabels=users,
    ax=heat_ax,
)
heat_ax.set_title("Cosine similarity")
plt.show()

In [None]:
# Order all scored pairs from most to least similar and show the first 20.
sorted_recs = similarity_scores.sort_values(by="Cosine Similarity", ascending=False)
sorted_recs.head(20)

# Visualize top matches across users

In [None]:
# For a random sample of users, write two PDF pages each: a comparison with
# their best-ranked match and both users' positions in the embedding space.
# Never ask for more users than exist; sampling without replacement would raise.
n_sampled_users = min(10, len(users))
rand_users = np.random.choice(users,size=n_sampled_users,replace=False)
columns = ["Experience", "Skills", "Interests", "Objectives", "Stages"]
# NOTE(review): the cell that exports the top unique pairs writes to this same
# path and overwrites this file — confirm that is intended.
with PdfPages("recommendations/GoogleWord2Vec_recommendations.pdf") as pdf:
    for user in rand_users:
        # get top rec
        top_rec_df = similarity_scores[similarity_scores["User1"]==user].sort_values("Ranking",ascending=True)
        if top_rec_df.empty:
            # No scored partner for this user; skip instead of raising IndexError.
            print(f"no recommendations found for {user}")
            continue

        # shared features
        rec = top_rec_df["User2"].tolist()[0]
        score = top_rec_df["Cosine Similarity"].tolist()[0]
        fig = viz.compare_users(user, rec, columns, score)
        pdf.savefig(fig)

        # locations in embed space: label only the user and the recommendation
        fig = viz.plot_embedding(
            ave_user_embedding,
            [u if u in [user,rec] else "" for u in users],
            highlight_labels=True,
            xlims=(-0.25,0.5),
            ylims=(-0.5,0.5),
            figsize=(12,9),
            dim="3d"
        )
        pdf.savefig(fig)

# Visualize matches for one user

In [None]:
# Export the five best-ranked matches of one user to a per-user PDF:
# for every match, one comparison page and one embedding-space page.
user = "Jacob Sheldon"
columns = ["Experience", "Skills", "Interests", "Objectives"]
is_user_row = similarity_scores["User1"] == user
rec_df = similarity_scores[is_user_row].sort_values(by="Ranking", ascending=True)
with PdfPages(f"recommendations/GoogleWord2Vec_recommendations_{user}.pdf") as pdf:
    for _, match in rec_df.head(5).iterrows():
        rec = match["User2"]
        score = match["Cosine Similarity"]

        # side-by-side comparison of the two users' features
        fig = viz.compare_users(user, rec, columns, score)
        pdf.savefig(fig)

        # embedding-space view; every label except the pair's is blanked out
        pair_labels = [name if name in (user, rec) else "" for name in users]
        fig = viz.plot_embedding(
            ave_user_embedding,
            pair_labels,
            highlight_labels=True,
            figsize=(12, 9),
            dim="3d",
        )
        pdf.savefig(fig)

# Correlation between similarity metrics

In [None]:
# Add the Jaccard similarity of every (User1, User2) pair as a new column.
# The column is built in one pass instead of being initialised with integer
# zeros and overwritten row by row through .loc: that pattern writes floats
# into an int column (deprecated silent upcast in recent pandas), touches
# several rows at once if index labels repeat, and is slow.
similarity_scores["Jaccard Similarity"] = [
    met.jaccard_similarity(first_user, second_user)
    for first_user, second_user in zip(similarity_scores["User1"], similarity_scores["User2"])
]

In [None]:
# Scatter the two similarity metrics against each other and report their
# correlation in the title.
plt.figure(figsize=(9,6))
sns.scatterplot(data=similarity_scores,x="Cosine Similarity",y="Jaccard Similarity")
similarity_metrics = similarity_scores.drop(columns=["User1","User2","Ranking"])
# Look the correlation up by column name; indexing the correlation matrix at
# [0][1] silently depends on which columns remain and in what order.
corr = np.round(
    similarity_scores["Cosine Similarity"].corr(similarity_scores["Jaccard Similarity"]),
    2,
)
plt.title(f"Correlation -- {corr}")
plt.show()

# Normalize cosine similarity scores

In [None]:
# Rescale the cosine similarities with a min-max scaler whose output range
# runs from 0 to the largest observed score; store them in a new column of a copy.
norm_similarity_scores = similarity_scores.copy()
cos_sim_scores = norm_similarity_scores["Cosine Similarity"].tolist()
score_scaler = MinMaxScaler(feature_range=(0, max(cos_sim_scores)))
# The scaler expects a 2-D input, hence the single-column reshape.
score_column = np.array(cos_sim_scores).reshape(-1, 1)
norm_similarity_scores["Normalized Cosine Similarity"] = score_scaler.fit_transform(score_column)
norm_similarity_scores

In [None]:
# Plot the normalized score against the raw cosine similarity.
norm_ax = norm_similarity_scores.plot(x="Cosine Similarity", y="Normalized Cosine Similarity")
norm_ax.set_ylabel("Normalized Cosine Similarity")
plt.show()

# Get the top 20 pairs in which each person appears at most once

In [None]:
# Greedily pick up to 20 of the highest-scoring pairs such that no user
# appears in more than one chosen pair.
pairs = []
# One multi-key sort: highest normalized score first, ties broken by the
# better (lower) ranking. Two chained sort_values calls do not guarantee this,
# because the second sort is not stable by default.
sorted_similarity_scores = norm_similarity_scores.sort_values(
    ["Normalized Cosine Similarity", "Ranking"],
    ascending=[False, True],
)
user_pairs = list(
    zip(
        sorted_similarity_scores["User1"],
        sorted_similarity_scores["User2"],
        sorted_similarity_scores["Normalized Cosine Similarity"],
    )
)
# Users already placed in a pair; set membership replaces a rescan of `pairs`.
paired_users = set()
for first_user, second_user, pair_score in user_pairs:
    if len(pairs) == 20:
        break
    if first_user in paired_users or second_user in paired_users:
        continue
    pairs.append((first_user, second_user, pair_score))
    paired_users.update((first_user, second_user))
pairs

In [None]:
# Write two PDF pages per selected pair: a feature comparison and the pair's
# location in the embedding space.
columns = ["Experience", "Skills", "Interests", "Objectives", "Stages"]
with PdfPages("recommendations/GoogleWord2Vec_recommendations.pdf") as pdf:
    for user, rec, score in pairs:
        # feature comparison page
        fig = viz.compare_users(user, rec, columns, score)
        pdf.savefig(fig)

        # embedding-space page; every label except the pair's is blanked out
        pair_labels = [name if name in (user, rec) else "" for name in users]
        fig = viz.plot_embedding(
            ave_user_embedding,
            pair_labels,
            highlight_labels=True,
            xlims=(-0.25, 0.5),
            ylims=(-0.5, 0.5),
            figsize=(12, 9),
            dim="3d",
        )
        pdf.savefig(fig)

In [None]:
# Sanity check: every user should appear in at most one chosen pair.
# The names are flattened with a comprehension because np.hstack raises a
# ValueError on an empty sequence, i.e. when no pairs were selected.
all_user_pairs = np.array([name for pair in pairs for name in pair[:2]])
print("All unique users in chosen pairs:",len(all_user_pairs)==len(set(all_user_pairs)))