In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from detoxify import Detoxify

In [None]:
# Sentence-embedding model; its encode() output is what bios are compared on
model = SentenceTransformer("all-MiniLM-L6-v2")
# Detoxify classifier; its predict() scores are used to screen bios
toxic_model = Detoxify(model_type="original")

In [None]:
# Load the raw profile table; every later cell works from this frame
df = pd.read_csv(filepath_or_buffer="data/okcupid_profiles.csv")

In [None]:
# Create a biography from all essays except essay9

# Missing essay values (NaN) are replaced with empty strings first: str.join only
# accepts strings, so a single NaN in a row would otherwise raise a TypeError.
df['bio'] = df[['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8']].fillna('').apply(lambda x: ' '.join(x), axis=1)
# Drop every column whose name matches "essay" (this includes essay9) now that 'bio' exists
df = df[df.columns.drop(list(df.filter(regex="essay")))]

In [None]:
# Identify problematic users and filter them out of the data

def apply_detoxify(bios):
    """Run the Detoxify model on ``bios`` and return its prediction unchanged.

    The result is whatever ``toxic_model.predict`` returns; callers in this
    notebook read the "severe_toxicity" and "threat" entries from it.
    """
    scores = toxic_model.predict(bios)
    return scores

# Score each bio on its own; every prediction becomes one row, aligned with df's index
toxic_scores = pd.DataFrame(list(map(apply_detoxify, df['bio'])), index=df.index)

# Remove users whose bio exceeds either toxicity threshold
df.drop(
    toxic_scores.loc[
        lambda scores: scores["severe_toxicity"].gt(0.1) | scores["threat"].gt(0.01)
    ].index,
    inplace=True,
)

In [None]:
# Encode every remaining bio with the sentence model and convert the result to plain lists
bios = df['bio'].tolist()
embedding_list = model.encode(bios).tolist()

# Keep the embeddings keyed by df's index and save them to csv
embedding_series = pd.Series(data=embedding_list, name="embedding", index=df.index)
embedding_series.to_csv('embedding_series.csv')

In [None]:
# Create preferred gender column using sexual orientation and gender identity for matching.
# Conditions are tried in order; the first one that holds decides the value,
# and rows matching none of them get 'no one'.

df['pref_gen'] = np.select(
    condlist=[
        (df['sex'] == 'm') & (df['orientation'] == 'gay'),
        (df['sex'] == 'm') & (df['orientation'] == 'straight'),
        (df['sex'] == 'f') & (df['orientation'] == 'straight'),
        (df['sex'] == 'f') & (df['orientation'] == 'gay'),
        (df['orientation'] == 'bisexual'),
    ],
    choicelist=['m', 'f', 'm', 'f', 'all'],
    default='no one',
)

In [None]:
def compute_cosine_similarity(target_vector, vectors):
    """Return the cosine similarity between ``target_vector`` and each entry of ``vectors``.

    Similarity is computed as ``1 - cosine(target_vector, v)`` using scipy's
    cosine distance. The result is a list in the same order as ``vectors``
    (empty when ``vectors`` is empty).
    """
    return [1 - cosine(target_vector, candidate) for candidate in vectors]

In [None]:
## Function to generate and handle matches among pre-existing users

def rank_matches(input_row, pref_age_lower=False, pref_age_higher=False, min_similarity_score = 0.65):
    """Rank existing users by bio similarity to the user at index label ``input_row``.

    Parameters
    ----------
    input_row : index label
        Label of the user in ``df`` (and in ``embedding_series``).
    pref_age_lower : number or False
        How many years younger than the user a match may be. A falsy value
        disables the lower age bound.
    pref_age_higher : number or False
        How many years older than the user a match may be. A falsy value
        disables the upper age bound.
    min_similarity_score : float
        Candidates scoring below this cosine similarity are discarded.

    Returns
    -------
    list of (index_label, score) tuples
        Sorted by score, highest first. The user is never listed as their
        own match.
    """
    # Read the user's own attributes once from the full frame, so the lookups
    # cannot fail if an age filter below happens to remove the user's row.
    user_age = df.loc[input_row, 'age']
    user_pref_gen = df.loc[input_row, 'pref_gen']

    # Boolean indexing already returns new frames, so no defensive copy is needed.
    df_possible = df
    if pref_age_higher:
        df_possible = df_possible[df_possible['age'] <= user_age + pref_age_higher]
    if pref_age_lower:
        df_possible = df_possible[df_possible['age'] >= user_age - pref_age_lower]
    if user_pref_gen != 'all':
        df_possible = df_possible[df_possible['sex'] == user_pref_gen]

    # Exclude the user by label. Testing `score != 1` is unreliable: rounding can
    # leave the self-similarity slightly off 1.0, and a different user whose
    # embedding scores exactly 1.0 would be dropped by mistake.
    candidate_index = df_possible.index[df_possible.index != input_row]

    user_embeddings = embedding_series[input_row]
    other_embeddings = [embedding_series[i] for i in candidate_index]
    cosine_similarities = compute_cosine_similarity(user_embeddings, other_embeddings)

    similarity_scores = [
        (label, score)
        for label, score in zip(candidate_index, cosine_similarities)
        if score >= min_similarity_score
    ]
    ranked_similarity = sorted(similarity_scores, key = lambda x: x[1], reverse = True)

    return ranked_similarity


In [None]:
# Rank matches for every user, allowing candidates up to 10 years younger or older
df['matches'] = [
    rank_matches(label, pref_age_lower=10, pref_age_higher=10)
    for label in df.index
]
df['matches'].to_csv('okcupid_matches.csv')

In [None]:
## Function to handle user input to find match among pre-existing users

def rank_new_input(input_str, pref_gender=False, pref_age_lower=False, pref_age_higher=False, min_similarity_score = 0.5):
    """Rank existing users by similarity to a new user's bio text.

    Parameters
    ----------
    input_str : str
        The new user's bio. It is scored for toxicity before any matching.
    pref_gender : str or False
        Keep only candidates whose 'sex' equals this value. Falsy disables
        the filter.
    pref_age_lower : number or False
        Minimum candidate age (absolute). Falsy disables the filter.
    pref_age_higher : number or False
        Maximum candidate age (absolute). Falsy disables the filter.
    min_similarity_score : float
        Candidates scoring below this cosine similarity are discarded.

    Returns
    -------
    list of (index_label, score) tuples sorted by score, highest first,
    or a message string when the bio fails the toxicity check.
    """
    toxicity_rubric = toxic_model.predict(input_str)

    # A bio passes only when BOTH scores are within their limits. This mirrors the
    # dataset filter, which removes a profile if EITHER score is over its limit;
    # using `or` here would let a bio through on just one acceptable score.
    passes_toxicity = toxicity_rubric['severe_toxicity'] <= .1 and toxicity_rubric['threat'] <= .01
    if not passes_toxicity:
        return "Your matches cannot be shown due to harmful material in your bio. Please modify and try again."

    # Perform filtering on users preference (boolean indexing returns new frames,
    # so df itself is never modified)
    df_possible = df
    if pref_gender:
        df_possible = df_possible.loc[df_possible.loc[:, 'sex'] == pref_gender, :]
    if pref_age_higher:
        df_possible = df_possible[df_possible.loc[:, "age"] <= pref_age_higher]
    if pref_age_lower:
        df_possible = df_possible[df_possible.loc[:, "age"] >= pref_age_lower]

    user_embeddings = model.encode(input_str)
    other_embeddings = [embedding_series[i] for i in df_possible.index]

    # Compute and order similarities; scores of exactly 1 are skipped, as before
    cosine_similarities = compute_cosine_similarity(user_embeddings, other_embeddings)
    similarity_scores = [
        (label, score)
        for label, score in zip(df_possible.index, cosine_similarities)
        if score >= min_similarity_score and score != 1
    ]
    ranked_similarity = sorted(similarity_scores, key = lambda x: x[1], reverse = True)
    return ranked_similarity