In [84]:
import os
import psycopg2
import psycopg2.extras
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer



# https://stackoverflow.com/questions/43515877/should-binary-features-be-one-hot-encoded


# Database connection setup
def get_db_connection():
    connection = psycopg2.connect(
        database="postgres",
        user="postgres",
        password="HeliosChenThomasBuckley",
        host="db.fxxqwotagugztamftphi.supabase.co",
        port="5432",
    )
    
    print('connected')
    return connection



In [85]:
def fetch_profiles():
    conn = get_db_connection()
    with conn.cursor(cursor_factory = psycopg2.extras.RealDictCursor) as cur:
        cur.execute('SELECT * FROM "Profile" ORDER BY id LIMIT 100;')
        profiles = cur.fetchall()
    conn.close()
    return profiles



In [86]:
def custom_similarity(features, likes_length, dislikes_length):
    n = features.shape[0]
    similarity_matrix = np.zeros((n, n))

    for i in range(n):
        for j in range(n):
            if i != j:
                # Indices for likes and dislikes
                likes_i = features[i, :likes_length]
                dislikes_i = features[i, likes_length:likes_length + dislikes_length]
                likes_j = features[j, :likes_length]
                dislikes_j = features[j, likes_length:likes_length + dislikes_length]

                # Positive contribution (standard cosine similarity)
                positive_similarity = cosine_similarity([features[i]], [features[j]])[0, 0]

                # Negative contribution (conflicting likes and dislikes)
                negative_similarity = np.sum((likes_i * dislikes_j) + (dislikes_i * likes_j))

                # Subtracting the negative contribution from the positive
                similarity_matrix[i, j] = positive_similarity - negative_similarity

    return similarity_matrix


In [87]:
df = pd.DataFrame(fetch_profiles())

df.drop(columns=['essay0', 'essay1','essay2','essay3','essay4','essay5','essay6','essay7','essay8', 'essay9', 'bio'], inplace=True)


categorical_columns = ['body_type', 'education', 'pets', 'diet', 'offspring', 'job']

# One-hot encoding for categorical data with multiple value possibilities
encoder = OneHotEncoder(sparse=False)
encoded_data = encoder.fit_transform(df[categorical_columns])

# Binary encoding for the binary data (smokes, drinks, religion)
df['smokes'] = (df['smokes'] == 'yes').astype(int)
df['drinks'] = (df['drinks'] == 'yes').astype(int)
df['religion'] = (df['religion'] == 'religious').astype(int)

# encoding the likes and dislikes colunns
mlb = MultiLabelBinarizer()
likes_encoded = mlb.fit_transform(df['likes'])
likes_columns = mlb.classes_
dislikes_encoded = mlb.fit_transform(df['dislikes'])
dislikes_columns = mlb.classes_
#prefix these columns to distinguish between likes and dislikes
likes_columns = ['like_' + col for col in likes_columns]
dislikes_columns = ['dislike_' + col for col in dislikes_columns]

# print(likes_columns)


# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(df)


connected


In [88]:
# Combine the binary encoded and the one hot encoded values into one. 
combined_features = np.hstack((encoded_data, df[['smokes', 'drinks', 'religion']].values))
final_features = np.hstack((combined_features, likes_encoded, dislikes_encoded))


likes_length = len(likes_encoded[0])
dislikes_length = len(dislikes_encoded[0])

custom_similarity_matrix = custom_similarity(final_features, likes_length, dislikes_length)

# Convert to DataFrame for better readability
similarity_df = pd.DataFrame(custom_similarity_matrix, index=df.index, columns=df.index)


similarity_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.000000,-2.705882,-3.599837,-1.666151,-1.714169,-3.647059,-4.771335,-1.647059,-2.721793,-2.764706,...,-2.666151,-1.714169,-2.657003,-1.714169,-2.757464,-4.647059,-3.764706,-2.588235,-3.823529,-1.714169
1,-2.705882,0.000000,-0.657003,-0.721793,0.285831,0.352941,-1.714169,-0.588235,-3.833076,0.235294,...,0.278207,0.342997,0.342997,-1.771335,0.242536,-0.647059,-2.705882,-0.823529,-2.823529,-1.828501
2,-3.599837,-0.657003,0.000000,-1.621483,-1.777778,-0.599837,-3.666667,-1.714169,-1.675557,-0.714169,...,-1.675557,0.333333,0.444444,-0.666667,-1.705372,-2.771335,-1.657003,-1.771335,-3.771335,-0.777778
3,-1.666151,-0.721793,-1.621483,0.000000,-0.729631,-1.666151,-3.621483,-2.610510,-2.684211,-1.666151,...,-2.631579,-0.621483,-0.675557,0.270369,-0.713230,-1.721793,-2.721793,-0.721793,-5.721793,-1.675557
4,-1.714169,0.285831,-1.777778,-0.729631,0.000000,0.342997,-1.666667,-1.657003,-0.675557,0.400163,...,0.378517,0.444444,0.333333,0.277778,0.176777,-0.771335,-0.714169,-1.771335,-0.657003,-0.777778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-4.647059,-0.647059,-2.771335,-1.721793,-0.771335,-0.588235,-2.657003,-1.705882,-1.833076,-1.705882,...,-1.721793,-1.771335,-2.657003,-4.828501,-1.696830,0.000000,-3.705882,-2.823529,-1.705882,-0.828501
96,-3.764706,-2.705882,-1.657003,-2.721793,-0.714169,-2.647059,-2.714169,-1.705882,-2.833076,-0.705882,...,-0.666151,-0.657003,-1.657003,-2.828501,-3.818098,-3.705882,0.000000,-1.705882,-1.588235,-1.828501
97,-2.588235,-0.823529,-1.771335,-0.721793,-1.771335,-1.764706,-0.885668,-1.764706,-3.777434,-2.823529,...,-2.721793,-0.714169,-1.714169,-3.771335,-2.878732,-2.823529,-1.705882,0.000000,-2.823529,-1.828501
98,-3.823529,-2.823529,-3.771335,-5.721793,-0.657003,-2.705882,-1.657003,-1.764706,-3.777434,-0.764706,...,-1.610510,-4.771335,-2.714169,-1.885668,-1.878732,-1.705882,-1.588235,-2.823529,0.000000,-0.771335


In [89]:
def recommend_for_user(similarity_df, user_index, top_n=5):
    # Get similarity scores for the user
    user_similarity_scores = similarity_df[user_index]

    # Sort the scores and fetch top N indices
    top_users_indices = user_similarity_scores.sort_values(ascending=False).index[1:top_n+1]

    # Fetch the top recommended users/items
    return df.loc[top_users_indices]

recommendations = recommend_for_user(similarity_df,35, top_n=5)
initial = pd.DataFrame(df.loc[35])
initial.style




Unnamed: 0,35
id,29328
user_id,37
smokes,0
orientation,straight
body_type,large
diet,anything
drinks,1
education,university
height,62.0
job,Healthcare & Legal


In [90]:
recommendations = pd.DataFrame(recommendations)
recommendations.style

Unnamed: 0,id,user_id,smokes,orientation,body_type,diet,drinks,education,height,job,location,offspring,pets,religion,speaks,dislikes,likes
14,29307,16,0,straight,athletic,anything,1,university,64.0,Miscellaneous,"menlo park, california",doesn't want kids,doesn't have,0,english,"['Virtual Reality & Interactive Media', 'Brewing & Culinary Arts', 'Education & Academia', 'Environmentalism & Sustainability', 'Fishing & Boating']","['Board Games & Puzzles', 'Cooking & Baking', 'Literature & Writing', 'Gaming & Esports', 'Social Media & Networking']"
42,29335,44,0,straight,average,anything,1,university,69.0,Healthcare & Legal,"san mateo, california",wants kids,has both,1,english,"['Education & Academia', 'Textile Arts', 'Travel & Adventure', 'Astronomy & Space Exploration', 'Entrepreneurship & Business']","['Content Creation & Editing', 'Brewing & Culinary Arts', 'Engineering & Robotics', 'Comics & Animation', 'Creative Writing & Storytelling']"
23,29316,25,0,straight,athletic,anything,1,university,73.0,Miscellaneous,"menlo park, california",,doesn't have,1,english,"['Travel & Adventure', 'Pottery & Sculpture', 'Video Production & Streaming', 'Fishing & Boating', 'Cooking & Baking']","['Water Sports', 'Music & Performing Arts', 'Textile Arts', 'Digital Media & Marketing', 'Technology & Innovations']"
20,29313,22,0,straight,athletic,anything,1,university,73.0,Miscellaneous,"daly city, california",doesn't want kids,has both,1,english,"['Comics & Animation', 'Alternative Healing & Therapies', 'Fishing & Boating', 'Social Justice & Advocacy', 'Extreme Sports']","['Textile Arts', 'Engineering & Robotics', 'Technology & Innovations', 'Board Games & Puzzles', 'Volunteering & Community Service']"
18,29311,20,0,straight,athletic,anything,1,university,68.0,Miscellaneous,"berkeley, california",doesn't want kids,has both,1,english,"['Volunteering & Community Service', 'Outdoor & Nature Activities', 'Education & Academia', 'Animal Care & Welfare', 'Science & Research']","['Travel & Adventure', 'Racquet Sports', 'Entrepreneurship & Business', 'Law & Politics', 'Textile Arts']"
