## Simple Recommender with Manipulated Values -- Orientation Only

- Recommender Systems
    - Cosine similarity -- X + Y
- Modeling
    - OHE, cosine similarity, sort similarities within in function
    - **Simple model - Orientation only**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import sparse
import sys
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity
from category_encoders import OneHotEncoder

In [2]:
# read in full cupid data
cupid = pd.read_pickle('data/clean_cupid.pkl')
cupid.drop(columns = ['status', 'location'], inplace = True)

In [3]:
cupid.head(3)

Unnamed: 0,age,sex,orientation,body_type,diet,drinks,drugs,offspring,pets,religion,smokes
0,22,m,straight,a little extra,strictly anything,socially,never,"doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,sometimes
1,35,m,straight,average,mostly other,often,sometimes,"doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,no
2,38,m,straight,thin,anything,socially,never,doesn't have kids,has cats,atheism,no


In [3]:
cupid_df = pd.read_pickle('data/grouped_cupid.pkl')

In [46]:
cupid_df.head(3)

Unnamed: 0,age,sex,orientation,body_type,diet,drinks,drugs,offspring,pets,religion,smokes
0,22,m,straight,average,anything,sometimes,no,"doesn't have kid(s), but wants kid(s)",likes dogs and cats,agnosticism,sometimes
1,35,m,straight,average,other,yes,sometimes,"doesn't have kid(s), but wants kid(s)",likes dogs and cats,doesn't matter,no
2,38,m,straight,thin,anything,sometimes,no,doesn't have kids,likes cats,doesn't matter,no


---

#### Subset "Orientation" Datasets & OneHotEncode

In [270]:
# straight male
straight_male = cupid_df[(cupid_df['sex'] == 'm') & (cupid_df['orientation'] == 'straight')].head(3000)

# ohe
straight_male_encoded = OneHotEncoder(use_cat_names = True).fit_transform(straight_male)

# ---------------

# straight female
straight_female = cupid_df[(cupid_df['sex'] == 'f') & (cupid_df['orientation'] == 'straight')].head(3000)

# ohe
straight_female_encoded = OneHotEncoder(use_cat_names = True).fit_transform(straight_female)

# ---------------

# gay male
gay_male = cupid_df[(cupid_df['sex'] == 'm') & (cupid_df['orientation'] == 'gay')]

# ohe
gay_male_encoded = OneHotEncoder(use_cat_names = True).fit_transform(gay_male)

# ---------------

# gay female
gay_female = cupid_df[(cupid_df['sex'] == 'f') & (cupid_df['orientation'] == 'gay')]

# ohe
gay_female_encoded = OneHotEncoder(use_cat_names = True).fit_transform(gay_female)

# ---------------

# bi m/f
bi = cupid_df[cupid_df['orientation'] == 'bisexual']

# ohe
bi_encoded = OneHotEncoder(use_cat_names = True).fit_transform(bi)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


----

#### Recommender System

In [285]:
# THIS IS THE WAY JACOB MENTIONED

# .loc[index]

# straight female loking for straight male
similarity = cosine_similarity(pd.DataFrame(straight_female_encoded.loc[6]).T, straight_male_encoded).tolist()[0]
pd.DataFrame(similarity, columns = ['similarity'], index = straight_male_encoded.index).sort_values(by = 'similarity', ascending = False).iloc[:3]

Unnamed: 0,similarity
5030,0.999058
0,0.999021
634,0.998997


In [309]:
# test 4- OHE seperately

def lover_recommender_test4(sex, orientation, index):
    """
    index (int): user_id we're looking to partner with matching orientation
    sex (str): m, f
    orientation: straight, gay, bi/bisexual
    """
   
    # straight female looking for straight male
    if orientation == 'straight' and sex == 'f':
        similarity = cosine_similarity(pd.DataFrame(straight_female_encoded.loc[index]).T, straight_male_encoded).tolist()[0]
        return pd.DataFrame(similarity, columns = ['similarity'], index = straight_male_encoded.index).sort_values(by = 'similarity', ascending = False).iloc[:3]
    
    # straight male looking for straight female
    elif orientation == 'straight' and sex == 'm':
        # cosine_similarity
        similarity = cosine_similarity(pd.DataFrame(straight_male_encoded.loc[index]).T, straight_female_encoded).tolist()[0]
        return pd.DataFrame(similarity, columns = ['similarity'], index = straight_female_encoded.index).sort_values(by = 'similarity', ascending = False).iloc[:3]
    
    # gay male looking for gay male
    elif orientation == 'gay' and sex == 'm':
        # create sparse matrix
        gay_male_sparse = sparse.csr_matrix(gay_male_encoded)

        # cosine_similarity
        similarities_gay_male = cosine_similarity(gay_male_sparse)
        return gay_male_df[index].sort_values(ascending = False).iloc[1:4]
    
    # gay female looking for gay female
    elif orientation == 'gay' and sex == 'f':
        # create sparse matrix
        gay_female_sparse = sparse.csr_matrix(gay_female_encoded)

        # cosine_similarity
        similarities_gay_female = cosine_similarity(gay_female_sparse)
        return gay_female_df[index].sort_values(ascending = False).iloc[1:4]
    
    # bisexual male/female looking for bisexual male/female
    elif ('bi' in orientation and sex == 'f') or ('bi' in orientation and sex == 'm'):
        # create sparse matrix
        bi_sparse = sparse.csr_matrix(bi_encoded)

        # cosine_similarity
        similarities_bi = cosine_similarity(bi_sparse)
        return bi_df[index].sort_values(ascending = False).iloc[1:4]

In [369]:
lover_recommender_test4('m', 'straight', 2)
# lover_recommender_test4('f', 'straight', 6)
# lover_recommender_test4('m', 'gay', 55)

#lover_recommender_test4('f', 'bi', 37)

Unnamed: 0,similarity
3179,0.998985
4724,0.998759
2799,0.998747


In [370]:
cupid_df.loc[2]

age                           38
sex                            m
orientation             straight
body_type                   thin
diet                    anything
drinks                 sometimes
drugs                      never
offspring      doesn't have kids
pets                  likes cats
religion          doesn't matter
smokes                        no
Name: 2, dtype: object

In [371]:
cupid_df.loc[3179]

age                                31
sex                                 f
orientation                  straight
body_type              rather not say
diet                         anything
drinks                      sometimes
drugs                           never
offspring           doesn't have kids
pets           dislikes dogs and cats
religion                      matters
smokes                      sometimes
Name: 3179, dtype: object

In [314]:
cupid.loc[37]

age                                                25
sex                                                 m
orientation                                  bisexual
body_type                                         fit
diet                                  mostly anything
drinks                                       socially
drugs                                           never
offspring      doesn't have kids, but might want them
pets                        likes dogs and likes cats
religion                                      atheism
smokes                                             no
Name: 37, dtype: object

In [322]:
cupid.loc[11527]

age                                                27
sex                                                 m
orientation                                  bisexual
body_type                              rather not say
diet                                         anything
drinks                                       socially
drugs                                           never
offspring      doesn't have kids, but might want them
pets                     likes dogs and dislikes cats
religion                                      atheism
smokes                                             no
Name: 11527, dtype: object

---

#### If there were random inputs (like the app)

In [11]:
# function to ohe, create sparse matrices, and return the cosine similarity based on orientation

def invalue_to_similarity(invalue_df, orientation_df):
    """
    invalue_df: converted DataFrame of user inputs
    orientation_df: DataFrame of all people of that orientation
    """
    
    # concat input values to orientation df to prep for cosine similarity
    df = pd.concat([orientation_df, invalue_df])

    # ohe
    df_encoded = OneHotEncoder(use_cat_names = True).fit_transform(df)
    
    # make cosine_similarity input (input X)
    cosine_input = pd.DataFrame(df_encoded.iloc[-1]).T
    
    # drop last encoded row (input Y)
    df_encoded.drop(df_encoded.tail(1).index, inplace = True)
    
    # cosine_similarity
    similarity = cosine_similarity(cosine_input, df_encoded)
    
    # return top 5 matches
    top5 = pd.DataFrame(similarity.tolist()[0], columns = ['similarity'], index = df_encoded.index).sort_values(by = 'similarity', ascending = False).iloc[:5]
    
    # return top 5 matches in a df with cosine similarities
    results = pd.DataFrame(columns = cupid.columns)

    for i in top5.index:
        results = results.append(pd.DataFrame(cupid.loc[i]).T)

    matches = pd.merge(top5, results, on = top5.index)
    matches.rename(columns = {'key_0' : 'user_id'}, inplace = True)
    matches.set_index('user_id', drop = True, inplace = True)
    
    return matches

In [9]:
# test 5 -- using a new user input
# referenced https://stackoverflow.com/questions/44296648/using-lists-in-pandas-to-replace-column-names

def lover_recommender_test5(invalue):
    """
    invalue (list): survey/streamlit app responses
    """
   
    # convert input to DataFrame
    invalue_df = pd.DataFrame(invalue).T.rename(columns = {i:j for i,j in zip(np.arange(11), cupid_df.columns)})

    # ----------------
    
    # straight female looking for straight female
    if invalue_df['orientation'].unique()[0] == 'straight' and invalue_df['sex'].unique()[0] == 'f':
        
        # straight male
        straight_male = cupid_df[(cupid_df['sex'] == 'm') & (cupid_df['orientation'] == 'straight')].head(3000)
        
        # call 'invalue_to_similarity' function to return similarities
        return invalue_to_similarity(invalue_df, straight_male)
    
    # straight male looking for straight male
    elif invalue_df['orientation'].unique()[0] == 'straight' and invalue_df['sex'].unique()[0] == 'm':
        
        # straight female
        straight_female = cupid_df[(cupid_df['sex'] == 'f') & (cupid_df['orientation'] == 'straight')].head(3000)

        # call 'invalue_to_similarity' function to return similarities
        return invalue_to_similarity(invalue_df, straight_female)
    
    # gay male looking for gay male
    elif invalue_df['orientation'].unique()[0] == 'gay' and invalue_df['sex'].unique()[0] == 'm':
        
        # gay male
        gay_male = cupid_df[(cupid_df['sex'] == 'm') & (cupid_df['orientation'] == 'gay')]
        
        # call 'invalue_to_similarity' function to return similarities
        return invalue_to_similarity(invalue_df, gay_male)
    
    # gay female looking for gay female
    elif invalue_df['orientation'].unique()[0] == 'gay' and invalue_df['sex'].unique()[0] == 'f':
        
        # gay female
        gay_female = cupid_df[(cupid_df['sex'] == 'f') & (cupid_df['orientation'] == 'gay')]
        
        # call 'invalue_to_similarity' function to return similarities
        return invalue_to_similarity(invalue_df, gay_female)
    
    # bisexual male/female looking for bisexual male/female
    elif (invalue_df['orientation'].unique()[0] == 'bisexual' and invalue_df['sex'].unique()[0] == 'f') or \
         (invalue_df['orientation'].unique()[0] == 'bisexual' and invalue_df['sex'].unique()[0] == 'm'):
        
        # bi individual
        bi = cupid_df[cupid_df['orientation'] == 'bisexual']
        
        # call 'invalue_to_similarity' function to return similarities
        return invalue_to_similarity(invalue_df, bi)

#### Ask questions to mimic random/app inputs

In [14]:
# input / up + down arrow
age = int(input('How old are you?'))

# dropdowns
sex = str(input('What gender do you identify as?'))
orientation = str(input('What sexual orientation do you identify as?'))
body_type = str(input("What's your body type?"))
diet = str(input('What does your diet consist of?'))
drinks = str(input('Do you consume alcoholic beverages?'))
drugs = str(input('Do you use drugs?'))
offspring = str(input('Do you have children and/or plan on having [more] children?'))
pets = str(input("What's your sentiment on dogs and/or cats"))
religion = str(input("Does religion matter to you?"))
smokes = str(input("Do you smoke?"))

invalue = np.array([age, sex, orientation, body_type, diet, drinks, drugs, offspring, pets, religion, smokes])

How old are you? 19
What gender do you identify as? f
What sexual orientation do you identify as? gay
What's your body type? rather not say
What does your diet consist of? vegan
Do you consume alcoholic beverages? no
Do you use drugs? yes
Do you have children and/or plan on having [more] children? doesn't have kids
What's your sentiment on dogs and/or cats likes dogs and cats
Does religion matter to you? atheism
Do you smoke? yes


In [19]:
# gay female
# invalue = [19, 'f', 'gay', 'rather not say', 'vegan', 'no', 'yes', "doesn't have kids", 'likes dogs and cats', 'atheism', 'yes']

lover_recommender_test5(invalue)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0_level_0,similarity,age,sex,orientation,body_type,diet,drinks,drugs,offspring,pets,religion,smokes
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
15731,0.636364,55,f,gay,rather not say,anything,not at all,never,doesn't have kids,likes dogs and likes cats,other and somewhat serious about it,yes
53666,0.636364,19,f,gay,rather not say,anything,not at all,often,doesn't have kids,has dogs,atheism,yes
57095,0.545455,33,f,gay,rather not say,mostly vegan,socially,sometimes,doesn't have kids,likes dogs and has cats,other and somewhat serious about it,sometimes
32158,0.545455,34,f,gay,rather not say,anything,not at all,never,doesn't have kids,has dogs and has cats,atheism,no
18040,0.545455,32,f,gay,average,other,not at all,never,doesn't have kids,likes dogs and likes cats,atheism and very serious about it,yes


#### Just trying additional random inputs

In [22]:
# straight female

invalue2 = [30, 'f', 'straight', 'rather not say', 'anything', 'yes', 'no', "doesn't have kids", 'likes dogs', 'atheism', 'no']

lover_recommender_test5(invalue2)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0_level_0,similarity,age,sex,orientation,body_type,diet,drinks,drugs,offspring,pets,religion,smokes
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1897,0.727273,30,m,straight,rather not say,anything,socially,never,doesn't have kids,likes dogs,judaism but not too serious about it,no
2787,0.727273,30,m,straight,rather not say,anything,socially,never,doesn't have kids,likes dogs,christianity,no
1880,0.727273,30,m,straight,rather not say,anything,socially,never,doesn't have kids,has dogs,atheism,no
4470,0.727273,30,m,straight,rather not say,mostly anything,often,never,doesn't have kids,dislikes dogs and dislikes cats,catholicism but not too serious about it,no
3953,0.636364,30,m,straight,fit,anything,socially,never,doesn't have kids,has dogs,atheism,no


In [23]:
# straight male

invalue3 = [30, 'm', 'straight', 'thin', 'vegetarian', 'no', 'sometimes', "doesn't have kids", 'likes cats', 'catholicism', 'sometimes']

lover_recommender_test5(invalue3)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0_level_0,similarity,age,sex,orientation,body_type,diet,drinks,drugs,offspring,pets,religion,smokes
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3176,0.545455,27,f,straight,thin,mostly vegetarian,socially,sometimes,doesn't have kids,likes dogs and likes cats,other and laughing about it,sometimes
6829,0.545455,31,f,straight,skinny,anything,not at all,sometimes,doesn't have kids,likes cats,agnosticism,no
2280,0.545455,23,f,straight,thin,anything,socially,sometimes,doesn't have kids,likes dogs and likes cats,catholicism,sometimes
3491,0.545455,43,f,straight,thin,strictly vegetarian,not at all,never,doesn't have kids,has cats,other and somewhat serious about it,no
3593,0.545455,30,f,straight,thin,mostly vegetarian,socially,never,doesn't have kids,likes dogs and likes cats,atheism,when drinking


In [33]:
# gay male

invalue3 = [22, 'm', 'gay', 'full figured', 'vegetarian', 'yes', 'yes', "has kids, and wants more", 'likes cats', 'agnosticism', 'yes']

lover_recommender_test5(invalue3)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0_level_0,similarity,age,sex,orientation,body_type,diet,drinks,drugs,offspring,pets,religion,smokes
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
48119,0.636364,22,m,gay,rather not say,vegetarian,often,never,doesn't have kids,likes cats,agnosticism,no
52712,0.545455,22,m,gay,average,vegetarian,often,sometimes,doesn't have kids,has cats,other,no
4635,0.545455,35,m,gay,used up,strictly vegetarian,very often,often,doesn't have kids,dislikes dogs and dislikes cats,atheism,yes
39785,0.545455,22,m,gay,skinny,mostly anything,socially,sometimes,doesn't want kids,likes cats,agnosticism,yes
28870,0.454545,22,m,gay,skinny,mostly anything,often,never,doesn't have kids,dislikes dogs and dislikes cats,atheism but not too serious about it,yes


In [34]:
# bi male

invalue4 = [42, 'm', 'bisexual', 'average', 'vegan', 'no', 'yes', "has kids", 'dislikes dogs and cats', 'christianity', 'no']

lover_recommender_test5(invalue4)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0_level_0,similarity,age,sex,orientation,body_type,diet,drinks,drugs,offspring,pets,religion,smokes
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
26185,0.636364,22,m,bisexual,a little extra,mostly vegan,not at all,never,doesn't have kids,dislikes dogs and dislikes cats,agnosticism and laughing about it,no
41153,0.636364,42,m,bisexual,average,anything,not at all,never,doesn't have kids,dislikes dogs and dislikes cats,atheism,no
5469,0.545455,38,m,bisexual,average,anything,not at all,never,doesn't have kids,dislikes dogs and dislikes cats,buddhism,no
48688,0.545455,22,m,bisexual,average,strictly vegan,often,never,doesn't have kids,dislikes dogs and dislikes cats,atheism and laughing about it,no
29542,0.545455,19,m,bisexual,average,mostly anything,not at all,never,doesn't have kids,has dogs and dislikes cats,christianity,no


In [37]:
# bi female

invalue5 = [27, 'f', 'bisexual', 'fit', 'anything', 'yes', 'yes', "wants kids", 'likes dogs', "atheism", 'yes']

lover_recommender_test5(invalue5)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0_level_0,similarity,age,sex,orientation,body_type,diet,drinks,drugs,offspring,pets,religion,smokes
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
15285,0.636364,27,f,bisexual,athletic,mostly vegan,often,often,doesn't want kids,likes dogs and likes cats,atheism,yes
33549,0.636364,27,f,bisexual,curvy,anything,very often,sometimes,doesn't have kids,likes dogs and dislikes cats,agnosticism and laughing about it,yes
37521,0.636364,27,f,bisexual,athletic,mostly anything,often,sometimes,"doesn't have kids, but might want them",has dogs,atheism,sometimes
5151,0.636364,27,f,bisexual,fit,anything,often,never,doesn't want kids,likes dogs and dislikes cats,other and laughing about it,no
2362,0.636364,27,f,bisexual,curvy,strictly anything,very often,sometimes,doesn't have kids,likes dogs,agnosticism and laughing about it,yes
