In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import sparse
import sys
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity
from category_encoders import OneHotEncoder

In [2]:
# load the full (cleaned) cupid data, leaving out the 'status' and 'location' columns
cupid = pd.read_pickle('data/clean_cupid.pkl').drop(columns = ['status', 'location'])

In [3]:
# preview the first three rows of the full data
cupid.head(3)

Unnamed: 0,age,sex,orientation,body_type,diet,drinks,drugs,offspring,pets,religion,smokes
0,22,m,straight,a little extra,strictly anything,socially,never,"doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,sometimes
1,35,m,straight,average,mostly other,often,sometimes,"doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,no
2,38,m,straight,thin,anything,socially,never,doesn't have kids,has cats,atheism,no


In [54]:
# read in grouped cupid data -- this is the frame the recommender filters its candidates from
# (alternative religion-specific file, currently not loaded)
#cupid_religion = pd.read_pickle('data/cupid_religion.pkl')

cupid_df = pd.read_pickle('data/grouped_cupid.pkl')

In [46]:
# preview the first three rows of the grouped data
cupid_df.head(3)

Unnamed: 0,age,sex,orientation,body_type,diet,drinks,drugs,offspring,pets,religion,smokes
0,22,m,straight,average,anything,sometimes,no,"doesn't have kid(s), but wants kid(s)",likes dogs and cats,agnosticism,sometimes
1,35,m,straight,average,other,yes,sometimes,"doesn't have kid(s), but wants kid(s)",likes dogs and cats,doesn't matter,no
2,38,m,straight,thin,anything,sometimes,no,doesn't have kids,likes cats,doesn't matter,no


In [43]:
#cupid_religion.head(3)

---

#### Recommender Modeling

In [47]:
# function to one-hot encode the candidates together with the user's answers and return the candidates with the highest cosine similarity to the user

def invalue_to_similarity(invalue_df, orientation_df, top_n = 5, profiles = None):
    """
    Rank candidate profiles by cosine similarity to the user's answers.

    invalue_df: single-row DataFrame of converted user inputs
    orientation_df: DataFrame of candidate profiles; its index labels are used
        to look the best matches up in `profiles`
    top_n: number of best matches to return (default 5)
    profiles: DataFrame the returned rows are taken from; defaults to the
        notebook-level `cupid` frame

    returns: DataFrame indexed by 'user_id' holding a 'similarity' column
        followed by the matching rows of `profiles`, best match first.
        When `orientation_df` has no rows, an empty DataFrame with the same
        columns is returned.
    """
    if profiles is None:
        profiles = cupid

    # nothing to compare against -- cosine_similarity raises on an empty matrix
    if orientation_df.empty:
        no_matches = pd.DataFrame(columns = ['similarity', *profiles.columns])
        no_matches.index.name = 'user_id'
        return no_matches

    # put the user's row under the candidates so both are encoded with the same columns
    df = pd.concat([orientation_df, invalue_df])

    # ohe
    df_encoded = OneHotEncoder(use_cat_names = True).fit_transform(df)

    # split by position, not by label: the user's row carries label 0, so a
    # label-based drop would also remove a candidate whose index is 0
    cosine_input = df_encoded.iloc[[-1]]
    candidates = df_encoded.iloc[:-1]

    # cosine similarity of the user against every candidate (one row of scores)
    similarity = cosine_similarity(cosine_input, candidates)[0]

    # keep the top_n highest scores, labelled with the candidates' index
    top = pd.DataFrame({'similarity' : similarity}, index = candidates.index)
    top = top.sort_values(by = 'similarity', ascending = False).iloc[:top_n]

    # fetch the winners' profile rows in one lookup (same order as `top`)
    # and put their similarity score in the first column
    matches = profiles.loc[top.index].copy()
    matches.insert(0, 'similarity', top['similarity'].to_numpy())
    matches.index.name = 'user_id'

    return matches

In [48]:
def lover_recommender_test6(invalue, religion, lowest_age, highest_age):
    """
    Recommend the profiles most similar to a user's survey answers.

    invalue (list): survey/streamlit app responses, in the order of cupid_df's columns
    religion (str): only candidates whose 'religion' equals this value are considered
    lowest_age, highest_age: inclusive bounds on the candidates' 'age'

    returns: the DataFrame produced by invalue_to_similarity, or None (with a
        warning) when the orientation/sex combination is not one of the
        supported ones
    """
    import warnings

    # convert input to a one-row DataFrame labelled with cupid_df's column names
    invalue_df = pd.DataFrame(invalue).T.rename(columns = dict(enumerate(cupid_df.columns)))

    orientation = invalue_df['orientation'].unique()[0]
    sex = invalue_df['sex'].unique()[0]

    # (user's orientation, user's sex) -> sex of the candidates to search;
    # None means candidates of either sex
    candidate_sex = {
        ('straight', 'f') : 'm',
        ('straight', 'm') : 'f',
        ('gay', 'm') : 'm',
        ('gay', 'f') : 'f',
        ('bisexual', 'f') : None,
        ('bisexual', 'm') : None,
    }

    if (orientation, sex) not in candidate_sex:
        # previously this fell through every branch and returned None silently
        warnings.warn(f'Unsupported orientation/sex combination: {orientation!r}/{sex!r}; no recommendations made.')
        return None

    # candidates share the user's orientation and satisfy the religion and age filters
    wanted = (cupid_df['orientation'] == orientation) & (cupid_df['religion'] == religion) & \
        (cupid_df['age'] >= lowest_age) & (cupid_df['age'] <= highest_age)

    target_sex = candidate_sex[(orientation, sex)]
    if target_sex is not None:
        wanted = wanted & (cupid_df['sex'] == target_sex)

    candidates = cupid_df[wanted]

    # only the straight pools are capped at their first 3000 rows; the gay and
    # bisexual pools are used in full
    if orientation == 'straight':
        candidates = candidates.head(3000)

    # call 'invalue_to_similarity' function to return similarities
    return invalue_to_similarity(invalue_df, candidates)

In [239]:
# debug view of the one-row input frame, built the same way the recommender builds it
# NOTE: relies on `invalue`, which is assigned in the survey-input cell further down
invalue_df = pd.DataFrame(invalue).T.rename(columns = dict(zip(range(11), cupid_df.columns)))
invalue_df

Unnamed: 0,age,sex,orientation,body_type,diet,drinks,drugs,offspring,pets,religion,smokes
0,19,f,gay,thin,vegan,yes,yes,doesn't want kids,likes cats,doesn't matter,no


In [241]:
# sanity check: the scalar 'sex' value the recommender branches on
invalue_df['sex'].unique()[0]

'f'

In [51]:
# input / up + down arrow
age = int(input('How old are you?'))
print("What's your age range?")
lowest_age = int(input('Lowest age?'))
highest_age = int(input('Highest age?'))

# dropdowns
sex = str(input('What gender do you identify as?'))
orientation = str(input('What sexual orientation do you identify as?'))
body_type = str(input("What's your body type?"))
diet = str(input('What does your diet consist of?'))
drinks = str(input('Do you consume alcoholic beverages?'))
drugs = str(input('Do you use drugs?'))
offspring = str(input('Do you have children and/or plan on having [more] children?'))
pets = str(input("What's your sentiment on dogs and/or cats"))
smokes = str(input("Do you smoke?"))

religion_matter = str(input('Does religion matter?'))
if religion_matter == 'yes':
    religion = str(input("What's your religion?"))
else:
    religion = "doesn't matter"

invalue = np.array([age, sex, orientation, body_type, diet, drinks, drugs, offspring, pets, religion, smokes])

How old are you? 36


What's your age range?


Lowest age? 30
Highest age? 45
What gender do you identify as? f
What sexual orientation do you identify as? straight
What's your body type? thin
What does your diet consist of? vegan
Do you consume alcoholic beverages? yes
Do you use drugs? yes
Do you have children and/or plan on having [more] children? doesn't want kids
What's your sentiment on dogs and/or cats likes dogs
Do you smoke? no
Does religion matter? yes
What's your religion? buddhism


In [38]:
# cupid_df = cupid_df[(cupid_df['age'] >= lowest_age) & (cupid_df['age'] <= highest_age)]
# cupid_df = cupid_df[cupid_df['religion'] == religion]
# lover_recommender_test6(invalue)

In [21]:
# if religion_matter == 'yes':
#     lover_recommender_test6(invalue, cupid_religion)
# else:
#     lover_recommender_test6(invalue, cupid_df)

  elif pd.api.types.is_categorical(cols):


In [52]:
# run the recommender on the survey answers collected above
lover_recommender_test6(invalue, religion, lowest_age, highest_age)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0_level_0,similarity,age,sex,orientation,body_type,diet,drinks,drugs,offspring,pets,religion,smokes
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
11493,0.454545,32,m,straight,average,anything,socially,never,doesn't want kids,likes dogs,buddhism,no
51484,0.454545,42,m,straight,rather not say,anything,rarely,never,doesn't want kids,likes dogs,buddhism and somewhat serious about it,no
33159,0.454545,33,m,straight,fit,anything,socially,never,doesn't want kids,likes dogs,buddhism,no
17746,0.454545,31,m,straight,average,strictly vegan,socially,never,doesn't have kids,likes dogs,buddhism and somewhat serious about it,no
35422,0.454545,34,m,straight,fit,anything,often,never,doesn't have kids,likes dogs,buddhism,no


In [55]:
# how many profiles fall into each grouped 'offspring' category
cupid_df['offspring'].value_counts()

doesn't have kids                          41250
doesn't have kid(s), but wants kid(s)       7303
has kid(s)                                  3627
doesn't want kids                           2686
doesn't have kids, and doesn't want any     1080
has kid(s), but doesn't want more            702
has kid(s) and wants more                    425
wants kid(s)                                 400
Name: offspring, dtype: int64