## Further Data Cleaning & Grouping Values

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import sparse
import sys
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity
from category_encoders import OneHotEncoder

In [2]:
# read in data
# Loads the cleaned profile frame from a relative path under data/.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
cupid = pd.read_pickle('data/clean_cupid.pkl')

In [18]:
# preview the first three rows of the loaded frame
cupid.head(3)

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,location,offspring,pets,religion,smokes
0,22,single,m,straight,a little extra,strictly anything,socially,never,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,sometimes
1,35,single,m,straight,average,mostly other,often,sometimes,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,no
2,38,available,m,straight,thin,anything,socially,never,"san francisco, california",doesn't have kids,has cats,atheism,no


In [3]:
# Drop `location`: all entries are generally in NorCal, so the recommender
# won't return an "exact" location match anyway.
#
# Unused alternative, kept for reference (city-level feature):
# cupid['city'] = cupid['location'].str.split(",").str[0]

# Rebind instead of dropping in place, and ignore an already-missing column,
# so re-running this cell does not raise a KeyError.
cupid = cupid.drop(columns='location', errors='ignore')

In [4]:
# Drop "status", since they're all single/available.
# Rebind instead of dropping in place, and ignore an already-missing column,
# so re-running this cell does not raise a KeyError.
cupid = cupid.drop(columns='status', errors='ignore')

In [20]:
# make copy of data
# The grouping cells below modify `cupid_df`; `cupid` keeps the ungrouped values.
cupid_df = cupid.copy()

In [425]:
# preview the first three rows of the working copy
cupid_df.head(3)

Unnamed: 0,age,sex,orientation,body_type,diet,drinks,drugs,offspring,pets,religion,smokes
0,22,m,straight,a little extra,strictly anything,socially,never,"doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,sometimes
1,35,m,straight,average,mostly other,often,sometimes,"doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,no
2,38,m,straight,thin,anything,socially,never,doesn't have kids,has cats,atheism,no


In [83]:
# (rows, columns) of the working copy
cupid_df.shape

(57473, 11)

---

#### Begin grouping values

In [21]:
# Clean up `smokes`: fold "when drinking" and "trying to quit" into "sometimes".
# The mapping is applied to the `smokes` column only. A frame-wide
# `cupid_df.replace(...)` would also rewrite any other column that happened
# to contain one of these strings.
smokes_groups = {
    "when drinking": "sometimes",
    "trying to quit": "sometimes",
}

cupid_df['smokes'] = (
    cupid_df['smokes']
    .astype(object)                                          # plain values, mapped row by row
    .map(lambda answer: smokes_groups.get(answer, answer))   # unlisted answers pass through
    .astype('category')                                      # store as categorical, used labels only
)

In [70]:
# check the grouped `smokes` counts
cupid_df['smokes'].value_counts()

no           47405
sometimes     7940
yes           2128
Name: smokes, dtype: int64

In [22]:
# pet sentiment?

# "likes/has dogs and likes/has cats" -- "likes dogs and cats"
# "likes/has dogs" or anything containing "dislikes cats" -- "likes dogs"
# "likes/has cats" or anything containing "dislikes dogs" -- "likes cats"
# "dislikes dogs and dislikes cats" -- "dislikes dogs and cats"

PET_GROUP_LABELS = {
    "likes dogs and cats",
    "dislikes dogs and cats",
    "likes dogs",
    "likes cats",
}


def group_pets(value):
    """Return the grouped pet-sentiment label for one raw `pets` answer.

    Rules are checked in the order listed in the comment block above.
    Labels that are already grouped are returned unchanged, so re-running the
    cell is safe: without that guard "dislikes dogs and cats" contains
    'dislikes dogs' and would be turned into "likes cats" on a second pass.
    Non-string values and answers matching no rule are returned as-is.
    """
    if not isinstance(value, str) or value in PET_GROUP_LABELS:
        return value
    if value == 'dislikes dogs and dislikes cats':
        return "dislikes dogs and cats"
    if value in ('likes dogs and likes cats', 'likes dogs and has cats',
                 'has dogs and likes cats', 'has dogs and has cats'):
        return "likes dogs and cats"
    # Substring test: also matches the bare answer "dislikes cats", if present.
    if ('dislikes cats' in value) or value == 'has dogs':
        return "likes dogs"
    # Substring test: also matches the bare answer "dislikes dogs", if present.
    if ('dislikes dogs' in value) or value == 'has cats':
        return "likes cats"
    return value


# Apply to the `pets` column only (a frame-wide replace touches every column).
cupid_df['pets'] = (
    cupid_df['pets']
    .astype(object)       # plain values, mapped row by row
    .map(group_pets)
    .astype('category')   # store as categorical, used labels only
)

In [177]:
# check the grouped `pets` counts
cupid_df['pets'].value_counts()

likes dogs and cats       21658
dislikes dogs and cats    19538
likes dogs                13978
likes cats                 2299
Name: pets, dtype: int64

In [23]:
# does diet matter?

# mostly/strictly anything, mostly/strictly vegetarian, mostly/strictly vegan,
# mostly/strictly other, mostly/strictly kosher, mostly/strictly halal
# -- each collapses to its base diet name.

# Checked in this order; the first label found inside the answer wins.
DIET_LABELS = ('anything', 'vegetarian', 'vegan', 'kosher', 'halal', 'other')


def group_diet(value):
    """Return the base diet label for one raw `diet` answer.

    'matters' is mapped back to 'other': 'other' was changed to 'matters'
    somewhere in the earlier data manipulation. Non-string values and answers
    containing none of the labels are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    if value == 'matters':
        return 'other'
    for label in DIET_LABELS:
        if label in value:
            return label
    return value


# Assign the result back to the column. The previous
# `cupid_df['diet'].replace(..., inplace=True)` acted on a temporary Series,
# which is not reliable under pandas copy-on-write, and the frame-wide
# `cupid_df.replace(...)` touched every column rather than just `diet`.
cupid_df['diet'] = (
    cupid_df['diet']
    .astype(object)       # plain values, mapped row by row
    .map(group_diet)
    .astype('category')   # store as categorical, used labels only
)

In [64]:
# check the grouped `diet` counts
cupid_df['diet'].value_counts()

anything      50168
vegetarian     4749
other          1719
vegan           656
kosher          110
halal            71
Name: diet, dtype: int64

In [24]:
# body_type grouping

# "average" or "a little extra" -- "average"
# "fit", "athletic", "jacked" -- "fit"
# "thin" or "skinny" -- "thin"
# "curvy", "full figured", "overweight" -- "full figured"
# "used up" -- "used up"
# "rather not say" -- "rather not say" [wild card]

# Only answers that change are listed; everything else (including
# "used up" and "rather not say") passes through untouched.
body_type_groups = {
    "a little extra": "average",
    "athletic": "fit",
    "jacked": "fit",
    "skinny": "thin",
    "curvy": "full figured",
    "overweight": "full figured",
}

# Apply to the `body_type` column only (a frame-wide replace touches every column).
cupid_df['body_type'] = (
    cupid_df['body_type']
    .astype(object)                                             # plain values, mapped row by row
    .map(lambda answer: body_type_groups.get(answer, answer))   # unlisted answers pass through
    .astype('category')                                         # store as categorical, used labels only
)

In [179]:
# body type
# check the grouped `body_type` counts
cupid_df['body_type'].value_counts()

fit               24303
average           16596
thin               6151
full figured       5054
rather not say     5040
used up             329
Name: body_type, dtype: int64

In [25]:
# drinking habits

# "socially" or "rarely" -- "sometimes"
# "often", "very often", "desperately" -- "yes"
# "not at all" -- "not at all"

for value in list(cupid_df['drinks'].unique()):
    if ('often' in value) or (value == "desperately"):
        cupid_df.replace(value, "yes", inplace = True)
    elif (value == "socially") or (value == "rarely"):
        cupid_df.replace(value, "sometimes", inplace = True)
    elif (value == "not at all"):
        cupid_df.replace(value, "no", inplace = True)

In [400]:
cupid_df['drinks'].value_counts()

sometimes    45835
no            6023
yes           5615
Name: drinks, dtype: int64

In [26]:
# drug habits

# cleaning "drugs" to maintain consistency with "yes/no/sometimes"
# never -- no

for value in list(cupid_df['drugs'].unique()):
    if value == "never":
        cupid_df.replace(value, "no", inplace = True)
    else:
        continue

In [432]:
# check the grouped `drugs` counts
cupid_df['drugs'].value_counts()

no           49856
sometimes     7228
yes            389
Name: drugs, dtype: int64

In [33]:
# has kids / has a kid -- "has kid(s)"
# wants kids / might want kids -- "wants kid(s)"
# has a kid / has kids, and [might] want(s) more -- "has kid(s) and wants more"
# doesn't have kids, but [might] want(s) them -- "doesn't have kid(s), but wants kid(s)"
# has a kid / has kids, but doesn't want more -- "has kid(s), but doesn't want more"
# every other answer is left as it is


def group_offspring(value):
    """Return the grouped label for one raw `offspring` answer.

    Rules are checked in order; the first match wins. Non-string values and
    answers matching no rule are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    if "doesn't have kids, but" in value:
        return "doesn't have kid(s), but wants kid(s)"
    if value in ("has kids", "has a kid"):
        return "has kid(s)"
    if value in ('wants kids', 'might want kids'):
        return "wants kid(s)"
    if "but doesn't want more" in value:
        return "has kid(s), but doesn't want more"
    if ("might want more" in value) or ("wants more" in value):
        return "has kid(s) and wants more"
    return value


# Apply to the `offspring` column only (a frame-wide replace touches every column).
cupid_df['offspring'] = (
    cupid_df['offspring']
    .astype(object)        # plain values, mapped row by row
    .map(group_offspring)
    .astype('category')    # store as categorical, used labels only
)

In [34]:
# check the grouped `offspring` counts
cupid_df['offspring'].value_counts()

doesn't have kids                          41250
doesn't have kid(s), but wants kid(s)       7303
has kid(s)                                  3627
doesn't want kids                           2686
doesn't have kids, and doesn't want any     1080
has kid(s), but doesn't want more            702
has kid(s) and wants more                    425
wants kid(s)                                 400
Name: offspring, dtype: int64

In [13]:
# write grouped values data to a new df -- BEFORE grouping religion
# this dataset would be used if religion MATTERS

# cupid_df.to_pickle('data/religion_cupid.pkl')

In [14]:
# read in religion data to further group/clean 'religion'
# cupid_religion = pd.read_pickle('data/religion_cupid.pkl')

In [15]:
# cupid_religion.head(3)

Unnamed: 0,age,sex,orientation,body_type,diet,drinks,drugs,offspring,pets,religion,smokes
0,22,m,straight,average,anything,sometimes,no,"doesn't have kid(s), but wants kid(s)",likes dogs and cats,agnosticism and very serious about it,sometimes
1,35,m,straight,average,other,yes,sometimes,"doesn't have kid(s), but wants kid(s)",likes dogs and cats,agnosticism but not too serious about it,no
2,38,m,straight,thin,anything,sometimes,no,doesn't have kids,likes cats,atheism,no


In [28]:
# # clean/group religion

# for value in list(cupid_religion['religion'].unique()):
#     if ('not too serious about it' in value) or ('laughing about it' in value) or ('atheism' in value):
#         cupid_religion.replace(value, "doesn't matter", inplace = True)
#     elif 'agnosticism' in value:
#         cupid_religion.replace(value, "agnosticism", inplace = True)
#     elif 'other' in value:
#         cupid_religion.replace(value, "other", inplace = True)
#     elif 'christianity' in value:
#         cupid_religion.replace(value, "christianity", inplace = True)
#     elif 'catholicism' in value:
#         cupid_religion.replace(value, "catholicism", inplace = True)
#     elif 'judaism' in value:
#         cupid_religion.replace(value, "judaism", inplace = True)
#     elif 'buddhism' in value:
#         cupid_religion.replace(value, "buddhism", inplace = True)
#     elif 'hinduism' in value:
#         cupid_religion.replace(value, "hinduism", inplace = True)
#     elif 'islam' in value:
#         cupid_religion.replace(value, "islam", inplace = True)

In [17]:
# cupid_religion['religion'].value_counts()

doesn't matter    43081
other              3908
agnosticism        3512
christianity       3361
catholicism        1682
judaism             873
buddhism            799
hinduism            176
islam                81
Name: religion, dtype: int64

In [19]:
# overwrite cupid_religion with updated grouped religion
# cupid_religion.to_pickle('data/religion_cupid.pkl')

In [447]:
# # does religion matter?

# # "laughing about it" or "not too serious about it" AS "doesn't matter"
# # group by "somewhat serious about it" or "serious about it" AS "matters"
# # all other values -- group by "matters"

# for value in list(cupid_df['religion'].unique()):
#     if ('not too serious about it' in value) or ('laughing about it' in value) or ('atheism' in value):
#         cupid_df.replace(value, "doesn't matter", inplace = True)
#     else:
#         cupid_df.replace(value, "matters", inplace = True)

In [29]:
# clean/group religion

# Answers containing any of these phrases are grouped as "doesn't matter".
NOT_SERIOUS_MARKERS = ('not too serious about it', 'laughing about it', 'atheism')

# Otherwise the answer collapses to the first of these names it contains
# (checked in this order).
RELIGION_LABELS = (
    'agnosticism', 'other', 'christianity', 'catholicism',
    'judaism', 'buddhism', 'hinduism', 'islam',
)


def group_religion(value):
    """Return the grouped label for one raw `religion` answer.

    Non-string values and answers matching no rule are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    if any(marker in value for marker in NOT_SERIOUS_MARKERS):
        return "doesn't matter"
    for label in RELIGION_LABELS:
        if label in value:
            return label
    return value


# Apply to the `religion` column only (a frame-wide replace touches every column).
cupid_df['religion'] = (
    cupid_df['religion']
    .astype(object)       # plain values, mapped row by row
    .map(group_religion)
    .astype('category')   # store as categorical, used labels only
)

In [30]:
# check the grouped `religion` counts
cupid_df['religion'].value_counts()

doesn't matter    43081
other              3908
agnosticism        3512
christianity       3361
catholicism        1682
judaism             873
buddhism            799
hinduism            176
islam                81
Name: religion, dtype: int64

In [437]:
# inspect the grouped frame (AFTER grouping religion) before it is written out;
# the write itself is the `to_pickle` call in the following cell
# this dataset would be used if religion DOESN'T MATTER

cupid_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57473 entries, 0 to 59945
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   age          57473 non-null  int32   
 1   sex          57473 non-null  category
 2   orientation  57473 non-null  category
 3   body_type    57473 non-null  category
 4   diet         57473 non-null  category
 5   drinks       57473 non-null  category
 6   drugs        57473 non-null  category
 7   offspring    57473 non-null  category
 8   pets         57473 non-null  category
 9   religion     57473 non-null  category
 10  smokes       57473 non-null  category
dtypes: category(10), int32(1)
memory usage: 1.2 MB


In [35]:
# write the grouped frame to a relative path under data/
cupid_df.to_pickle('data/grouped_cupid.pkl')

In [250]:
# read the file back to check the round trip
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
cupid_check = pd.read_pickle('data/grouped_cupid.pkl')

In [192]:
# dtypes and non-null counts of the reloaded frame, for comparison with cupid_df.info()
cupid_check.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57473 entries, 0 to 59945
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   age          57473 non-null  int32   
 1   sex          57473 non-null  category
 2   orientation  57473 non-null  category
 3   body_type    57473 non-null  category
 4   diet         57473 non-null  category
 5   drinks       57473 non-null  category
 6   drugs        57473 non-null  category
 7   offspring    57473 non-null  category
 8   pets         57473 non-null  category
 9   religion     57473 non-null  category
 10  smokes       57473 non-null  category
dtypes: category(10), int32(1)
memory usage: 1.2 MB


---