In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# read in data from pickle file

cupid_df = pd.read_pickle('data/subset_cupid.pkl')

In [53]:
cupid_df.head()

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,location,offspring,pets,religion,sign,smokes
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,gemini,sometimes
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,cancer,no
2,38,available,m,straight,thin,anything,socially,,graduated from masters program,"san francisco, california",,has cats,,pisces but it doesn&rsquo;t matter,no
3,23,single,m,straight,thin,vegetarian,socially,,working on college/university,"berkeley, california",doesn't want kids,likes cats,,pisces,no
4,29,single,m,straight,athletic,,socially,never,graduated from college/university,"san francisco, california",,likes dogs and likes cats,,aquarius,no


#### Further filtering data
- Dropping 'education' and 'sign' features, as I personally felt that the values were odd ('education') or not indicative of recommending a partner ('sign')
- Filtering 'status' to only 'single' or 'available', since we're recommending lovers and 'married' or 'seeing someone' already have a person of interest, assuming relationships aren't polyamorous

In [4]:
# drop education + sign -- useless / not indicative, in my opinion

cupid_df.drop(columns = ['education', 'sign'], inplace = True)

In [55]:
cupid_df.shape

(59946, 13)

In [56]:
cupid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   age          59946 non-null  category
 1   status       59946 non-null  category
 2   sex          59946 non-null  category
 3   orientation  59946 non-null  category
 4   body_type    54650 non-null  category
 5   diet         35551 non-null  category
 6   drinks       56961 non-null  category
 7   drugs        45866 non-null  category
 8   location     59946 non-null  category
 9   offspring    24385 non-null  category
 10  pets         40025 non-null  category
 11  religion     39720 non-null  category
 12  smokes       54434 non-null  category
dtypes: category(13)
memory usage: 836.6 KB


In [41]:
cupid_df['status'].value_counts()

single            55697
seeing someone     2064
available          1865
married             310
unknown              10
Name: status, dtype: int64

In [5]:
# filter for only those who are 'single' or 'available'

cupid = cupid_df[(cupid_df['status'] == 'single') | (cupid_df['status'] == 'available')]

In [59]:
# dropped ~2k rows

cupid.shape

(57562, 13)

#### Null Values

In [58]:
# check for nulls

cupid.isna().sum()

age                0
status             0
sex                0
orientation        0
body_type       4867
diet           23136
drinks          2918
drugs          13508
location           0
offspring      33881
pets           19384
religion       19656
smokes          5361
dtype: int64

In [80]:
# group 'has kids' and 'has a kid' -- has kid(s)
# group 'has kids, but doesn't want more' and 'has a kid, but doesn't want more' -- has kid(s) 

cupid['religion'].value_counts()

atheism                                       21706
agnosticism                                    2627
other                                          2614
agnosticism but not too serious about it       2488
agnosticism and laughing about it              2339
catholicism but not too serious about it       2261
other and laughing about it                    1983
atheism and laughing about it                  1938
christianity                                   1908
christianity but not too serious about it      1895
other but not too serious about it             1488
judaism but not too serious about it           1465
atheism but not too serious about it           1238
catholicism                                    1046
christianity and somewhat serious about it      894
other and somewhat serious about it             795
atheism and somewhat serious about it           760
catholicism and laughing about it               693
judaism and laughing about it                   642
buddhism but

In [6]:
# impute missing values

cupid['body_type'].fillna('rather not say', inplace = True)
cupid['diet'].fillna('anything', inplace = True)
cupid['drinks'].fillna('not at all', inplace = True)
cupid['drugs'].fillna('never', inplace = True)
cupid['offspring'].fillna("doesn't have kids", inplace = True)
cupid['pets'].fillna('dislikes dogs and dislikes cats', inplace = True)
cupid['religion'].fillna('atheism', inplace = True)
cupid['smokes'].fillna('no', inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [7]:
# ensure null values are handled

cupid.isna().sum()

age            0
status         0
sex            0
orientation    0
body_type      0
diet           0
drinks         0
drugs          0
location       0
offspring      0
pets           0
religion       0
smokes         0
dtype: int64

In [15]:
# filter for age
# entries where age is 109 + 110

cupid['age'].value_counts()

26     3537
27     3518
28     3396
25     3393
29     3147
24     3106
30     3012
31     2619
23     2463
32     2446
33     2115
22     1848
34     1826
35     1683
36     1530
37     1381
38     1289
21     1217
39     1139
42     1049
40     1002
41      954
20      915
43      828
44      683
45      631
19      593
46      568
47      514
48      471
49      450
50      421
51      342
52      339
18      298
56      268
54      261
55      260
57      253
53      246
59      218
58      192
60      189
61      172
62      166
63      135
64      112
65      107
66      103
67       66
68       58
69       31
109       1
110       1
Name: age, dtype: int64

In [29]:
# change to data type int
cupid['age'] = cupid['age'].astype('int32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [32]:
# filter ages 109 + 110
cupid = cupid[cupid['age'] < 109]

In [11]:
# # create an "ID" column that uniquely identifies each user/response
# cupid['id'] = np.arange(cupid.shape[0])

# # set id as index
# cupid.set_index('id')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,age,status,sex,orientation,body_type,diet,drinks,drugs,location,offspring,pets,religion,smokes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,22,single,m,straight,a little extra,strictly anything,socially,never,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,sometimes
1,35,single,m,straight,average,mostly other,often,sometimes,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,no
2,38,available,m,straight,thin,anything,socially,never,"san francisco, california",doesn't have kids,has cats,atheism,no
3,23,single,m,straight,thin,vegetarian,socially,never,"berkeley, california",doesn't want kids,likes cats,atheism,no
4,29,single,m,straight,athletic,anything,socially,never,"san francisco, california",doesn't have kids,likes dogs and likes cats,atheism,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57557,59,single,f,straight,rather not say,anything,socially,never,"oakland, california",has kids,has dogs,catholicism but not too serious about it,no
57558,24,single,m,straight,fit,mostly anything,often,sometimes,"san francisco, california",doesn't have kids,likes dogs and likes cats,agnosticism,no
57559,42,single,m,straight,average,mostly anything,not at all,never,"south san francisco, california",doesn't have kids,dislikes dogs and dislikes cats,christianity but not too serious about it,no
57560,27,single,m,straight,athletic,mostly anything,socially,often,"san francisco, california","doesn't have kids, but wants them",likes dogs and likes cats,agnosticism but not too serious about it,trying to quit


In [72]:
# filter for location only in california
cupid = cupid[cupid['location'].str.contains('california')]

In [80]:
cupid['location'].value_counts()

san francisco, california              29918
oakland, california                     6886
berkeley, california                    3979
san mateo, california                   1291
palo alto, california                   1013
alameda, california                      868
san rafael, california                   733
hayward, california                      710
emeryville, california                   706
daly city, california                    663
redwood city, california                 654
san leandro, california                  620
walnut creek, california                 618
vallejo, california                      537
menlo park, california                   452
south san francisco, california          405
richmond, california                     399
mountain view, california                365
novato, california                       361
burlingame, california                   348
pleasant hill, california                333
castro valley, california                333
stanford, 

In [82]:
cupid.shape

(57473, 13)

In [90]:
cupid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57473 entries, 0 to 59945
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   age          57473 non-null  int32   
 1   status       57473 non-null  category
 2   sex          57473 non-null  category
 3   orientation  57473 non-null  category
 4   body_type    57473 non-null  category
 5   diet         57473 non-null  category
 6   drinks       57473 non-null  category
 7   drugs        57473 non-null  category
 8   location     57473 non-null  category
 9   offspring    57473 non-null  category
 10  pets         57473 non-null  category
 11  religion     57473 non-null  category
 12  smokes       57473 non-null  category
dtypes: category(12), int32(1)
memory usage: 1.4 MB


In [91]:
cupid.to_pickle('data/clean_cupid.pkl')

In [92]:
cupid_check = pd.read_pickle('data/clean_cupid.pkl')

In [93]:
cupid_check.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57473 entries, 0 to 59945
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   age          57473 non-null  int32   
 1   status       57473 non-null  category
 2   sex          57473 non-null  category
 3   orientation  57473 non-null  category
 4   body_type    57473 non-null  category
 5   diet         57473 non-null  category
 6   drinks       57473 non-null  category
 7   drugs        57473 non-null  category
 8   location     57473 non-null  category
 9   offspring    57473 non-null  category
 10  pets         57473 non-null  category
 11  religion     57473 non-null  category
 12  smokes       57473 non-null  category
dtypes: category(12), int32(1)
memory usage: 1.4 MB


----

#### Modeling without Grouping

In [45]:
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score

#### DBSCAN

In [None]:
# # instantiate pipe
# pipe_dbscan = make_pipeline(OneHoeEncoder(use_cat_names = True), StandardScaler(), DBSCAN(n_jobs = -1))

# # set params
# params = {'DBSCAN__eps' : [.75, 1, 1.25, 1.5, 1.75],
#           'DBSCAN__min_samples' : [3, 4, 5, 6]}

# # GridSearchCV
# gs_dbscan = GridSearchCV(pipe_dbscan, params, n_jobs = -1)

In [49]:
# ohe
ohe = OneHotEncoder(use_cat_names = True)
X_encoded = ohe.fit_transform(cupid)

# scale
ss = StandardScaler()
X_scaled = ss.fit_transform(X_encoded)

  elif pd.api.types.is_categorical(cols):


In [50]:
# referenced https://git.generalassemb.ly/DSI-322/8.02-lesson-DBSCAN/blob/master/dbscan-clustering-post-class.ipynb

# dbscan
dbscan = DBSCAN(n_jobs = -1, eps = 1.3, min_samples = 3)
dbscan.fit(X_scaled)

-0.31390046996452375

In [None]:
# check to see how many are of no cluster (-1) and fix eps
labels = dbscan.labels_
labels

In [None]:
# score
silhouette_score(X_scaled, labels)

In [51]:
# "gridsearch" for best DBSCAN params

def find_best_silhouette(df): 
    """select best eps and min_samples for a DBSCAN
    
    Args:
        df (pandas DataFrame): data to cluster

    Returns:
        None
    """
    
    max_score = -1
    
    # ohe
    ohe = OneHotEncoder(use_cat_names=True)
    X_encoded = ohe.fit_transform(df)

    # scale
    ss = StandardScaler()
    df_scaled = ss.fit_transform(X_encoded)
    
    for eps in np.linspace(.2, 5, 50):
        for minsamples in range(2, round(len(df) / 2)):
            dbscan = DBSCAN(eps = eps, min_samples = minsamples, n_jobs = -1)
            dbscan.fit(df_scaled)
            if len(set(dbscan.labels_)) > 1:
                score = silhouette_score(df_scaled, dbscan.labels_)
                if -1 in set(dbscan.labels_):
                    nclusters = len(set(dbscan.labels_)) - 1
                else: 
                    nclusters = len(set(dbscan.labels_))
                if score > max_score:
                    max_score = score
                    best_eps = eps
                    best_minsamples = minsamples
                    best_clusters = nclusters
                    
    print(f'Best silhouette score was {round(max_score, 2)}')
    print(f'Best eps was {round(best_eps, 2)}')
    print(f'Best min_samples was {best_minsamples}.')
    print(f'Model found {best_clusters} clusters.')
    
    return

In [52]:
find_best_silhouette(cupid)

  elif pd.api.types.is_categorical(cols):


KeyboardInterrupt: 

#### PCA

In [None]:
# scale, then poly or vice versa?

#poly
X_poly = PolynomialFeatures().fit_transform(X_scaled)

In [40]:
# instantiate
pca = PCA(random_state = 123).fit_transform(X_poly)

In [None]:
pca.explained_variance_

#### KMeans

In [None]:
# data has been scaled

# instantiate
kmeans = KMeans(n_clusters = 6, random_state = 123).fit(X_scaled)

In [None]:
# kmeans.labels_.sum() -- only works if there are 2 clusters - 0/1

# does this work?
labels = kmeans.labels_.value_counts()

In [None]:
# score 
silhouette_score(X_scaled, labels)

In [None]:
# find best k for clusters

silhouette = []
for k in range(2, 15):
    kmeans = KMeans(n_clusters=k, random_state=10)
    kmeans.fit(X_scaled)
    silhouette.append(silhouette_score(X_scaled, kmeans.labels_)

# plot
plt.plot(range(2, 15), silhouette, marker='o')
plt.title('Silhouette Score Elbow Plot');

#### Recommender System

In [38]:
from scipy import sparse
import sys
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

In [53]:
cupid.head(3)

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,location,offspring,pets,religion,smokes,id
0,22,single,m,straight,a little extra,strictly anything,socially,never,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,sometimes,0
1,35,single,m,straight,average,mostly other,often,sometimes,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,no,1
2,38,available,m,straight,thin,anything,socially,never,"san francisco, california",doesn't have kids,has cats,atheism,no,2


In [None]:
# subset data
if cupid['sex'] == 'm' and cupid['orientation'] == 'straight':
    cupid_female = cupid[cupid['sex'] == 'f']
    # recommender
    # ohe
    # create sparse matrix
#     cupid_sparse = sparse.csr_matrix(cupid)

#     # cosine_similarity
#     similarities = cosine_similarity(cupid_sparse)
    
elif 

In [None]:
#pivot table
#piv_df = pd.pivot_table(cupid, index = 'title', columns = 'userId', values = 'rating')

In [37]:
# create sparse matrix
cupid_sparse = sparse.csr_matrix(cupid)

# cosine_similarity
similarities = cosine_similarity(cupid_sparse)

TypeError: no supported conversion for types: (dtype('O'),)