In [23]:
import pandas as pd
import plotly.express as px
import numpy as np

In [3]:
# Read and save dataset into 
cupid_df = pd.read_csv("../data/okcupid_profiles.csv")

In [4]:
# Sampling the rows
cupid_df.sample(3)

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9
46339,35,available,m,bisexual,average,,rarely,never,graduated from college/university,,...,i gave up writing years ago .. it scares the s...,i lost something a number of years ago .. didn...,i'm not sure i can depend on me anymore .. i r...,are usually wrong.,(e) all of the above.,have already been taken from me which makes t...,the visage outlined on this page is not what i...,temporally proximal to both thursday and saturday,"this is a test .. chances are, given the empi...",you have read my entire profile .. and you are...
15654,39,single,f,straight,a little extra,strictly vegetarian,socially,never,graduated from college/university,hispanic / latin,...,90% heroine 3% damsel in distress 7% devil in ...,"i am artistic, creative, easy-going, humorous,...",spending time on things that make my my life b...,"women always compliment my shoes, clothes & ha...","she reads! margret atwood, chuck palahniuk, wi...",1. dogs 2. tabasco sauce 3. lipstick 4. intern...,making more money while doing less work how i ...,i like gallery openings and dinner with a tabl...,"no money, man, can win my love. it's sweetness...",* you have never found yourself naked in publi...
40489,29,single,m,straight,fit,anything,socially,sometimes,graduated from college/university,asian,...,"i was born and raised in hawaii, and moved to ...","currently, i am working with a local accountin...",finishing everyone's leftovers at the dinner t...,how friendly i am.,"books: anything by malcolm gladwell, random bu...",1. family and friends 2. good food 3. running ...,the future.,hanging out with friends over a good meal and ...,,


In [5]:
# Check dataset data types
cupid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   status       59946 non-null  object 
 2   sex          59946 non-null  object 
 3   orientation  59946 non-null  object 
 4   body_type    54650 non-null  object 
 5   diet         35551 non-null  object 
 6   drinks       56961 non-null  object 
 7   drugs        45866 non-null  object 
 8   education    53318 non-null  object 
 9   ethnicity    54266 non-null  object 
 10  height       59943 non-null  float64
 11  income       59946 non-null  int64  
 12  job          51748 non-null  object 
 13  last_online  59946 non-null  object 
 14  location     59946 non-null  object 
 15  offspring    24385 non-null  object 
 16  pets         40025 non-null  object 
 17  religion     39720 non-null  object 
 18  sign         48890 non-null  object 
 19  smok

In [107]:
# Duplicated rows
cupid_df.duplicated().sum()

0

In [109]:
# Null cells
cupid_df.isnull().sum()

age                0
status             0
sex                0
orientation        0
body_type       5296
diet           24395
drinks          2985
drugs          14080
education       6628
ethnicity       5680
height             3
income             0
job             8198
last_online        0
location           0
offspring      35561
pets           19921
religion       20226
sign           11056
smokes          5512
speaks            50
essay0          5488
essay1          7572
essay2          9638
essay3         11476
essay4         10537
essay5         10850
essay6         13771
essay7         12451
essay8         19225
essay9         12603
dtype: int64

In [6]:
# 
cupid_df.describe()

Unnamed: 0,age,height,income
count,59946.0,59943.0,59946.0
mean,32.34029,68.295281,20033.222534
std,9.452779,3.994803,97346.192104
min,18.0,1.0,-1.0
25%,26.0,66.0,-1.0
50%,30.0,68.0,-1.0
75%,37.0,71.0,-1.0
max,110.0,95.0,1000000.0


In [7]:
# Count the number of  unique values
cupid_df.nunique()

age               54
status             5
sex                2
orientation        3
body_type         12
diet              18
drinks             6
drugs              3
education         32
ethnicity        217
height            60
income            13
job               21
last_online    30123
location         199
offspring         15
pets              15
religion          45
sign              48
smokes             5
speaks          7647
essay0         54347
essay1         51516
essay2         48625
essay3         43520
essay4         49257
essay5         48961
essay6         43583
essay7         45548
essay8         39323
essay9         45440
dtype: int64

In [71]:
# Seeing/counting all the labels
for c in cupid_df.columns:    
    print (f"----< {c} >----")
    print(cupid_df[c].value_counts())

----< age >----
26     3724
27     3685
28     3583
25     3531
29     3295
24     3242
30     3149
31     2735
23     2592
32     2587
33     2206
22     1934
34     1902
35     1755
36     1583
37     1427
38     1330
21     1282
39     1172
42     1072
40     1030
41      980
20      953
43      858
44      708
45      643
19      611
46      578
47      529
48      481
49      459
50      437
51      350
52      344
18      309
56      271
54      267
55      265
57      256
53      252
59      221
58      197
60      195
61      176
62      167
63      138
64      113
65      109
66      105
67       66
68       59
69       31
110       1
109       1
Name: age, dtype: int64
----< status >----
single            55697
seeing someone     2064
available          1865
married             310
unknown              10
Name: status, dtype: int64
----< sex >----
m    35829
f    24117
Name: sex, dtype: int64
----< orientation >----
straight    51606
gay          5573
bisexual     2767
Name: 

In [52]:
# Accounting for number of unique types of each categorical columns
# This unq_val_count Series will be used for String look up through <essayX> columns
unq_val_count={c:pd.DataFrame(cupid_df[c].value_counts()) for c in cupid_df.columns}
unq_val_count = pd.Series(unq_val_count)
unq_val_count

age                  age
26   3724
27   3685
28   3583
25   3...
status                         status
single           55697
...
sex                                        sex
m  35829
f  24117
orientation              orientation
straight        51606
ga...
body_type                      body_type
average             ...
diet                                 diet
mostly anything    ...
drinks                      drinks
socially      41780
rarely...
drugs                     drugs
never      37724
sometimes   ...
education                                         education
g...
ethnicity                                                    ...
height               height
70.0    6074
68.0    5449
67.0   ...
income                   income
-1         48442
 20000      ...
job                                                job
other ...
last_online                      last_online
2012-06-29-22-56...
location                                       location
san f...
offspring                

In [64]:
# age               54, sex                2, drinks             6, drugs              3, education         32, ethnicity        217

orientation_df = cupid_df['orientation'].value_counts()
fig=px.treemap(orientation_df,
               path=[orientation_df.index],
               values=orientation_df.values,
               title="Orientation distribution",
               color=orientation_df.index,
               color_discrete_sequence=px.colors.sequential.PuBuGn,
               template='plotly_dark',
               width=1000, height=500
              )
percents = np.round((100*orientation_df.values / sum(orientation_df.values)).tolist(),2)
fig.data[0].customdata = [percents[0],percents[1],percents[2]]
fig.data[0].texttemplate = '%{label}<br>%{value}<br>%{customdata}%'
fig.update_layout(font=dict(size=19,family="Franklin Gothic"))

fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [62]:
# age               54, sex                2, drinks             6, drugs              3, education         32, ethnicity        217

gender_df = cupid_df['sex'].value_counts()
fig=px.treemap(gender_df,
               path=[gender_df.index],
               values=gender_df.values,
               title="Gender distribution",
               color=gender_df.index,
               color_discrete_sequence=px.colors.sequential.PuBuGn,
               template='plotly_dark',
               width=1000, height=500
              )

percents = np.round((100*gender_df.values / sum(gender_df.values)).tolist(),2)
fig.data[0].customdata = [percents[0],percents[1]]
fig.data[0].texttemplate = '%{label}<br>%{value}<br>%{customdata}%'
fig.update_layout(font=dict(size=19,family="Franklin Gothic"))
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [68]:
age_df=cupid_df['age'].value_counts()
fig=px.treemap(age_df,
               path=[age_df.index],
               values=age_df.values,
               title="Age distribution",
               color=age_df.index,
               color_discrete_sequence=px.colors.sequential.PuBuGn,
               template='plotly_dark',
               width=1000, height=500
              )

percents = np.round((100*age_df.values / sum(age_df.values)).tolist(),2)
fig.data[0].customdata = percents
fig.data[0].texttemplate = '%{label}<br>%{value}<br>%{customdata}%'
fig.update_layout(font=dict(size=19,family="Franklin Gothic"))
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [81]:
# Revised education column, using custom created csv 
# through self labeling using the most conservative judgement
edu_cat_df = pd.read_csv("../data/education_category_revised.csv")
edu_cat_df

Unnamed: 0,original,revised
0,graduated from college/university,Associate's Degree
1,graduated from masters program,Master's Degree
2,working on college/university,Some College
3,working on masters program,Bachelor's Degree
4,graduated from two-year college,Associate's Degree
5,graduated from high school,High school
6,graduated from ph.d program,Doctorate Degree
7,graduated from law school,Law degree
8,working on two-year college,Some College
9,dropped out of college/university,Some College


In [111]:
speak_df = cupid_df['speaks'].value_counts()
for items in speak_df.items():
    print(items)

('english', 21828)
('english (fluently)', 6628)
('english (fluently), spanish (poorly)', 2059)
('english (fluently), spanish (okay)', 1917)
('english (fluently), spanish (fluently)', 1288)
('english, spanish', 859)
('english (fluently), french (poorly)', 756)
('english, spanish (okay)', 655)
('english, spanish (poorly)', 609)
('english (fluently), chinese (fluently)', 535)
('english (fluently), french (okay)', 532)
('english (fluently), chinese (okay)', 430)
('english (poorly)', 310)
('english, chinese', 306)
('english (okay)', 306)
('english (fluently), german (poorly)', 263)
('english, french (poorly)', 245)
('english (fluently), french (fluently)', 215)
('english, french (okay)', 210)
('english, french', 209)
('english, spanish (fluently)', 198)
('english (fluently), japanese (poorly)', 184)
('english (fluently), chinese (poorly)', 175)
('english (fluently), german (okay)', 160)
('english (fluently), russian (fluently)', 147)
('english (fluently), spanish (okay), french (poorly)', 1

In [9]:
# cupid_df.rename(columns = {
#                         "essay0":"My self summary", 
#                         "essay1":"What I’m doing with my life",
#                         "essay2":"I’m really good at",
#                         "essay3":"The first thing people usually notice about me",
#                         "essay4":"Favorite books, movies, show, music, and food",
#                         "essay5":"The six things I could never do without",
#                         "essay6":"I spend a lot of time thinking about",
#                         "essay7":"On a typical Friday night I am",
#                         "essay8":"The most private thing I am willing to admit",
#                         "essay9":"You should message me if"})

In [10]:
#Combining essays columns
essays_df = cupid_df[['essay0','essay1','essay2','essay3','essay4','essay5','essay6','essay7','essay8','essay9']]

In [11]:
#     print (f"YOU'RE AT INDEX {index}")
#     for x in row:
#         print(f"--------{x}")