First, deal with the categorical features: collapse each one into a few broader groups, then encode the groups as numeric columns.

In [1]:
import pandas as pd
# Running list of model-ready column names; later cells add to it as features are built.
features_v1 = list()
# Load the raw dating-app behaviour data that the cells below transform.
df = pd.read_csv('../data/dating_app_behavior.csv')

In [2]:
# Physical-attribute columns are removed so the feature set stays aligned with
# the project's focus on compatibility-based matching.
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError.
df = df.drop(columns=['height_cm', 'weight_kg', 'body_type'], errors='ignore')

In [3]:
def grouping_gender(g):
    """Collapse a raw gender label into one of four coarse groups.

    Returns 'female', 'male' or 'not_cis' for the labels listed below, and
    'gender_not_specified' for every other value (including missing ones).
    """
    buckets = (
        (('Female',), 'female'),
        (('Male',), 'male'),
        (('Transgender', 'Non-binary', 'Genderfluid'), 'not_cis'),
    )
    for members, label in buckets:
        if g in members:
            return label
    # Anything not matched above is treated as "not specified".
    return 'gender_not_specified'

# Store the coarse gender group of every row alongside the raw label.
df['gender_grouped'] = df['gender'].map(grouping_gender)

In [4]:
from sklearn.preprocessing import OneHotEncoder

# Shared encoder for this notebook: integer indicators, returned as a dense array.
ohe = OneHotEncoder(sparse_output=False, dtype=int)
gender_encoded = ohe.fit_transform(df[['gender_grouped']])

# Name each indicator column after its category by removing the
# 'gender_grouped_' part from the generated feature names.
column_names = [
    feature.replace('gender_grouped_', '')
    for feature in ohe.get_feature_names_out(['gender_grouped'])
]

# Attach the indicator columns to the main frame and register them as features.
genders_encoded_df = pd.DataFrame(data=gender_encoded, columns=column_names)
df = pd.concat(objs=[df, genders_encoded_df], axis=1)
features_v1 = [*features_v1, *column_names]


In [5]:
def grouping_sexual_orientation(g):
    """Collapse a raw sexual-orientation label into a coarse group.

    'Straight' -> 'heterosexual', 'Gay'/'Lesbian' -> 'homosexual',
    'Queer' -> 'queer', 'Asexual' -> 'asexual'. Every other value
    (including missing ones) falls into 'bi/pan/demi_sexual'.
    """
    groups = (
        (('Straight',), 'heterosexual'),
        (('Gay', 'Lesbian'), 'homosexual'),
        (('Queer',), 'queer'),
        (('Asexual',), 'asexual'),
    )
    matched = [label for members, label in groups if g in members]
    # Catch-all bucket for every label not listed above.
    return matched[0] if matched else 'bi/pan/demi_sexual'

# Store the coarse sexual-orientation group of every row alongside the raw label.
df['sexual_orientation_grouped'] = df['sexual_orientation'].map(grouping_sexual_orientation)

In [6]:
# One-hot encode the grouped sexual-orientation labels (this refits the shared encoder).
# The result gets its own name so it no longer overwrites `gender_encoded`,
# which holds the gender indicators produced earlier.
sexual_orientation_encoded = ohe.fit_transform(df[['sexual_orientation_grouped']])

# Name each indicator column after its category by removing the
# 'sexual_orientation_grouped_' part from the generated feature names.
column_names = ohe.get_feature_names_out(['sexual_orientation_grouped'])
column_names = [name.replace('sexual_orientation_grouped_', '') for name in column_names]

# Attach the indicator columns to the main frame and register them as features.
sexual_orientation_encoded_df = pd.DataFrame(sexual_orientation_encoded, columns=column_names)
df = pd.concat([df, sexual_orientation_encoded_df], axis=1)
features_v1 = features_v1 + column_names

In [7]:
def grouping_income(g):
    """Collapse a raw income bracket into 'low', 'middle' or 'high'.

    Note: 'high' is the fall-through result, so any value that is not one of
    the low or middle brackets below (including a missing value) is
    reported as 'high'.
    """
    low_brackets = ('Low', 'Very Low')
    middle_brackets = ('Lower-Middle', 'Middle', 'Upper-Middle')
    if g in low_brackets:
        return 'low'
    return 'middle' if g in middle_brackets else 'high'

# Store the coarse income group of every row alongside the raw bracket.
df['income_bracket_grouped'] = df['income_bracket'].map(grouping_income)

In [8]:
# Income groups are ordered, so they are encoded as ranks (low=0, middle=1, high=2)
# in a single column rather than one-hot encoded.
income_mapping = {
    label: rank
    for rank, label in enumerate(('low', 'middle', 'high'))
}
df['income'] = df['income_bracket_grouped'].map(income_mapping)
features_v1.extend(['income'])

In [9]:
def grouping_relationship_intent(g):
    """Collapse a raw relationship-intent label into a coarse group.

    'Hookups' and 'Casual Dating' -> 'casual_dating';
    'Serious Relationship' -> 'serious_dating';
    every other value (including missing ones) -> 'non_romantic_relationship'.
    """
    if g == 'Serious Relationship':
        return 'serious_dating'
    is_casual = g in ('Hookups', 'Casual Dating')
    return 'casual_dating' if is_casual else 'non_romantic_relationship'

# Store the coarse relationship-intent group of every row alongside the raw label.
df['relationship_intent_grouped'] = df['relationship_intent'].map(grouping_relationship_intent)

In [10]:
# Refit the shared encoder on the grouped relationship-intent labels.
intent_encoded = ohe.fit_transform(df[['relationship_intent_grouped']])

# Name each indicator column after its category by removing the
# 'relationship_intent_grouped_' part from the generated feature names.
column_names = [
    feature.replace('relationship_intent_grouped_', '')
    for feature in ohe.get_feature_names_out(['relationship_intent_grouped'])
]

# Attach the indicator columns to the main frame and register them as features.
intent_encoded_df = pd.DataFrame(data=intent_encoded, columns=column_names)
df = pd.concat(objs=[df, intent_encoded_df], axis=1)

features_v1 = [*features_v1, *column_names]

In [11]:
def grouping_education_level(g):
    """Collapse a raw education level into a coarse group.

    'No Formal Education', 'High School' and 'Diploma' -> 'basic_education';
    'Bachelor’s' and 'Associate’s' -> 'undergrad';
    every other value (including missing ones) -> 'graduate'.

    The degree labels are matched with either a typographic (’) or a straight
    (') apostrophe.
    """
    # A straight apostrophe ("Bachelor's") would otherwise miss the typographic
    # labels below and be mis-filed as 'graduate'.
    level = g.replace("'", "’") if isinstance(g, str) else g
    if level in ['No Formal Education', 'High School', 'Diploma']:
        return 'basic_education'
    elif level in ['Bachelor’s', 'Associate’s']:
        return 'undergrad'
    else:
        return 'graduate'

# Store the coarse education group of every row alongside the raw level.
df['education_level_grouped'] = df['education_level'].map(grouping_education_level)

In [12]:
# Refit the shared encoder on the grouped education labels.
education_encoded = ohe.fit_transform(df[['education_level_grouped']])

# Name each indicator column after its category by removing the
# 'education_level_grouped_' part from the generated feature names.
column_names = [
    feature.replace('education_level_grouped_', '')
    for feature in ohe.get_feature_names_out(['education_level_grouped'])
]

# Attach the indicator columns to the main frame and register them as features.
education_encoded_df = pd.DataFrame(data=education_encoded, columns=column_names)
df = pd.concat(objs=[df, education_encoded_df], axis=1)
features_v1 = [*features_v1, *column_names]

In [13]:
def grouping_location_type(g):
    """Collapse a raw location type into a coarse group.

    'Urban'/'Metro' -> 'urban_location';
    'Suburban'/'Small Town' -> 'semi_urban_location';
    every other value (including missing ones) -> 'rural_location'.
    """
    buckets = (
        (('Urban', 'Metro'), 'urban_location'),
        (('Suburban', 'Small Town'), 'semi_urban_location'),
    )
    for members, label in buckets:
        if g in members:
            return label
    # Fall-through bucket for everything not listed above.
    return 'rural_location'

# Store the coarse location group of every row alongside the raw type.
df['location_type_grouped'] = df['location_type'].map(grouping_location_type)

In [14]:
# Refit the shared encoder on the grouped location labels.
location_type_encoded = ohe.fit_transform(df[['location_type_grouped']])

# Name each indicator column after its category by removing the
# 'location_type_grouped_' part from the generated feature names.
column_names = [
    feature.replace('location_type_grouped_', '')
    for feature in ohe.get_feature_names_out(['location_type_grouped'])
]

# Attach the indicator columns to the main frame and register them as features.
location_type_encoded_df = pd.DataFrame(data=location_type_encoded, columns=column_names)
df = pd.concat(objs=[df, location_type_encoded_df], axis=1)
features_v1 = [*features_v1, *column_names]

In [15]:
def grouping_zodiac_signs(g):
    """Map a zodiac sign to its element group.

    Air, fire and water signs are listed explicitly; every other value
    (including missing ones) is returned as 'earth_zodiac_sign'.
    """
    signs_by_element = {
        'air_zodiac_sign': ('Aquarius', 'Gemini', 'Libra'),
        'fire_zodiac_sign': ('Aries', 'Leo', 'Sagittarius'),
        'water_zodiac_sign': ('Scorpio', 'Pisces', 'Cancer'),
    }
    for element, signs in signs_by_element.items():
        if g in signs:
            return element
    # Fall-through bucket for everything not listed above.
    return 'earth_zodiac_sign'

# Store the element group of every row alongside the raw zodiac sign.
df['zodiac_sign_grouped'] = df['zodiac_sign'].map(grouping_zodiac_signs)

In [16]:
# Refit the shared encoder on the grouped zodiac labels.
zodiac_sign_encoded = ohe.fit_transform(df[['zodiac_sign_grouped']])

# Name each indicator column after its category by removing the
# 'zodiac_sign_grouped_' part from the generated feature names.
column_names = [
    feature.replace('zodiac_sign_grouped_', '')
    for feature in ohe.get_feature_names_out(['zodiac_sign_grouped'])
]

# Attach the indicator columns to the main frame and register them as features.
zodiac_sign_encoded_df = pd.DataFrame(data=zodiac_sign_encoded, columns=column_names)
df = pd.concat(objs=[df, zodiac_sign_encoded_df], axis=1)
features_v1 = [*features_v1, *column_names]

In [17]:
# Numeric columns that join the feature list without any re-encoding.
numerical = [
    'age',
    'bio_length',
    'likes_received',
    'emoji_usage_rate',
    'message_sent_count',
]
features_v1 = [*features_v1, *numerical]
# Show the assembled feature columns as the cell output.
df[features_v1]

Unnamed: 0,female,gender_not_specified,male,not_cis,asexual,bi/pan/demi_sexual,heterosexual,homosexual,queer,income,...,urban_location,air_zodiac_sign,earth_zodiac_sign,fire_zodiac_sign,water_zodiac_sign,age,bio_length,likes_received,emoji_usage_rate,message_sent_count
0,0,1,0,0,0,0,0,1,0,2,...,1,0,1,0,0,56,44,173,0.36,75
1,0,0,1,0,0,1,0,0,0,1,...,0,0,0,1,0,40,301,107,0.42,35
2,0,0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,30,309,91,0.41,33
3,0,0,0,1,0,0,0,1,0,0,...,1,0,1,0,0,57,35,147,0.07,5
4,0,0,1,0,0,1,0,0,0,1,...,1,1,0,0,0,24,343,94,0.11,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0,0,0,1,0,0,0,1,0,2,...,1,0,0,0,1,45,235,99,0.19,9
49996,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,26,278,173,0.23,69
49997,0,0,1,0,0,1,0,0,0,2,...,0,0,0,0,1,45,128,123,0.36,94
49998,0,0,0,1,0,0,0,0,1,0,...,1,1,0,0,0,29,206,149,0.36,53


In [18]:
# Copy the selected feature columns into their own frame and write them to
# disk without the row index.
feature_matrix_path = '../data/X_df.csv'
X_df = pd.DataFrame(data=df[features_v1], columns=features_v1)
X_df.to_csv(feature_matrix_path, index=False)

In [19]:
# Split each comma-separated interest string into a list of tags, then collect
# the distinct tags so they can be grouped into broader categories.
df['interest_tags_clean'] = df['interest_tags'].str.split(r',\s*')

# A missing interest_tags value comes back from .str.split as NaN, which is not
# iterable, so those rows are skipped; empty tags (e.g. from a trailing comma)
# are ignored as well.
unique_interests = {
    tag.strip()
    for tag_list in df['interest_tags_clean'].dropna()
    for tag in tag_list
    if tag.strip()
}
print(sorted(unique_interests))


['Anime', 'Art', 'Astrology', 'Binge-Watching', 'Board Games', 'Cars', 'Clubbing', 'Coding', 'Cooking', 'Crafting', 'DIY', 'Dancing', 'Fashion', 'Fitness', 'Foodie', 'Gaming', 'Gardening', 'Hiking', 'History', 'Investing', 'K-pop', 'Languages', 'MMA', 'Makeup', 'Meditation', 'Memes', 'Motorcycling', 'Movies', 'Music', 'Painting', 'Parenting', 'Pets', 'Photography', 'Podcasts', 'Poetry', 'Politics', 'Reading', 'Running', 'Skating', 'Sneaker Culture', 'Social Activism', 'Spirituality', 'Stand-up Comedy', 'Startups', 'Tattoos', 'Tech', 'Traveling', 'Writing', 'Yoga']


In [20]:
# Interest tags listed per broad category; inverted below into the
# tag -> category lookup.
_interest_groups = {
    'Health': [
        'Fitness', 'Yoga', 'Meditation', 'Running',
        'Hiking', 'MMA', 'Skating', 'Dancing',
    ],
    'Creativity': [
        'Painting', 'Writing', 'Poetry', 'Art', 'Crafting',
        'DIY', 'Photography', 'Fashion', 'Makeup', 'Tattoos',
    ],
    'Entertainment': [
        'Movies', 'Music', 'Podcasts', 'Stand-up Comedy', 'Memes',
        'Binge-Watching', 'Anime', 'K-pop', 'Gaming', 'Board Games',
    ],
    'Education_Culture': [
        'Reading', 'History', 'Languages', 'Politics',
        'Coding', 'Tech', 'Startups', 'Investing',
    ],
    'Lifestyle': [
        'Foodie', 'Traveling', 'Pets', 'Parenting', 'Sneaker Culture',
        'Gardening', 'Cooking', 'Cars', 'Motorcycling',
    ],
    'Social': [
        'Spirituality', 'Astrology', 'Social Activism', 'Clubbing',
    ],
}

# Flatten to one entry per tag, keeping the category-by-category order above.
interest_map = {
    tag: category
    for category, tags in _interest_groups.items()
    for tag in tags
}