First, we deal with the categorical features.

In [244]:
import pandas as pd
# Running list of the engineered column names that make up the v1 feature set.
features_v1 = list()
# Raw dating-app behaviour data, read from a path relative to the working directory.
df = pd.read_csv('../data/dating_app_behavior.csv')

In [245]:
# Preview the first seven raw rows.
df.head(n=7)

Unnamed: 0,gender,sexual_orientation,location_type,income_bracket,education_level,interest_tags,app_usage_time_min,app_usage_time_label,swipe_right_ratio,swipe_right_label,likes_received,mutual_matches,profile_pics_count,bio_length,message_sent_count,emoji_usage_rate,last_active_hour,swipe_time_of_day,match_outcome,age,height_cm,weight_kg,zodiac_sign,body_type,relationship_intent
0,Prefer Not to Say,Gay,Urban,High,Bachelor’s,"Fitness, Politics, Traveling",52,Moderate,0.6,Optimistic,173,23,4,44,75,0.36,13,Early Morning,Mutual Match,56,149,40.6,Taurus,Curvy,Friends Only
1,Male,Bisexual,Suburban,Upper-Middle,No Formal Education,"Languages, Fashion, Parenting",279,Extreme User,0.56,Optimistic,107,7,3,301,35,0.42,0,Morning,Chat Ignored,40,155,69.7,Leo,Plus Size,Hookups
2,Non-binary,Pansexual,Suburban,Low,Master’s,"Movies, Reading, DIY",49,Moderate,0.41,Optimistic,91,27,2,309,33,0.41,1,After Midnight,Date Happened,30,185,96.9,Sagittarius,Curvy,Serious Relationship
3,Genderfluid,Gay,Metro,Very Low,Postdoc,"Coding, Podcasts, History",185,Extreme User,0.32,Balanced,147,6,5,35,5,0.07,21,Morning,No Action,57,154,49.3,Taurus,Slim,Exploring
4,Male,Bisexual,Urban,Middle,Bachelor’s,"Clubbing, Podcasts, Cars",83,High,0.32,Balanced,94,11,1,343,34,0.11,22,After Midnight,One-sided Like,24,149,40.0,Libra,Slim,Casual Dating
5,Genderfluid,Lesbian,Small Town,High,Associate’s,"Anime, Clubbing, DIY",138,Addicted,0.62,Optimistic,163,22,4,112,87,0.18,10,Early Morning,Blocked,39,167,53.3,Virgo,Curvy,Exploring
6,Male,Asexual,Metro,Lower-Middle,High School,"Tech, Makeup, Parenting",161,Addicted,0.38,Balanced,54,20,3,202,82,0.32,14,Morning,No Action,44,168,81.3,Capricorn,Slim,Serious Relationship


In [246]:
# Widen the console display and lift the column limit so wide frames are shown in full.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

In [247]:
# Physical-attribute columns are removed: the project focuses on
# compatibility-based matching, so height, weight and body type are out of scope.
df = df.drop(['height_cm', 'weight_kg', 'body_type'], axis='columns')

In [248]:
def grouping_gender(g):
    """Collapse a raw gender label into one of four coarse groups.

    Returns 'female' or 'male' for the exact labels 'Female' / 'Male',
    'not_cis' for 'Transgender', 'Non-binary' or 'Genderfluid', and
    'gender_not_specified' for every other value (matching is case-sensitive).
    """
    if g == 'Female':
        return 'female'
    if g == 'Male':
        return 'male'
    is_not_cis = g in ('Transgender', 'Non-binary', 'Genderfluid')
    return 'not_cis' if is_not_cis else 'gender_not_specified'

# Element-wise mapping of the raw gender label to its coarse group.
df['gender_grouped'] = df['gender'].map(grouping_gender)

In [249]:
from sklearn.preprocessing import OneHotEncoder

# Dense, integer-valued one-hot encoder; the same `ohe` object is re-fitted
# by the later encoding cells.
ohe = OneHotEncoder(dtype=int, sparse_output=False)
gender_encoded = ohe.fit_transform(df[['gender_grouped']])

# Strip the "gender_grouped_" prefix so each indicator column is named after
# its category alone (e.g. "female", "not_cis").
column_names = [
    name.replace('gender_grouped_', '')
    for name in ohe.get_feature_names_out(['gender_grouped'])
]

# The encoded rows are in the same order as df's rows, so they carry df's own
# index. pd.concat(axis=1) aligns on index labels; without this, a df whose
# index is not 0..n-1 would be silently misaligned and padded with NaN rows.
genders_encoded_df = pd.DataFrame(gender_encoded, columns=column_names, index=df.index)
df = pd.concat([df, genders_encoded_df], axis=1)

# Register the new indicator columns as model features.
features_v1 = features_v1 + column_names


In [250]:
def grouping_sexual_orientation(g):
    """Collapse a raw sexual-orientation label into one of four groups.

    'Straight' -> 'heterosexual'; 'Gay' / 'Lesbian' -> 'homosexual';
    'Asexual' -> 'asexual'; every other value -> 'bi/pan/demi_sexual/queer'.
    """
    buckets = (
        ('heterosexual', ('Straight',)),
        ('homosexual', ('Gay', 'Lesbian')),
        ('asexual', ('Asexual',)),
    )
    for label, members in buckets:
        if g in members:
            return label
    # Catch-all: anything not matched above lands in the combined group.
    return 'bi/pan/demi_sexual/queer'

# Element-wise mapping of the raw orientation label to its coarse group.
df['sexual_orientation_grouped'] = df['sexual_orientation'].map(grouping_sexual_orientation)

In [251]:
# One-hot encode the grouped sexual-orientation column by re-fitting the shared
# `ohe` encoder. The array is kept under its own name so that the gender
# encoding held in `gender_encoded` is not overwritten by unrelated data.
sexual_orientation_encoded = ohe.fit_transform(df[['sexual_orientation_grouped']])

# Strip the "sexual_orientation_grouped_" prefix so each indicator column is
# named after its category alone.
column_names = [
    name.replace('sexual_orientation_grouped_', '')
    for name in ohe.get_feature_names_out(['sexual_orientation_grouped'])
]

# The encoded rows are in the same order as df's rows, so they carry df's own
# index. pd.concat(axis=1) aligns on index labels; without this, a df whose
# index is not 0..n-1 would be silently misaligned and padded with NaN rows.
sexual_orientation_encoded_df = pd.DataFrame(
    sexual_orientation_encoded, columns=column_names, index=df.index
)
df = pd.concat([df, sexual_orientation_encoded_df], axis=1)

# Register the new indicator columns as model features.
features_v1 = features_v1 + column_names

In [252]:
def grouping_income(g):
    """Map an income-bracket label to an ordinal tier.

    Returns 0 for 'Low' / 'Very Low', 1 for 'Lower-Middle' / 'Middle' /
    'Upper-Middle', and 2 for every other value.
    """
    if g in ('Low', 'Very Low'):
        return 0
    return 1 if g in ('Lower-Middle', 'Middle', 'Upper-Middle') else 2

# Ordinal income tier (0/1/2) derived from income_bracket, registered as a v1 feature.
df['income'] = df['income_bracket'].map(grouping_income)
features_v1 += ['income']

In [253]:
def grouping_relationship_intent(g):
    """Score a relationship-intent label on a 0-2 scale.

    Returns 2 for 'Serious Relationship', 1 for 'Hookups' / 'Casual Dating' /
    'Exploring', and 0 for every other value.
    """
    if g == 'Serious Relationship':
        return 2
    casual_intents = ('Hookups', 'Casual Dating', 'Exploring')
    # True -> 1, False -> 0
    return int(g in casual_intents)

# 0-2 romantic-intent score derived from relationship_intent, registered as a v1 feature.
df['romantic_relationship_intent'] = df['relationship_intent'].map(grouping_relationship_intent)
features_v1 += ['romantic_relationship_intent']

In [254]:
def grouping_education_level(g):
    """Map an education-level label to an ordinal tier.

    Returns 0 for 'No Formal Education' / 'High School' / 'Diploma',
    1 for 'Bachelor’s' / 'Associate’s' (spelled with a typographic
    apostrophe), and 2 for every other value.
    """
    below_degree = ('No Formal Education', 'High School', 'Diploma')
    undergraduate = ('Bachelor’s', 'Associate’s')
    if g in below_degree:
        return 0
    if g in undergraduate:
        return 1
    return 2

# Ordinal education tier (0/1/2) derived from education_level, registered as a v1 feature.
df['education_level_grouped'] = df['education_level'].map(grouping_education_level)
features_v1 += ['education_level_grouped']

In [255]:
def grouping_location_type(g):
    """Map a location-type label to an ordinal density score.

    Returns 2 for 'Urban' / 'Metro', 1 for 'Suburban' / 'Small Town',
    and 0 for every other value.
    """
    scored_groups = (
        (2, ('Urban', 'Metro')),
        (1, ('Suburban', 'Small Town')),
    )
    for score, members in scored_groups:
        if g in members:
            return score
    return 0

# Ordinal density score (0/1/2) derived from location_type, registered as a v1 feature.
df['location_density'] = df['location_type'].map(grouping_location_type)
features_v1 += ['location_density']

In [256]:
def grouping_zodiac_signs(g):
    """Map a zodiac sign to one of four element group labels.

    Air, fire and water signs are matched explicitly; every other value
    (including signs not listed below) is labelled 'earth_zodiac_sign'.
    """
    explicit_groups = (
        ('air_zodiac_sign', ('Aquarius', 'Gemini', 'Libra')),
        ('fire_zodiac_sign', ('Aries', 'Leo', 'Sagittarius')),
        ('water_zodiac_sign', ('Scorpio', 'Pisces', 'Cancer')),
    )
    for label, signs in explicit_groups:
        if g in signs:
            return label
    return 'earth_zodiac_sign'

# Element-wise mapping of each zodiac sign to its element group label.
df['zodiac_sign_grouped'] = df['zodiac_sign'].map(grouping_zodiac_signs)

In [257]:
# One-hot encode the grouped zodiac column by re-fitting the shared `ohe` encoder.
zodiac_sign_encoded = ohe.fit_transform(df[['zodiac_sign_grouped']])

# Strip the "zodiac_sign_grouped_" prefix so each indicator column is named
# after its element group alone (e.g. "air_zodiac_sign").
column_names = [
    name.replace('zodiac_sign_grouped_', '')
    for name in ohe.get_feature_names_out(['zodiac_sign_grouped'])
]

# The encoded rows are in the same order as df's rows, so they carry df's own
# index. pd.concat(axis=1) aligns on index labels; without this, a df whose
# index is not 0..n-1 would be silently misaligned and padded with NaN rows.
zodiac_sign_encoded_df = pd.DataFrame(zodiac_sign_encoded, columns=column_names, index=df.index)
df = pd.concat([df, zodiac_sign_encoded_df], axis=1)

# Register the new indicator columns as model features.
features_v1 = features_v1 + column_names

In [258]:
# Getting all the different tags available so I can group it
df['interest_tags_clean'] = df['interest_tags'].str.split(r',\s*')

unique_interests = set(tag.strip() for sublist in df['interest_tags_clean'] for tag in sublist)
print(sorted(unique_interests))

['Anime', 'Art', 'Astrology', 'Binge-Watching', 'Board Games', 'Cars', 'Clubbing', 'Coding', 'Cooking', 'Crafting', 'DIY', 'Dancing', 'Fashion', 'Fitness', 'Foodie', 'Gaming', 'Gardening', 'Hiking', 'History', 'Investing', 'K-pop', 'Languages', 'MMA', 'Makeup', 'Meditation', 'Memes', 'Motorcycling', 'Movies', 'Music', 'Painting', 'Parenting', 'Pets', 'Photography', 'Podcasts', 'Poetry', 'Politics', 'Reading', 'Running', 'Skating', 'Sneaker Culture', 'Social Activism', 'Spirituality', 'Stand-up Comedy', 'Startups', 'Tattoos', 'Tech', 'Traveling', 'Writing', 'Yoga']


In [259]:
# Interest tags listed under the broad category each one belongs to.
_tags_by_category = {
    'health': [
        'Fitness', 'Yoga', 'Meditation', 'Running',
        'Hiking', 'MMA', 'Skating', 'Dancing',
    ],
    'creativity': [
        'Painting', 'Writing', 'Poetry', 'Art', 'Crafting',
        'DIY', 'Photography', 'Fashion', 'Makeup', 'Tattoos',
    ],
    'entertainment': [
        'Movies', 'Music', 'Podcasts', 'Stand-up Comedy', 'Memes',
        'Binge-Watching', 'Anime', 'K-pop', 'Gaming', 'Board Games',
    ],
    'education_culture': [
        'Reading', 'History', 'Languages', 'Politics',
        'Coding', 'Tech', 'Startups', 'Investing',
    ],
    'lifestyle': [
        'Foodie', 'Traveling', 'Pets', 'Parenting', 'Sneaker Culture',
        'Gardening', 'Cooking', 'Cars', 'Motorcycling',
    ],
    'social': [
        'Spirituality', 'Astrology', 'Social Activism', 'Clubbing',
    ],
}

# Flat lookup: individual interest tag -> category name.
interest_map = {
    tag: category
    for category, tags in _tags_by_category.items()
    for tag in tags
}

In [260]:
def grouping_interests(row):
    """Translate a comma-separated tag string into a list of category names.

    Each tag is stripped of surrounding whitespace and looked up in
    `interest_map`; tags missing from the map become 'Other'. The result has
    one entry per tag, in the original tag order.
    """
    return [interest_map.get(tag.strip(), 'Other') for tag in row.split(',')]

# Per-row list of interest categories, one entry per tag the profile listed.
df['interest_categories'] = df['interest_tags'].apply(grouping_interests)

# Sorted so the interest_* columns are created in the same order on every run;
# iterating a bare set of strings depends on hash randomisation.
category_list = sorted(set(interest_map.values()))

# Number of tags each profile listed. Dividing by this, rather than a fixed 3,
# keeps each value a true share of the row's tags even when a profile lists
# fewer or more than three; for three-tag rows the result is unchanged.
# grouping_interests returns one entry per tag, so this is never zero.
tags_per_row = df['interest_categories'].apply(len)

for cat in category_list:
    # Share of the row's tags that fall into this category.
    category_counts = df['interest_categories'].apply(lambda x: x.count(cat))
    df[f'interest_{cat}'] = category_counts / tags_per_row


In [261]:
# Numeric columns used as-is, plus the six interest-share columns.
other_features = [
    'age',
    'bio_length',
    'likes_received',
    'emoji_usage_rate',
    'message_sent_count',
    'interest_lifestyle',
    'interest_health',
    'interest_creativity',
    'interest_education_culture',
    'interest_entertainment',
    'interest_social',
]
features_v1 = [*features_v1, *other_features]

# Preview the first seven rows of the assembled feature set.
df[features_v1].head(n=7)

Unnamed: 0,female,gender_not_specified,male,not_cis,asexual,bi/pan/demi_sexual/queer,heterosexual,homosexual,income,romantic_relationship_intent,education_level_grouped,location_density,air_zodiac_sign,earth_zodiac_sign,fire_zodiac_sign,water_zodiac_sign,age,bio_length,likes_received,emoji_usage_rate,message_sent_count,interest_lifestyle,interest_health,interest_creativity,interest_education_culture,interest_entertainment,interest_social
0,0,1,0,0,0,0,0,1,2,0,1,2,0,1,0,0,56,44,173,0.36,75,0.333333,0.333333,0.0,0.333333,0.0,0.0
1,0,0,1,0,0,1,0,0,1,1,0,1,0,0,1,0,40,301,107,0.42,35,0.333333,0.0,0.333333,0.333333,0.0,0.0
2,0,0,0,1,0,1,0,0,0,2,2,1,0,0,1,0,30,309,91,0.41,33,0.0,0.0,0.333333,0.333333,0.333333,0.0
3,0,0,0,1,0,0,0,1,0,1,2,2,0,1,0,0,57,35,147,0.07,5,0.0,0.0,0.0,0.666667,0.333333,0.0
4,0,0,1,0,0,1,0,0,1,1,1,2,1,0,0,0,24,343,94,0.11,34,0.333333,0.0,0.0,0.0,0.333333,0.333333
5,0,0,0,1,0,0,0,1,2,1,1,1,0,1,0,0,39,112,163,0.18,87,0.0,0.0,0.333333,0.0,0.333333,0.333333
6,0,0,1,0,1,0,0,0,1,2,0,2,0,1,0,0,44,202,54,0.32,82,0.333333,0.0,0.333333,0.333333,0.0,0.0


In [262]:
# Keep only the engineered feature columns, in features_v1 order, and write
# them to disk without the row index.
X_df = df[features_v1]
X_df.to_csv('../data/preprocesseddf.csv', index=False)