In [3]:
import numpy as np
import pandas as pd

In [15]:
# Load the mock-data CSV from the local ./data directory into a DataFrame.
csv_path = "./data/vso_ratataou_ace_mock_data.csv"
df = pd.read_csv(csv_path)

# Report how many columns were read, then show the raw header labels as the
# cell's output.
print(f"There are {len(df.columns)} columns")
df.columns


There are 33 columns


Index(['Name', 'Pronouns', 'UFL Email', 'Phone', 'Socials', 'Year', 'Major',
       'Other Orgs', 'Role (0=Big,1=Little)', 'Preferred Littles',
       'Preferred Bigs', 'Pairing Requests (Optional)',
       'On/Off Campus (0=On,1=Off)', 'Has Car (0=No,1=Yes)',
       'Ideal Big/Little', 'Looking For ACE', 'Free Time', 'Hobbies',
       'Favorite Artists/Songs', 'Icks', 'Talk for Hours About',
       'Self Description', 'Best Joke', 'Favorite Food',
       'EarlyBird/NightOwl (0=Early,1=Night)', 'Extroversion (1-5)',
       'Good Advice (1-5)', 'Plans Style (1-5)', 'Study Frequency (1-5)',
       'Gym Frequency (1-5)', 'Spending Habits (1-5)', 'Friday Night',
       'Additional Info (Optional)'],
      dtype='object')

In [17]:
# Map each verbose CSV header to a short snake_case column name.
rename_map = {
    # Identity and contact
    "Name": "name",
    "Pronouns": "pronouns",
    "UFL Email": "ufl_email",
    "Phone": "phone",
    "Socials": "socials",
    # Academic background
    "Year": "year",
    "Major": "major",
    "Other Orgs": "other_orgs",
    # Pairing role and preferences
    "Role (0=Big,1=Little)": "role",
    "Preferred Littles": "preferred_littles",
    "Preferred Bigs": "preferred_bigs",
    "Pairing Requests (Optional)": "pairing_requests",
    # Logistics
    "On/Off Campus (0=On,1=Off)": "on_off_campus",
    "Has Car (0=No,1=Yes)": "has_car",
    # Free-text answers
    "Ideal Big/Little": "ideal_big_little",
    "Looking For ACE": "looking_for_ace",
    "Free Time": "free_time",
    "Hobbies": "hobbies",
    "Favorite Artists/Songs": "favorite_artists_songs",
    "Icks": "dislikes",  # the one entry whose new name is not derived from the header text
    "Talk for Hours About": "talk_for_hours_about",
    "Self Description": "self_description",
    "Best Joke": "best_joke",
    "Favorite Food": "favorite_food",
    # Binary and 1-5 scale answers
    "EarlyBird/NightOwl (0=Early,1=Night)": "earlybird_nightowl",
    "Extroversion (1-5)": "extroversion",
    "Good Advice (1-5)": "good_advice",
    "Plans Style (1-5)": "plans_style",
    "Study Frequency (1-5)": "study_frequency",
    "Gym Frequency (1-5)": "gym_frequency",
    "Spending Habits (1-5)": "spending_habits",
    # Remaining free-text answers
    "Friday Night": "friday_night",
    "Additional Info (Optional)": "additional_info",
}

# Rebind `df` to the renamed frame; headers that are not keys of rename_map are
# left as they are. The last line displays the resulting column labels.
df = df.rename(columns=rename_map)
df.columns


Index(['name', 'pronouns', 'ufl_email', 'phone', 'socials', 'year', 'major',
       'other_orgs', 'role', 'preferred_littles', 'preferred_bigs',
       'pairing_requests', 'on_off_campus', 'has_car', 'ideal_big_little',
       'looking_for_ace', 'free_time', 'hobbies', 'favorite_artists_songs',
       'dislikes', 'talk_for_hours_about', 'self_description', 'best_joke',
       'favorite_food', 'earlybird_nightowl', 'extroversion', 'good_advice',
       'plans_style', 'study_frequency', 'gym_frequency', 'spending_habits',
       'friday_night', 'additional_info'],
      dtype='object')

In [None]:
"""
Name
UFL_Email

# Preference Filtering characteristics
hard:
Big/Little
Do Not Pair with

soft -- boost:
Major
Year difference
Org participation

# Feature Engineering -> Sentence-BERT
## Text Features
Free Time 
Hobbies
Self Description
dislikes
Talk for Hours About
Friday Night

## Categorical Features -> Embedding_Layer / One Hot
Major
EarlyBird/NightOwl

## Numerical Features -> Normalized 0-1
Extroversion
Good Advice
Plans Style
Study Frequency
Gym Frequency
Spending Habits

"""

In [None]:
# Handling duplicate entries by ufl_email -- keeping the latest entry
## Reorders the dataframe by index, keeps the last row for each email and drops the rest
# NOTE(review): "latest" assumes a higher index means a later submission -- confirm.
# NOTE(review): drop_duplicates compares missing values as equal, so rows with a
# missing ufl_email would collapse into a single row -- confirm that is intended.
df_cleaned = df.sort_index().drop_duplicates(subset=["ufl_email"], keep="last")

# Handling empty fields: represent missing values as NaN.
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0
# and raises AttributeError there.
df_cleaned = df_cleaned.fillna(np.nan)


147.4313399518569

# Normalizing Numerical Features

In [None]:
def normalize_data(dataframe: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """
    Min-max normalize the given columns to a 0-1 range.

    Each listed column is rescaled independently as
    ``(x - min) / (max - min)`` using that column's own observed min and max.

    Args:
        dataframe: a pandas DataFrame; it is not modified.
        columns: names of the numeric columns to be normalized.

    Returns:
        A copy of ``dataframe`` in which the listed columns are normalized.
        Columns not listed are returned unchanged. Missing values stay
        missing. A column whose values are all identical has a zero range;
        it is mapped to 0.0 (the same result sklearn's MinMaxScaler gives)
        instead of being turned into NaN by a 0/0 division.

    Raises:
        KeyError: if a name in ``columns`` is not a column of ``dataframe``.
    """

    df_out = dataframe.copy()  # work on a copy so the caller's frame is untouched
    for col in columns:
        xmin = df_out[col].min()
        xmax = df_out[col].max()
        value_range = xmax - xmin

        # Distance of every value from the column minimum; NaN entries stay NaN.
        shifted = df_out[col] - xmin

        if value_range == 0:
            # Constant column: every non-missing value equals the minimum, so
            # `shifted` is already all zeros. Dividing by the zero range would
            # yield NaN for the whole column, so keep the zeros (as floats).
            df_out[col] = shifted.astype(float)
        else:
            df_out[col] = shifted / value_range

    return df_out

### This is how we normalize data! OR We could use sklearn.preprocessing MinMaxScaler
    

In [28]:
from sklearn.preprocessing import MinMaxScaler

# Numeric rating columns that are rescaled to the 0-1 range.
columns_to_scale = [
    'extroversion',
    'good_advice',
    'plans_style',
    'study_frequency',
    'gym_frequency',
    'spending_habits',
]
scaler = MinMaxScaler()

# Learn each column's min/max, then overwrite those columns of df_cleaned with
# the scaled values.
# NOTE(review): the scaler uses the min/max observed in the data, not the 1-5
# range named in the survey headers. The extroversion values displayed below are
# multiples of 1/3, which suggests an observed span of 3 rather than 4 --
# confirm whether scaling against the fixed 1-5 range was intended.
scaler.fit(df_cleaned[columns_to_scale])
df_cleaned[columns_to_scale] = scaler.transform(df_cleaned[columns_to_scale])

df_cleaned['extroversion']

0       0.666667
1       0.333333
2       0.333333
3       0.666667
4       0.000000
          ...   
1995    1.000000
1996    1.000000
1997    0.666667
1998    0.333333
1999    1.000000
Name: extroversion, Length: 1987, dtype: float64

# Text Features -> One Single Sentence-BERT (SBERT) Embedding

Why do we combine them into one single Embedding?
1. Many embeddings lead to a noisier signal and more dimensions to work with

2. Multiple embeddings lose cross-field context: each one sees only local patterns and misses global patterns.

3. SBERT is trained on semantic similarity and on large, coherent text

4. N text fields x 768  > 1 text field x 768 -- Model is faster with smaller embedding

In [36]:
# Free-text answer columns that are merged into one profile string per row.
text_col_combine = [
    'free_time',
    'hobbies',
    'self_description',
    'dislikes',
    'talk_for_hours_about',
    'friday_night',
    'additional_info',
]

# Per row: replace missing answers with '', cast every value to str, then join
# the values with '.' (a period, no trailing space) in the order listed above.
# NOTE(review): an empty answer still contributes its '.' separator, and the
# missing space fuses neighbouring answers (e.g. "photography.traveling") --
# confirm this is the text format wanted for the sentence embedding.
df_cleaned['profile_text'] = (
    df_cleaned[text_col_combine]
    .fillna('')
    .astype(str)
    .apply(lambda answers: '.'.join(answers), axis=1)
)
df_cleaned['profile_text']

0       photography.traveling, music, movies.Ahead ten...
1       photography.music, reading.Defense stage fall ...
2       cooking.gaming.Seven hand across anything also...
3       traveling.traveling, photography.Situation dri...
4       cooking.gym, photography.Crime area strategy b...
                              ...                        
1995    gym.gym, movies, music.It explain response mat...
1996    traveling.photography, traveling.Song hear exe...
1997    movies.gaming.Often usually though fire succes...
1998    movies.cooking, movies, photography.Range beha...
1999    photography.traveling, reading, gaming.People ...
Name: profile_text, Length: 1987, dtype: object