In [26]:
import numpy as np
import pandas as pd
from sentence_transformers import CrossEncoder, SentenceTransformer


In [7]:
df = pd.read_csv("../vso_ratataou_ace_mock_data.csv")

print(f"There are {len(df.columns)} columns")
df.head()


There are 33 columns


Unnamed: 0,Name,Pronouns,UFL Email,Phone,Socials,Year,Major,Other Orgs,"Role (0=Big,1=Little)",Preferred Littles,...,Favorite Food,"EarlyBird/NightOwl (0=Early,1=Night)",Extroversion (1-5),Good Advice (1-5),Plans Style (1-5),Study Frequency (1-5),Gym Frequency (1-5),Spending Habits (1-5),Friday Night,Additional Info (Optional)
0,Taylor Jarvis,they/them,daniel910@ufl.edu,(673)774-0860,@jeffrey63,0,Finance,VSO,1,,...,I would literally eat anything,1,4,4,3,3,1,3,gaming all night,I love hanging out at Plaza of Americas
1,Corey Knox,she/her,gabriel680@ufl.edu,1713894873,@chris19,0,Mechanical Engineering,FSA,1,,...,Asian,0,3,3,3,4,2,2,gaming all night,I love hanging out at Plaza of Americas
2,Mark Reyes,they/them,julie359@ufl.edu,(044)445-6922x353,@tracy38,0,Computer Science,VSO,1,,...,Greek/Mediterranean,1,3,3,3,3,1,5,gaming all night,
3,Kathleen Ballard,she/her,tara913@ufl.edu,119.318.8215,@james40,0,Finance,AAA,1,,...,Greek/Mediterranean,0,4,4,1,3,1,2,gaming all night,
4,Dawn Coleman,they/them,angela816@ufl.edu,931-155-4194x903,@joseph45,0,Data Science,HSA,1,,...,American,1,2,2,2,4,2,1,dinner with friends,


In [8]:
rename_map = {
    'Name': 'name',
    'Pronouns': 'pronouns',
    'UFL Email': 'ufl_email',
    'Phone': 'phone',
    'Socials': 'socials',
    'Year': 'year',
    'Major': 'major',
    'Other Orgs': 'other_orgs',
    'Role (0=Big,1=Little)': 'role',
    'Preferred Littles': 'preferred_littles',
    'Preferred Bigs': 'preferred_bigs',
    'Pairing Requests (Optional)': 'pairing_requests',
    'On/Off Campus (0=On,1=Off)': 'on_off_campus',
    'Has Car (0=No,1=Yes)': 'has_car',
    'Ideal Big/Little': 'ideal_big_little',
    'Looking For ACE': 'looking_for_ace',
    'Free Time': 'free_time',
    'Hobbies': 'hobbies',
    'Favorite Artists/Songs': 'favorite_artists_songs',
    'Icks': 'dislikes',
    'Talk for Hours About': 'talk_for_hours_about',
    'Self Description': 'self_description',
    'Best Joke': 'best_joke',
    'Favorite Food': 'favorite_food',
    'EarlyBird/NightOwl (0=Early,1=Night)': 'earlybird_nightowl',
    'Extroversion (1-5)': 'extroversion',
    'Good Advice (1-5)': 'good_advice',
    'Plans Style (1-5)': 'plans_style',
    'Study Frequency (1-5)': 'study_frequency',
    'Gym Frequency (1-5)': 'gym_frequency',
    'Spending Habits (1-5)': 'spending_habits',
    'Friday Night': 'friday_night',
    'Additional Info (Optional)': 'additional_info'
}
df = df.rename(columns=rename_map)
df.columns


Index(['name', 'pronouns', 'ufl_email', 'phone', 'socials', 'year', 'major',
       'other_orgs', 'role', 'preferred_littles', 'preferred_bigs',
       'pairing_requests', 'on_off_campus', 'has_car', 'ideal_big_little',
       'looking_for_ace', 'free_time', 'hobbies', 'favorite_artists_songs',
       'dislikes', 'talk_for_hours_about', 'self_description', 'best_joke',
       'favorite_food', 'earlybird_nightowl', 'extroversion', 'good_advice',
       'plans_style', 'study_frequency', 'gym_frequency', 'spending_habits',
       'friday_night', 'additional_info'],
      dtype='object')

In [9]:
"""
Name
UFL_Email

# Preference Filtering characteristics
hard:
Big/Little
Do Not Pair with

soft -- boost:
Major
Year difference
Org participation

# Feature Engineering -> Sentence-BERT
## Text Features
Free Time 
Hobbies
Self Description
dislikes
Talk for Hours About
Friday Night

## Categorical Features -> Embedding_Layer / One Hot
Major
EarlyBord/NightOwl

## Numerial Features -> Normalized 0-1
Extroversion
Good Advice
Plans Style
Study Frequency
Gym Frequency
Spending Habits

"""

'\nName\nUFL_Email\n\n# Preference Filtering characteristics\nhard:\nBig/Little\nDo Not Pair with\n\nsoft -- boost:\nMajor\nYear difference\nOrg participation\n\n# Feature Engineering -> Sentence-BERT\n## Text Features\nFree Time \nHobbies\nSelf Description\ndislikes\nTalk for Hours About\nFriday Night\n\n## Categorical Features -> Embedding_Layer / One Hot\nMajor\nEarlyBord/NightOwl\n\n## Numerial Features -> Normalized 0-1\nExtroversion\nGood Advice\nPlans Style\nStudy Frequency\nGym Frequency\nSpending Habits\n\n'

In [10]:
# Handling Duplicate entries by ufl_email -- keeping the latest entry
## Reoders dataframe by index, keep the last entry and drop everything else
df_cleaned = df.sort_index().drop_duplicates(subset=["ufl_email"], keep="last")

# Handling Empty Fields
df_cleaned = df_cleaned.fillna(np.NaN)


# Normalizing Numerical Features

In [11]:
def normalize_data(dataframe: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """
    Normalize data to a 0-1 range

    Args:
        dataframe: a pandas DataFrame
        columns: column names to be normalized

    Return the dataframe with normalized columns
    """
    
    df_out = dataframe.copy() # could remove this to modify in-place
    for col in columns:
        xmin = df_out[col].min()
        xmax = df_out[col].max()
        df_out[col] = (df_out[col] - xmin) / (xmax - xmin)
    
    return df_out

### This is how we normalize data! OR We could use sklearn.preprocessing MinMaxScaler
    

In [12]:
from sklearn.preprocessing import MinMaxScaler

columns_to_scale = ['extroversion', 'good_advice', 'plans_style', 'study_frequency', 'gym_frequency', 'spending_habits']
scaler = MinMaxScaler()

df_cleaned[columns_to_scale] = scaler.fit_transform(df_cleaned[columns_to_scale])

df_cleaned['extroversion']

0       0.666667
1       0.333333
2       0.333333
3       0.666667
4       0.000000
          ...   
1995    1.000000
1996    1.000000
1997    0.666667
1998    0.333333
1999    1.000000
Name: extroversion, Length: 1987, dtype: float64

# Text Features -> One single Sentence-SBERT Embedding

Why do we combine them into one single Embedding?
1. Many embeddings leads to noisier signal, and larger dimensions to work with

2. Multiple embedding lose cross context, where it only sees local patterns, and misses global patterns.

3. SBERT is trained on semantic similarity, and large conherent text

4. N text fields x 768  > 1 text field x 768 -- Model is faster with smaller embedding

In [13]:
text_col_combine = ['free_time', 'hobbies', 'self_description', 'dislikes', 'talk_for_hours_about', 'friday_night', 'additional_info']

# 1. For each column 2. fill NaN with an empty string 3. convert to str 4. join each row with comma as delimiter
df_cleaned['profile_text'] = df_cleaned[text_col_combine].fillna('').astype(str).apply('.'.join, axis=1)
df_cleaned['profile_text']

0       photography.traveling, music, movies.Ahead ten...
1       photography.music, reading.Defense stage fall ...
2       cooking.gaming.Seven hand across anything also...
3       traveling.traveling, photography.Situation dri...
4       cooking.gym, photography.Crime area strategy b...
                              ...                        
1995    gym.gym, movies, music.It explain response mat...
1996    traveling.photography, traveling.Song hear exe...
1997    movies.gaming.Often usually though fire succes...
1998    movies.cooking, movies, photography.Range beha...
1999    photography.traveling, reading, gaming.People ...
Name: profile_text, Length: 1987, dtype: object

In [25]:
# 1. Load a pre-trained Sentence model
model = SentenceTransformer("all-MiniLM-L6-v2")


df_cleaned['profile_text'] = df_cleaned['profile_text'].astype(str)
type(df_cleaned['profile_text'])
model.predict(df_cleaned['profile_text'])

pandas.core.series.Series

# Categorical Features
Categorical -> one-hot encode -> vectors

# Input Features
profile_embedding: Semantic Embedding 

categorical_vector: Yes/No for Filtering

numeric_vector: Normalized features on a StandardScale

In [None]:
"""
Purpose:
- Rename and clean columns
- Building a profile_text (biography) from many columns
- categorical encoding
- normalize scaling
"""

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from typing import List, Optional, Dict
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


import config

class FeatureEngineer:
    def __init__(
            self,
            profile_text: Optional[List[str]] = None,
            categorical_fields: Optional[list[str]] = None,
            numeric_fields: Optional[List[str]] = None
    ):
        self.profile_text = profile_text
        self.categorical_fields = categorical_fields
        self.numeric_fields = numeric_fields

        # ColumnTransformer Fit
        self.column_transformer: Optional[ColumnTransformer] = None
        self.fitted = False
    
    @staticmethod
    def rename_column(df: pd.DataFrame) -> pd.DataFrame:
        """
        Rename all columns in-place for the given column name map in config.py
        """
        df = df.rename(columns=config.RENAME_MAP)
        
        # lower all columns 
        df.columns = [col.lower() for col in df.columns]
        return df
    
    @staticmethod
    def build_profile_text(df: pd.DataFrame, text_fields: List[str]) -> pd.Series:
        """
        Combine our profile_text fields together by checking cases for each row

        Return:
            A 1D array or pd.Series that includes our profile_text
        """
        def join_row(row) -> str:
            """
            Check each row for:
                1. None values
                2. NaN values 
                3. String Casting
            Return: A Str
            """
            parts = []
            for field in text_fields:
                # verify the field exist, else it's empty str
                candidate_text = row.get(field, "")
                # If NaN -> empty str
                if pd.isna(candidate_text):
                    candidate_text = ""

                candidate_text = str(candidate_text).strip()
                if candidate_text:
                    parts.append(candidate_text)

            if parts:
                return ". ".join(parts)
            else:
                return ""
        
        return (df.apply(join_row, axis=1)) # Axis=1 : apply for each row at O(n) times

    def _clean_table(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Remove NaN, normalize whitespace/trailing spaces & consistent string types

        Return:
            A new copy of cleaned DataFrame
        """
        df_copy = df.copy()
        
        # Standardize all columns to str
        for col in df_copy.columns:
            # Fill empty fields with empty string + str cast
            df_copy[col] = df_copy[col].fillna("").astype(str)
        
        # Numeric coercion for numeric fields
        for col in self.numeric_fields:
            if col in df_copy.columns:
                # Cast integer if possible, else NaN upon TypeError
                df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce', downcast='float')
                df_copy[col] = df_copy[col].fillna(0).astype(float)
        return df_copy

    def fit(self, df: pd.DataFrame):
        """
        Performing preprocessing transformation on profile_text, categorical_fields, numeric_fields
        where we transformed:
            profile_text: : List[str]
            categorical_fields: One Hot Encoding
            numeric_field: Standard Scaler 
        
        Output: 
            update column_transforemr with our given DataFrame with a single matrix
        """
        print("Fitting DataFrame...")

        df = self.rename_column(df)
        df = self._clean_table(df)
        df['profile_text'] = self.build_profile_text(df, self.profile_text)
        
        # Building our ColumnTransformer through Onehot & StandardScaler
        transformers = []

        ##categorical_fields : One Hot Encoidng
        available_category = [c for c in self.categorical_fields if c in df.columns]
        if available_category:
            ohe = OneHotEncoder(handle_unknown='error', sparse_output=False)
            transformers.append(("OneHot" , ohe, available_category))

        ## numeric_fields : StandardScaler | Mean=0, Var=1 
        available_nums = [c for c in self.numeric_fields if c in df.columns]
        if available_nums:
            scaler = StandardScaler()
            transformers.append(('likert_scale', scaler, available_nums))
            
        if not transformers:
            raise ValueError("No categorical or numeric features available to fit ColumnTransformer.")

        self.column_transformer = ColumnTransformer(transformers=transformers, 
                                                    remainder='drop') # Drop other features not mentioned
        x_meta = self.column_transformer.fit_transform(df)
        self.fitted = True
        print("Finished Fitting.")
        print("Fit Status: ", self.fitted)
        return self

    def transform(self, df: pd.DataFrame) -> Dict[str, np.ndarray]:
        print("Transforming...")

        if not self.fitted or self.column_transformer is None:
            raise RuntimeError("FeatureEngineer must be fitted before transform(). Call fit() first.")

        df = self.rename_column(df)
        df = self._clean_table(df)
        df['profile_text'] = self.build_profile_text(df, self.profile_text)
        
        print("Our current DataFrame", df.head())

        x_meta = self.column_transformer.transform(df)
        # If OneHotEncoder is Sparse -> cast to dense | Empty fields are filled with zeros
        if hasattr(x_meta, "toarray"):
            x_meta = x_meta.toarray()
        
        print("Finished Transforming.")
        return {
            'profile_text': df['profile_text'].tolist(),
            'meta_features': x_meta,
            'index': df.index.to_numpy(),
            'raw_df': df
        }
    def fit_transform(self, df: pd.DataFrame) -> Dict[str, np.ndarray]:
        self.fit(df)
        return self.transform(df)

In [74]:
testFeature = FeatureEngineer(
    categorical_fields=config.DEFAULT_CATEGORICALS,
    numeric_fields=config.DEFAULT_NUMERICS,
    profile_text=config.DEFAULT_PROFILE_TEXT
)


In [75]:
test = testFeature.fit_transform(df=df)


Fitting DataFrame...
Finished Fitting.
Fit Status:  True
Transforming...
Our current DataFrame                name   pronouns           ufl_email              phone  \
0     Taylor Jarvis  they/them   daniel910@ufl.edu      (673)774-0860   
1        Corey Knox    she/her  gabriel680@ufl.edu         1713894873   
2        Mark Reyes  they/them    julie359@ufl.edu  (044)445-6922x353   
3  Kathleen Ballard    she/her     tara913@ufl.edu       119.318.8215   
4      Dawn Coleman  they/them   angela816@ufl.edu   931-155-4194x903   

      socials year                   major other_orgs role preferred_littles  \
0  @jeffrey63    0                 Finance        VSO    1                     
1    @chris19    0  Mechanical Engineering        FSA    1                     
2    @tracy38    0        Computer Science        VSO    1                     
3    @james40    0                 Finance        AAA    1                     
4   @joseph45    0            Data Science        HSA    1        

In [76]:
test['meta_features']

array([[ 0.        ,  0.        ,  0.        , ..., -0.42983946,
        -1.39758469, -0.00945811],
       [ 0.        ,  0.        ,  0.        , ...,  0.47129149,
        -0.6936891 , -0.71005889],
       [ 0.        ,  1.        ,  0.        , ..., -0.42983946,
        -1.39758469,  1.39174345],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  1.37242243,
         0.71410207,  0.69114267],
       [ 0.        ,  0.        ,  1.        , ...,  0.47129149,
        -1.39758469,  0.69114267],
       [ 0.        ,  1.        ,  0.        , ..., -1.33097041,
        -1.39758469,  0.69114267]])