In [23]:
import numpy as np
import pandas as pd
from embedding_pipeline import EmbeddingPipeline
from feature_engineer import FeatureEngineer
import matplotlib.pyplot as plt


In [20]:
# Read the mock applicant survey export that sits next to this notebook,
# report its dimensions, and preview the first few rows.
df = pd.read_csv("./vso_ratataou_ace_mock_data.csv")
n_applicants, n_columns = df.shape
print(f"{n_applicants} Applicants", f"{n_columns} Columns")
df.head()


2000 Applicants 33 Columns


Unnamed: 0,Name,Pronouns,UFL Email,Phone,Socials,Year,Major,Other Orgs,"Role (0=Big,1=Little)",Preferred Littles,...,Favorite Food,"EarlyBird/NightOwl (0=Early,1=Night)",Extroversion (1-5),Good Advice (1-5),Plans Style (1-5),Study Frequency (1-5),Gym Frequency (1-5),Spending Habits (1-5),Friday Night,Additional Info (Optional)
0,Taylor Jarvis,they/them,daniel910@ufl.edu,(673)774-0860,@jeffrey63,0,Finance,VSO,1,,...,I would literally eat anything,1,4,4,3,3,1,3,gaming all night,I love hanging out at Plaza of Americas
1,Corey Knox,she/her,gabriel680@ufl.edu,1713894873,@chris19,0,Mechanical Engineering,FSA,1,,...,Asian,0,3,3,3,4,2,2,gaming all night,I love hanging out at Plaza of Americas
2,Mark Reyes,they/them,julie359@ufl.edu,(044)445-6922x353,@tracy38,0,Computer Science,VSO,1,,...,Greek/Mediterranean,1,3,3,3,3,1,5,gaming all night,
3,Kathleen Ballard,she/her,tara913@ufl.edu,119.318.8215,@james40,0,Finance,AAA,1,,...,Greek/Mediterranean,0,4,4,1,3,1,2,gaming all night,
4,Dawn Coleman,they/them,angela816@ufl.edu,931-155-4194x903,@joseph45,0,Data Science,HSA,1,,...,American,1,2,2,2,4,2,1,dinner with friends,


TypeError: bar() missing 1 required positional argument: 'height'

In [3]:
# Pull the column rename map and the default field lists (categorical, numeric,
# free-text profile fields) from the project config module.
# The bare expression on the last line displays the free-text profile field
# names as this cell's output.
from config import RENAME_MAP, DEFAULT_CATEGORICALS, DEFAULT_NUMERICS, DEFAULT_PROFILE_TEXT
DEFAULT_PROFILE_TEXT

['free_time',
 'hobbies',
 'self_description',
 'icks',
 'talk_for_hours_about',
 'friday_night',
 'additional_info']

In [4]:
# Build the feature engineer from the configured field lists; it is used below
# to standardize applicant responses before embedding.
FEATURE_ENGINE = FeatureEngineer(
    profile_text=DEFAULT_PROFILE_TEXT,
    categorical_fields=DEFAULT_CATEGORICALS,
    numeric_fields=DEFAULT_NUMERICS,
)


In [5]:
# Rename the raw survey headers via the feature engineer's rename step.
# NOTE(review): this rebinds `df`, so re-running the cell passes the
# already-renamed frame back in — confirm rename_column tolerates that, or
# bind the result to a new name.
df = FEATURE_ENGINE.rename_column(df)

In [6]:
# Fit the feature engineer on the renamed applicants and transform them in one
# call, then list which feature groups came back.
# fit_transform itself logs progress and a DataFrame preview to stdout
# (visible in this cell's saved output).
transformed_features = FEATURE_ENGINE.fit_transform(df)

print("-------")
feature_groups = transformed_features.keys()
print("features includes:", feature_groups)

Fitting DataFrame...
Finished Fitting.
Fit Status:  True
Transforming...
Our current DataFrame                name   pronouns           ufl_email              phone  \
0     Taylor Jarvis  they/them   daniel910@ufl.edu      (673)774-0860   
1        Corey Knox    she/her  gabriel680@ufl.edu         1713894873   
2        Mark Reyes  they/them    julie359@ufl.edu  (044)445-6922x353   
3  Kathleen Ballard    she/her     tara913@ufl.edu       119.318.8215   
4      Dawn Coleman  they/them   angela816@ufl.edu   931-155-4194x903   

      socials year                   major other_orgs role preferred_littles  \
0  @jeffrey63    0                 Finance        VSO    1                     
1    @chris19    0  Mechanical Engineering        FSA    1                     
2    @tracy38    0        Computer Science        VSO    1                     
3    @james40    0                 Finance        AAA    1                     
4   @joseph45    0            Data Science        HSA    1        

In [7]:

# Split the transformed output into its two feature groups.
# Despite its name, TEXT_EMBEDDINGS still holds raw profile strings here (the
# element type check at the end of this cell displays `str`); the text is only
# encoded later, when the embedding pipeline combines the features.
META_FEATURES = transformed_features["meta_features"]
TEXT_EMBEDDINGS = transformed_features["profile_text"]

# Sanity-check the container types, then show the type of a single text entry.
print("Meta Type: ", type(META_FEATURES))
print("Profile text Type: ", type(TEXT_EMBEDDINGS))
type(TEXT_EMBEDDINGS[0])


Meta Type:  <class 'numpy.ndarray'>
Profile text Type:  <class 'numpy.ndarray'>


str

In [8]:
# Create the embedding pipeline, naming the SBERT checkpoint to use for text.
# NOTE(review): the model appears to load lazily — the "Loading Sentence
# Transformer Model ..." log shows up in the combine step's output, not here.
EMBEDDING_PIPELINE = EmbeddingPipeline(sbert_model_name='all-MiniLM-L6-v2')

In [9]:
# Encode the profile text and merge it with the meta features into a single
# per-applicant feature matrix (the pipeline logs both steps to stdout).
combined_features = EMBEDDING_PIPELINE.combine_features(
    meta_features=META_FEATURES,
    text_features=TEXT_EMBEDDINGS,
)

Encoding Text Features...
Encoding Text Features...
Loading Sentence Transformer Model ...


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Combining text embeddings & meta features...


In [11]:
# Display the dimensions of the combined feature matrix
# (saved output: 2000 rows, one per applicant, by 398 feature columns).
combined_features.shape

(2000, 398)

In [17]:
# Experiment: score every applicant against every other applicant using the
# SentenceTransformer similarity helper on the combined feature matrix.
# Passing the same matrix twice yields a square pairwise score matrix (the
# saved output is symmetric with 1.0000 on the diagonal).
# NOTE(review): a second SBERT model is loaded here only to call
# `.similarity`; presumably the weights are not needed for that call —
# confirm whether the model already held by EMBEDDING_PIPELINE can be reused.
# NOTE(review): combined_features mixes meta features with text embeddings;
# confirm both groups are on comparable scales before trusting these scores.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
similarities = model.similarity(combined_features, combined_features)

similarities

tensor([[ 1.0000,  0.1700,  0.4749,  ...,  0.2712,  0.3014,  0.6285],
        [ 0.1700,  1.0000,  0.1110,  ..., -0.1412,  0.5170, -0.0235],
        [ 0.4749,  0.1110,  1.0000,  ..., -0.0307,  0.5365,  0.6321],
        ...,
        [ 0.2712, -0.1412, -0.0307,  ...,  1.0000, -0.0510, -0.0805],
        [ 0.3014,  0.5170,  0.5365,  ..., -0.0510,  1.0000,  0.2364],
        [ 0.6285, -0.0235,  0.6321,  ..., -0.0805,  0.2364,  1.0000]])

In [18]:
# Display the dimensions of the pairwise similarity matrix
# (saved output: torch.Size([2000, 2000]), i.e. applicants x applicants).
similarities.shape

torch.Size([2000, 2000])