In [1]:
import joblib
# Standard libraries
import pandas as pd  # For data manipulation
import numpy as np   # For numerical computations
import re            # For regular expressions (text cleaning)

# Scikit-learn tools
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder  # For scaling and encoding
from sklearn.feature_extraction.text import TfidfVectorizer    # For text vectorization (interests, bio, etc.)
from sklearn.metrics.pairwise import cosine_similarity         # For similarity measurement between text vectors

# Natural Language Processing (NLP) - NLTK
from nltk.corpus import stopwords                              # To remove common stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer         # For stemming and lemmatization
from nltk.tokenize import word_tokenize                        # Tokenizer for breaking text into words

# Text normalization
from unidecode import unidecode                                # To remove accents from text (e.g., café → cafe)

# Geographic distance
from geopy.distance import geodesic                            # To compute distance between two geographic coordinates

# Counter utility for frequency counting
from collections import Counter

# Download the NLTK resources used below. Each call is a no-op when the
# package is already present in the local nltk_data directory.
import nltk
nltk.download('punkt')       # Tokenizer models used by word_tokenize on older NLTK releases
nltk.download('punkt_tab')   # Newer NLTK releases look up 'punkt_tab' instead of 'punkt' for word_tokenize
nltk.download('stopwords')   # Needed to remove common stopwords
nltk.download('wordnet')     # Needed for lemmatization

[nltk_data] Downloading package punkt to /Users/truongvu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/truongvu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/truongvu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# --- Phase 2: Feature Engineering ---
print("--- Phase 2: Feature Engineering ---")
print("\n--- 2.0 Loading Processed Data from Phase 1 ---")

try:
    # Load the profile and training-pair datasets saved by Phase 1
    profiles_df = pd.read_csv("../data/profiles_processed_phase1.csv")
    training_data_df = pd.read_csv("../data/training_pairs_phase1.csv")
except FileNotFoundError:
    # Explain what is missing, then re-raise: every later cell needs
    # `profiles_df` / `training_data_df`, so continuing would only surface
    # as an unrelated NameError further down the notebook.
    print("Error: Could not find 'profiles_processed_phase1.csv' or 'training_pairs_phase1.csv'.")
    print("Make sure Phase 1 has been executed and the output files were saved correctly.")
    raise
else:
    print("Successfully loaded preprocessed data from Phase 1!")

--- Phase 2: Feature Engineering ---

--- 2.0 Loading Processed Data from Phase 1 ---
Successfully loaded preprocessed data from Phase 1!


In [3]:
# --- 2.1 Create User Features ---
print("\n--- 2.1 Creating User Features ---")
# Work on a copy so the raw Phase 1 profiles stay available for the pairwise step.
user_features_df = profiles_df.copy()

# --- 2.1.1 Handle and Normalize Numerical Features ---
print("\n--- Handling numerical features ---")

# 'age' is assumed to have been calculated in Phase 1.
# Both 'age' and 'height' are median-imputed and then min-max scaled.

numerical_cols = ['age', 'height']
for col in numerical_cols:
    if user_features_df[col].isnull().any():
        median_val = user_features_df[col].median()
        # Assign the filled column back instead of calling
        # `df[col].fillna(..., inplace=True)`: that form mutates a temporary
        # selection, emits a FutureWarning on pandas 2.x and has no effect
        # once Copy-on-Write is enabled.
        user_features_df[col] = user_features_df[col].fillna(median_val)
        print(f"Filled missing values in '{col}' with median: {median_val}")

# Normalize 'age'; the fitted scaler is persisted for reuse outside this notebook.
scaler_age = MinMaxScaler()
user_features_df['age_scaled'] = scaler_age.fit_transform(user_features_df[['age']])

joblib.dump(scaler_age, "../models/scaler_age.joblib")

# Normalize 'height' with its own scaler, persisted the same way.
scaler_height = MinMaxScaler()
user_features_df['height_scaled'] = scaler_height.fit_transform(user_features_df[['height']])
joblib.dump(scaler_height, "../models/scaler_height.joblib")

print("Successfully normalized 'age' and 'height'.")


--- 2.1 Creating User Features ---

--- Handling numerical features ---
Successfully normalized 'age' and 'height'.


In [4]:
# --- 2.1.2 Encode Categorical Features ---
print("\n--- Encoding categorical features ---")

# Low-cardinality columns that are one-hot encoded in full
categorical_cols_onehot = ['sex', 'orientation', 'body_type', 'drink', 'smoke']

# Fill missing values in categorical columns with the mode (most frequent value)
for col in categorical_cols_onehot:
    if user_features_df[col].isnull().any():
        mode_val = user_features_df[col].mode()[0]
        # Assign back rather than `df[col].fillna(..., inplace=True)`, which
        # operates on a temporary selection (FutureWarning on pandas 2.x,
        # silently ineffective under Copy-on-Write).
        user_features_df[col] = user_features_df[col].fillna(mode_val)
        print(f"Filled missing values in '{col}' with mode: {mode_val}")

# Dense output so the result can be wrapped in a DataFrame directly;
# handle_unknown='ignore' makes categories unseen at fit time encode as all zeros.
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform categorical columns
encoded_cols = onehot_encoder.fit_transform(user_features_df[categorical_cols_onehot])

# Convert encoded numpy array into a DataFrame with proper column names
encoded_df = pd.DataFrame(
    encoded_cols,
    columns=onehot_encoder.get_feature_names_out(categorical_cols_onehot),
    index=user_features_df.index  # Ensure index aligns with original DataFrame
)

# Replace the raw categorical columns with their encoded counterparts
user_features_df = pd.concat([user_features_df.drop(columns=categorical_cols_onehot), encoded_df], axis=1)
joblib.dump(onehot_encoder, "../models/onehot_encoder_categorical.joblib")

print(f"One-Hot Encoded the following categorical columns: {categorical_cols_onehot}")


--- Encoding categorical features ---
One-Hot Encoded the following categorical columns: ['sex', 'orientation', 'body_type', 'drink', 'smoke']


In [5]:
# --- Encode high-cardinality categorical columns using Top-N One-Hot Encoding ---
print("\n--- Encoding high-cardinality categorical columns ---")

# Utility function: Encode Top-N frequent categories, others grouped into 'other'
def encode_top_n_categorical(df, column, top_n=10, prefix=None):
    """
    One-hot encode the `top_n` most frequent values of `column`.

    Every value outside the top N (including missing values) is flagged in a
    single '<prefix>_other' column. The indicator columns are added to `df`
    itself; the returned frame is a copy of it without `column`.

    Parameters:
        df (DataFrame): Input DataFrame (receives the new indicator columns)
        column (str): Column to encode
        top_n (int): Number of most frequent categories to keep
        prefix (str): Prefix for the new columns; falls back to `column`

    Returns:
        DataFrame: Copy of `df` with the indicator columns and without `column`
    """
    def _slug(value):
        # Column-safe suffix: ASCII-fold, lowercase, then rewrite/strip separators
        # in the same order as the replacements are listed.
        text = unidecode(str(value)).lower()
        for old, new in ((' ', '_'), ('/', '_'), ('(', ''), (')', ''), ('.', '')):
            text = text.replace(old, new)
        return text

    # Most frequent categories, by descending count
    frequent_values = df[column].value_counts().nlargest(top_n).index

    # One 0/1 indicator per frequent category
    for value in frequent_values:
        is_value = df[column] == value
        df[f"{prefix or column}_{_slug(value)}"] = is_value.astype(int)

    # Everything that is not a frequent category lands in the catch-all column
    is_frequent = df[column].isin(frequent_values)
    df[f"{prefix or column}_other"] = (~is_frequent).astype(int)

    # The raw column is no longer needed once encoded
    return df.drop(columns=[column])

# --- Encode 'job' column ---
if 'job' in user_features_df.columns:
    # Missing jobs become their own 'unknown' category before counting
    user_features_df['job'] = user_features_df['job'].fillna('unknown')
    # Get the top categories for 'job' and persist them
    # (presumably so the same columns can be rebuilt outside this notebook; confirm)
    job_counts = user_features_df['job'].value_counts()
    top_n_job_actual = 15 # Same top_n that is passed to encode_top_n_categorical below
    top_job_categories_list = job_counts.nlargest(top_n_job_actual).index.tolist()
    joblib.dump(top_job_categories_list, "../models/top_n_job_categories.joblib")
    print(f"Saved top_n_job_categories.joblib (Top {len(top_job_categories_list)} categories)")

    # Replace 'job' with one indicator per top category plus 'job_other'
    user_features_df = encode_top_n_categorical(user_features_df, 'job', top_n=top_n_job_actual, prefix='job')
    print("Encoded Top-N categories for 'job'.")

# --- Encode 'education_level' column ---
if 'education_level' in user_features_df.columns:
    # Missing education levels become their own 'unknown' category before counting
    user_features_df['education_level'] = user_features_df['education_level'].fillna('unknown')
    # Get the top categories for 'education_level' and persist them
    edu_counts = user_features_df['education_level'].value_counts()
    top_n_edu_actual = 7 # Same top_n that is passed to encode_top_n_categorical below
    top_edu_categories_list = edu_counts.nlargest(top_n_edu_actual).index.tolist()
    joblib.dump(top_edu_categories_list, "../models/top_n_edu_categories.joblib")
    print(f"Saved top_n_edu_categories.joblib (Top {len(top_edu_categories_list)} categories)")

    # Replace 'education_level' with one indicator per top category plus 'edu_other'
    user_features_df = encode_top_n_categorical(user_features_df, 'education_level', top_n=top_n_edu_actual, prefix='edu')
    print("Encoded Top-N categories for 'education_level'.")


--- Encoding high-cardinality categorical columns ---
Saved top_n_job_categories.joblib (Top 14 categories)
Encoded Top-N categories for 'job'.
Saved top_n_edu_categories.joblib (Top 6 categories)
Encoded Top-N categories for 'education_level'.


In [6]:
# --- 'dropped_out_school': missing values are treated as 0, then cast to int ---
if 'dropped_out_school' in user_features_df:
    dropped_out_filled = user_features_df['dropped_out_school'].fillna(0)
    user_features_df['dropped_out_school'] = dropped_out_filled.astype(int)
    print("Handled missing values in 'dropped_out_school'.")

# --- 'interested_in_new_language': missing values are treated as 0, then cast to int ---
if 'interested_in_new_language' in user_features_df:
    new_language_filled = user_features_df['interested_in_new_language'].fillna(0)
    user_features_df['interested_in_new_language'] = new_language_filled.astype(int)
    print("Handled missing values in 'interested_in_new_language'.")

Handled missing values in 'dropped_out_school'.
Handled missing values in 'interested_in_new_language'.


In [7]:
# Load English stopwords as a set for fast membership tests
# (the list can be extended manually, e.g. with Vietnamese stopwords, later)
stop_words_en = set(stopwords.words('english'))

# Shared lemmatizer instance used by preprocess_text.
# Only the lemmatizer is created here; no stemmer is initialised at module level.
lemmatizer = WordNetLemmatizer()

# --- Text preprocessing function ---
def preprocess_text(text, use_stemming=False, use_lemmatization=True):
    """
    Cleans and preprocesses raw text input.

    Steps: lowercase and strip accents, delete punctuation and digits,
    tokenize, drop English stopwords and one-character tokens, then
    optionally lemmatize and/or stem the remaining tokens.

    Parameters:
        text (str): Raw input text; null values yield an empty string
        use_stemming (bool): Whether to apply Porter stemming
        use_lemmatization (bool): Whether to apply WordNet lemmatization

    Returns:
        str: Cleaned tokens joined by single spaces

    Note:
        The two flags are independent. Previously `use_stemming=True` was
        silently ignored unless `use_lemmatization=False` was passed as well;
        now, when both are set, tokens are lemmatized first and then stemmed.
        The default call (lemmatization only) behaves exactly as before.
    """
    if pd.isnull(text):
        return ""

    # Normalize text: lowercase, remove accents
    text = unidecode(str(text).lower())

    # Remove punctuation and numbers. Characters are deleted, not replaced by
    # a space, so "rock-climbing" becomes "rockclimbing".
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords and very short tokens
    tokens = [word for word in tokens if word not in stop_words_en and len(word) > 1]

    # Apply each requested normalisation independently
    if use_lemmatization:
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    if use_stemming:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    return " ".join(tokens)

def create_multivalue_binary_features(df, column, separator=',', top_n=20, prefix=None, drop_original=False):
    """
    Converts a multi-valued column into multiple binary columns based on the top-N most frequent items.

    Each cell is split on `separator`; every item is stripped, lowercased and
    ASCII-folded before counting. One 0/1 column named '<prefix>_<item>' is
    added to `df` for each of the `top_n` most frequent items. Null cells
    contribute no items and get 0 in every new column.

    Parameters:
        df (pd.DataFrame): Input DataFrame (modified in place)
        column (str): Name of the column to process
        separator (str): Delimiter used to split multi-values (e.g. ',', ';', '/')
        top_n (int): Number of most frequent items to encode
        prefix (str): Prefix to use for new columns; falls back to `column`
        drop_original (bool): Whether to drop the original column after encoding

    Returns:
        pd.DataFrame: The same DataFrame object, with binary features added
    """

    def _split_items(value):
        # Normalise one cell into its list of non-empty items.
        if pd.isnull(value):
            return []
        return [
            unidecode(part.strip().lower())
            for part in str(value).split(separator)
            if part.strip()
        ]

    # Tokenise every row exactly once. The previous implementation re-split and
    # re-normalised every row again for each of the top-N items.
    row_items = [_split_items(value) for value in df[column]]

    # Count frequency and keep top-N most common items (ties keep first-seen order)
    item_counts = Counter(item for items in row_items for item in items)
    top_items = [item for item, _ in item_counts.most_common(top_n)]

    # Sets give constant-time membership tests when filling the indicator columns
    row_item_sets = [set(items) for items in row_items]

    # Create a binary column for each top item
    for item in top_items:
        clean_item = re.sub(r'\W+', '_', item)  # Sanitize column name
        new_col_name = f"{prefix or column}_{clean_item}"
        # Positional assignment: one 0/1 value per row, in row order
        df[new_col_name] = [int(item in items) for items in row_item_sets]

    # Optionally drop the original column
    if drop_original:
        df.drop(columns=[column], inplace=True)

    return df

In [8]:
print("\n--- Processing multi-value text features ---")

# Process 'interests'
if 'interests' in user_features_df.columns:
    user_features_df['interests'] = user_features_df['interests'].fillna('')
    top_n_interests_actual = 30  # Same top_n that is passed to the encoder below
    # Collect every normalised interest across all users (items are '-'-separated)
    all_interests_items = [
        unidecode(token.strip().lower())
        for entry in user_features_df['interests'].dropna()
        for token in str(entry).split('-')
        if token.strip()
    ]
    interest_counts = Counter(all_interests_items)
    top_interests_items_list = [
        name for name, _ in interest_counts.most_common(top_n_interests_actual)
    ]
    # Persist the top items alongside the other fitted artifacts
    joblib.dump(top_interests_items_list, "../models/top_interests_items.joblib")
    print(f"Saved top_interests_items.joblib (Top {len(top_interests_items_list)} items)")

    user_features_df = create_multivalue_binary_features(
        user_features_df,
        column='interests',
        separator='-',
        top_n=top_n_interests_actual,
        prefix='interest',
    )
    print("Binary features created for 'interests'.")

# Process 'languages'
if 'languages' in user_features_df.columns:
    user_features_df['languages'] = user_features_df['languages'].fillna('')
    top_n_languages_actual = 15  # Same top_n that is passed to the encoder below
    # Collect every normalised language across all users
    all_languages_items = [
        unidecode(token.strip().lower())
        for entry in user_features_df['languages'].dropna()
        for token in str(entry).split('-')
        if token.strip()
    ]
    language_counts = Counter(all_languages_items)
    top_languages_items_list = [
        name for name, _ in language_counts.most_common(top_n_languages_actual)
    ]
    joblib.dump(top_languages_items_list, "../models/top_languages_items.joblib")
    print(f"Saved top_languages_items.joblib (Top {len(top_languages_items_list)} items)")

    user_features_df = create_multivalue_binary_features(
        user_features_df,
        column='languages',
        separator='-',
        top_n=top_n_languages_actual,
        prefix='lang',
    )
    print("Binary features created for 'languages'.")

# Process 'pets'
if 'pets' in user_features_df.columns:
    user_features_df['pets'] = user_features_df['pets'].fillna('')
    top_n_pets_actual = 10  # Same top_n that is passed to the encoder below
    # Collect every normalised pet entry across all users
    all_pets_items = [
        unidecode(token.strip().lower())
        for entry in user_features_df['pets'].dropna()
        for token in str(entry).split('-')
        if token.strip()
    ]
    pet_counts = Counter(all_pets_items)
    top_pets_items_list = [
        name for name, _ in pet_counts.most_common(top_n_pets_actual)
    ]
    joblib.dump(top_pets_items_list, "../models/top_pets_items.joblib")
    print(f"Saved top_pets_items.joblib (Top {len(top_pets_items_list)} items)")

    user_features_df = create_multivalue_binary_features(
        user_features_df,
        column='pets',
        separator='-',
        top_n=top_n_pets_actual,
        prefix='pet',
    )
    print("Binary features created for 'pets'.")


--- Processing multi-value text features ---
Saved top_interests_items.joblib (Top 10 items)
Binary features created for 'interests'.
Saved top_languages_items.joblib (Top 15 items)
Binary features created for 'languages'.
Saved top_pets_items.joblib (Top 9 items)
Binary features created for 'pets'.


In [9]:
# Turn the free-text 'bio' into TF-IDF features (at most 100 unigram/bigram terms)
if 'bio' in user_features_df.columns:
    print("\n--- Generating TF-IDF features from 'bio' ---")

    # Clean each bio with preprocess_text before vectorising
    user_features_df['processed_bio'] = user_features_df['bio'].apply(preprocess_text)

    # Fit the vectorizer on the cleaned bios and transform them in one pass
    tfidf_vectorizer_bio = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
    bio_tfidf_matrix = tfidf_vectorizer_bio.fit_transform(user_features_df['processed_bio'])

    # Densify the matrix; columns are numbered by position rather than named after terms
    bio_tfidf_df = pd.DataFrame(
        bio_tfidf_matrix.toarray(),
        columns=[f"bio_tfidf_{i}" for i in range(bio_tfidf_matrix.shape[1])]
    )

    # Both frames are re-indexed 0..n-1 so they line up row by row when joined
    user_features_df = pd.concat(
        [
            user_features_df.reset_index(drop=True),
            bio_tfidf_df.reset_index(drop=True),
        ],
        axis=1,
    )

    # --- Save the fitted TfidfVectorizer ---
    joblib.dump(tfidf_vectorizer_bio, "../models/tfidf_vectorizer_bio.joblib")
    print("Saved tfidf_vectorizer_bio.joblib")

    print("TF-IDF features extracted from 'bio' and added to the dataset.")


--- Generating TF-IDF features from 'bio' ---
Saved tfidf_vectorizer_bio.joblib
TF-IDF features extracted from 'bio' and added to the dataset.


In [10]:
# --- 2.1.4. Process Geographic Features ---
print("\n--- Processing Geographic Features ---")

# 'location_preference' uses -1 as a sentinel, usually meaning 'no restriction'
# (e.g. willing to match anywhere).
if 'location_preference' in user_features_df.columns:
    # Binary flag that keeps the 'everywhere' information explicit
    user_features_df['loc_pref_is_everywhere'] = (
        user_features_df['location_preference'] == -1
    ).astype(int)

    # Distance column with the sentinel replaced by 0; all other values pass through.
    # NOTE(review): 0 km makes 'everywhere' the smallest scaled distance; confirm this
    # is intended rather than e.g. the maximum distance.
    user_features_df['location_preference_km'] = user_features_df['location_preference'].where(
        user_features_df['location_preference'] != -1, 0
    )

    # Min-max scale the distance and persist the fitted scaler
    scaler_loc_pref = MinMaxScaler()
    location_preference_km_frame = user_features_df[['location_preference_km']]
    user_features_df['location_preference_km_scaled'] = scaler_loc_pref.fit_transform(
        location_preference_km_frame
    )
    joblib.dump(scaler_loc_pref, "../models/location_preference_scaler.joblib")
    print("Finished processing 'location_preference'.")


--- Processing Geographic Features ---
Finished processing 'location_preference'.


In [11]:
# Min-max scale raw coordinates when both are present.
# Raw lat/lon values are usually less meaningful on their own, but the scaled
# versions can be used directly as user features.
if {'latitude', 'longitude'}.issubset(user_features_df.columns):
    # Latitude: fit, store the scaled column, persist the scaler
    scaler_lat = MinMaxScaler()
    latitude_frame = user_features_df[['latitude']]
    user_features_df['latitude_scaled'] = scaler_lat.fit_transform(latitude_frame)
    joblib.dump(scaler_lat, "../models/latitude_scaler.joblib")

    # Longitude: same treatment with its own scaler
    scaler_lon = MinMaxScaler()
    longitude_frame = user_features_df[['longitude']]
    user_features_df['longitude_scaled'] = scaler_lon.fit_transform(longitude_frame)
    joblib.dump(scaler_lon, "../models/longitude_scaler.joblib")

    print("Scaled 'latitude' and 'longitude'.")

Scaled 'latitude' and 'longitude'.


In [12]:
# Select final user feature columns by dropping raw or unnecessary columns
# These columns are either original raw data, personal info, or text fields
cols_to_drop_from_user_features = [
    'username', 'password', 'email', 'phone_number', 'first_name', 'last_name',
    'date_of_birth', 'height', 'age', 'bio', 'processed_bio',
    'interests', 'languages', 'pets', 'country', 'state', 'city',  # not used directly
    'latitude', 'longitude', 'location_preference', 'location_preference_km'
]

# 'job' and 'education_level' have been replaced by top-N indicator columns.
# They are listed here as a safeguard; errors='ignore' below makes this a
# no-op when the encoder has already removed them.
if 'job' in profiles_df.columns:
    cols_to_drop_from_user_features.append('job')
if 'education_level' in profiles_df.columns:
    cols_to_drop_from_user_features.append('education_level')

# Drop the listed columns (ignoring any that do not exist) and index by 'id'
# for easy lookup and merging later. Built as a new frame in one expression
# instead of mutating with inplace=True.
user_features_final_df = (
    user_features_df
    .drop(columns=cols_to_drop_from_user_features, errors='ignore')
    .set_index('id')
)

# --- Save the list of final user feature columns ---
user_features_final_columns_list = user_features_final_df.columns.tolist()
joblib.dump(user_features_final_columns_list, "../models/user_features_final_columns.joblib")
print("Saved user_features_final_columns.joblib")

print("\n--- Final User Features Columns ---")
# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
user_features_final_df.info()
print(user_features_final_df.head())

# Save the engineered user features for later use in pairwise feature creation
user_features_final_df.to_csv("../data/user_features_engineered.csv")
print("Saved user_features_engineered.csv")

Saved user_features_final_columns.joblib

--- Final User Features Columns ---
<class 'pandas.core.frame.DataFrame'>
Index: 2001 entries, 1 to 2001
Columns: 187 entries, interested_in_new_language to longitude_scaled
dtypes: float64(129), int64(58)
memory usage: 2.9 MB
None
    interested_in_new_language  dropped_out_school  age_scaled  height_scaled  \
id                                                                              
1                            1                   0    0.173077       0.392857   
2                            1                   0    0.153846       0.464286   
3                            1                   0    0.057692       0.500000   
4                            0                   0    0.192308       0.464286   
5                            0                   1    0.096154       0.464286   

    sex_female  sex_male  sex_non-binary  sex_prefer not to say  \
id                                                                
1          1.0       0.0

In [13]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Distance in kilometers between two latitude/longitude points.

    Despite the name, the value comes from geopy's `geodesic(...)` rather than
    a hand-written haversine formula.

    Parameters:
    - lat1, lon1: latitude and longitude of point 1
    - lat2, lon2: latitude and longitude of point 2

    Returns:
    - Distance in kilometers as float
    - np.nan if any coordinate is missing
    """
    coordinates = (lat1, lon1, lat2, lon2)
    if any(pd.isna(value) for value in coordinates):
        # A distance cannot be computed from an incomplete pair of points
        return np.nan

    point_a = (lat1, lon1)
    point_b = (lat2, lon2)
    return geodesic(point_a, point_b).km


def orientation_compatibility(sex1, orientation1, sex2, orientation2):
    """
    Decide whether two users are mutually compatible by sex and orientation.

    Rules:
    - If any of the four values is 'prefer not to say', the pair is compatible
      only when both orientations are 'bisexual' or 'prefer not to say'.
    - Otherwise each user must be interested in the other's sex:
        * 'bisexual'   -> interested in every sex
        * 'homosexual' -> interested in the same sex
        * 'straight'   -> male/female are interested in each other; a
                          'non-binary' user is interested in any sex other
                          than 'non-binary'
        * anything else -> not interested

    Parameters:
    - sex1, sex2: string values of users' sex (e.g., 'male', 'female', 'non-binary', etc.)
    - orientation1, orientation2: string values of users' sexual orientation
      (e.g., 'straight', 'homosexual', 'bisexual', 'prefer not to say')

    Returns:
    - True if compatible, False otherwise
    """
    undisclosed = 'prefer not to say'
    open_orientations = ('bisexual', undisclosed)

    def _attracted(own_sex, own_orientation, other_sex):
        """Whether a user with this sex/orientation is interested in `other_sex`."""
        if own_orientation == 'bisexual':
            return True
        if own_orientation == 'homosexual':
            return own_sex == other_sex
        if own_orientation == 'straight':
            if own_sex == 'non-binary':
                return other_sex != 'non-binary'
            return (own_sex, other_sex) in (('male', 'female'), ('female', 'male'))
        # Unrecognised orientation: treated as not interested
        return False

    # Undisclosed sex or orientation: only two "open" orientations are compatible
    if undisclosed in {sex1, sex2, orientation1, orientation2}:
        return orientation1 in open_orientations and orientation2 in open_orientations

    # Interest has to hold in both directions
    return _attracted(sex1, orientation1, sex2) and _attracted(sex2, orientation2, sex1)


def jaccard_similarity(list1_str, list2_str, separator='-'):
    """
    Jaccard similarity (|A ∩ B| / |A ∪ B|) of two delimited multi-value strings.

    Each string is split on `separator`; items are stripped and lowercased and
    empty items are discarded before the two sets are compared.

    Parameters:
    - list1_str, list2_str: strings of multi-valued features, e.g., 'hiking-swimming-movies'
    - separator: character separating the values in the string

    Returns:
    - Jaccard similarity as a float between 0 and 1
    - 0.0 when either input is missing or when both contain no items
    """
    def _as_item_set(raw):
        # Normalised, de-duplicated items of one delimited string
        pieces = (piece.strip() for piece in str(raw).split(separator))
        return {piece.lower() for piece in pieces if piece}

    if pd.isna(list1_str) or pd.isna(list2_str):
        return 0.0

    items_a = _as_item_set(list1_str)
    items_b = _as_item_set(list2_str)

    # Two empty sets are defined as dissimilar, which also avoids dividing by zero
    if not (items_a or items_b):
        return 0.0

    return len(items_a & items_b) / len(items_a | items_b)

In [14]:
print("\n--- 2.2 Creating Pairwise Features ---")

pairwise_features_list = []

# Convert profiles_df into a dictionary for fast access by user id
profiles_info_dict = profiles_df.set_index('id').to_dict('index')

# user_features_final_df must be indexed by 'id' here; the `.loc[...]` lookups
# in step 10 rely on it.

print(f"Generating pairwise features for {len(training_data_df)} pairs...")

# `pair_number` counts rows by position (1-based) for progress reporting, so the
# progress lines stay correct even if training_data_df does not carry a default
# 0..n-1 index. The key order of `pair_features` defines the column order of the
# resulting DataFrame and must not be changed.
for pair_number, (index, row) in enumerate(training_data_df.iterrows(), start=1):
    user1_id = int(row['user1'])
    user2_id = int(row['user2'])
    target = int(row['target'])

    user1_profile = profiles_info_dict.get(user1_id)
    user2_profile = profiles_info_dict.get(user2_id)

    if user1_profile is None or user2_profile is None:
        print(f"Warning: Missing profile for user {user1_id} or {user2_id}. Skipping this pair.")
        continue

    pair_features = {
        'user1': user1_id,
        'user2': user2_id,
        'target': target
    }

    # 1. Basic differences: absolute difference in age and height.
    # These read the raw profiles, so a missing value yields NaN here.
    pair_features['age_diff'] = abs(user1_profile.get('age', 0) - user2_profile.get('age', 0))
    pair_features['height_diff'] = abs(user1_profile.get('height', 0) - user2_profile.get('height', 0))

    # 2. Geographical distance (in kilometers) between users
    pair_features['geo_distance_km'] = haversine_distance(
        user1_profile.get('latitude'), user1_profile.get('longitude'),
        user2_profile.get('latitude'), user2_profile.get('longitude')
    )
    # Handle missing geo distances by assigning a large default value (e.g., 10,000 km)
    if pd.isna(pair_features['geo_distance_km']):
        pair_features['geo_distance_km'] = 10000.0

    # 3. Location preference compatibility:
    # Assume location_preference stores max distance user prefers (-1 means no preference)
    user1_loc_pref = user1_profile.get('location_preference', -1)
    user2_loc_pref = user2_profile.get('location_preference', -1)
    # Always a float at this point: missing distances were replaced just above
    dist = pair_features['geo_distance_km']

    pair_features['user1_within_user2_loc_pref'] = int(user2_loc_pref == -1 or dist <= user2_loc_pref)
    pair_features['user2_within_user1_loc_pref'] = int(user1_loc_pref == -1 or dist <= user1_loc_pref)

    # 4. Sexual orientation compatibility (both directions).
    # orientation_compatibility returns a bool, so these three columns hold
    # True/False rather than 0/1 like the other flags.
    pair_features['orientation_compatible_user1_to_user2'] = orientation_compatibility(
        user1_profile.get('sex'), user1_profile.get('orientation'),
        user2_profile.get('sex'), user2_profile.get('orientation')
    )
    pair_features['orientation_compatible_user2_to_user1'] = orientation_compatibility(
        user2_profile.get('sex'), user2_profile.get('orientation'),
        user1_profile.get('sex'), user1_profile.get('orientation')
    )
    # Final compatibility flag: True if either direction is compatible
    pair_features['orientation_compatible_final'] = max(
        pair_features['orientation_compatible_user1_to_user2'],
        pair_features['orientation_compatible_user2_to_user1']
    )

    # 5. Similar habits: drinking and smoking match (binary)
    pair_features['drink_match'] = int(user1_profile.get('drink') == user2_profile.get('drink'))
    pair_features['smoke_match'] = int(user1_profile.get('smoke') == user2_profile.get('smoke'))

    # 6. Education level match (simple binary: 1 if equal, else 0)
    pair_features['education_match'] = int(user1_profile.get('education_level') == user2_profile.get('education_level'))

    # 7. Jaccard similarity for interests
    pair_features['interests_jaccard'] = jaccard_similarity(
        user1_profile.get('interests'), user2_profile.get('interests'), separator='-'
    )

    # 8. Jaccard similarity for languages and shared interest in learning new languages
    pair_features['languages_jaccard'] = jaccard_similarity(
        user1_profile.get('languages'), user2_profile.get('languages'), separator='-'
    )
    pair_features['user1_wants_learn_lang'] = int(user1_profile.get('interested_in_new_language', 0))
    pair_features['user2_wants_learn_lang'] = int(user2_profile.get('interested_in_new_language', 0))
    pair_features['language_interest_match'] = int(pair_features['user1_wants_learn_lang'] == 1 and pair_features['user2_wants_learn_lang'] == 1)

    # 9. Jaccard similarity for pets
    pair_features['pets_jaccard'] = jaccard_similarity(
        user1_profile.get('pets'), user2_profile.get('pets'), separator='-'
    )

    # 10. Similarity of user feature vectors (cosine similarity and mean absolute error)
    try:
        vec1 = user_features_final_df.loc[user1_id].values.reshape(1, -1)
        vec2 = user_features_final_df.loc[user2_id].values.reshape(1, -1)

        # Replace NaNs with zero for similarity computation
        vec1 = np.nan_to_num(vec1, nan=0.0)
        vec2 = np.nan_to_num(vec2, nan=0.0)

        pair_features['user_features_cosine_sim'] = cosine_similarity(vec1, vec2)[0, 0]
        pair_features['user_features_mae_diff'] = np.mean(np.abs(vec1 - vec2))
    except KeyError:
        # One of the ids has no row in user_features_final_df
        print(f"Warning: Missing feature vector for user {user1_id} or {user2_id}. Using default similarity values.")
        pair_features['user_features_cosine_sim'] = 0.0
        pair_features['user_features_mae_diff'] = 1.0  # Max diff as fallback
    except Exception as e:
        # Best effort: report the failure and fall back to the same defaults
        print(f"Error computing feature vector similarity for pair ({user1_id}, {user2_id}): {e}")
        pair_features['user_features_cosine_sim'] = 0.0
        pair_features['user_features_mae_diff'] = 1.0

    pairwise_features_list.append(pair_features)

    # Progress print every 5000 pairs
    if pair_number % 5000 == 0:
        print(f"Processed {pair_number} / {len(training_data_df)} pairs...")


--- 2.2 Creating Pairwise Features ---
Generating pairwise features for 49758 pairs...
Processed 5000 / 49758 pairs...
Processed 10000 / 49758 pairs...
Processed 15000 / 49758 pairs...
Processed 20000 / 49758 pairs...
Processed 25000 / 49758 pairs...
Processed 30000 / 49758 pairs...
Processed 35000 / 49758 pairs...
Processed 40000 / 49758 pairs...
Processed 45000 / 49758 pairs...


In [15]:
# One row per processed pair, columns in the order the features were added
pairwise_features_df = pd.DataFrame(pairwise_features_list)

# Model inputs are every column except the two ids and the label; the list is
# persisted in DataFrame column order.
pairwise_model_input_columns_list = []
for column_name in pairwise_features_df.columns:
    if column_name in ['user1', 'user2', 'target']:
        continue
    pairwise_model_input_columns_list.append(column_name)
joblib.dump(pairwise_model_input_columns_list, "../models/pairwise_model_input_columns.joblib")

print("\n--- First 5 rows of the pairwise features DataFrame ---")
print(pairwise_features_df.head())


--- First 5 rows of the pairwise features DataFrame ---
   user1  user2  target  age_diff  height_diff  geo_distance_km  \
0    259    961       0        18          6.0        22.675100   
1    260    939       1        12          5.0        18.469857   
2    927     62       0        11          6.0        19.247655   
3    415   1353       0         4          1.0        15.782432   
4    293   1667       1         7          7.0        14.220504   

   user1_within_user2_loc_pref  user2_within_user1_loc_pref  \
0                            1                            1   
1                            1                            1   
2                            1                            1   
3                            1                            1   
4                            1                            1   

   orientation_compatible_user1_to_user2  \
0                                   True   
1                                   True   
2                            

In [16]:
# Normalize pairwise features (excluding 'user1', 'user2', and 'target' columns)
#
# The scaler is always fitted on the RAW values rebuilt from pairwise_features_list,
# never on the current contents of pairwise_features_df. This cell writes the scaled
# values back into pairwise_features_df, so fitting on the frame itself would, on a
# re-run, fit on data that is already in [0, 1] and overwrite the saved scaler with
# an identity transform. Fitting from the raw records keeps the cell idempotent.
#
# NOTE(review): the scaler is fitted on every pair; if a train/test split happens
# later (not visible in this cell), confirm that fitting before the split is intended.

# Identify columns to scale: all except 'user1', 'user2', 'target'
cols_to_scale_pairwise = [col for col in pairwise_features_df.columns if col not in ['user1', 'user2', 'target']]

# From these columns, select only the numerical ones for scaling.
# Boolean columns are not matched by np.number and are therefore left untouched.
numerical_cols_to_scale_pairwise = pairwise_features_df[cols_to_scale_pairwise].select_dtypes(include=np.number).columns

if not numerical_cols_to_scale_pairwise.empty:
    # Raw (unscaled) values of exactly the columns to scale, in the same column order
    raw_pairwise_numeric_df = pd.DataFrame(
        pairwise_features_list, columns=list(numerical_cols_to_scale_pairwise)
    )
    assert len(raw_pairwise_numeric_df) == len(pairwise_features_df), (
        "pairwise_features_list and pairwise_features_df are out of sync; "
        "re-run the cell that builds pairwise_features_df."
    )

    # Initialize a scaler (MinMaxScaler is used here; alternatively, StandardScaler can be used)
    scaler_pairwise = MinMaxScaler()

    # Fit on the raw values and store the scaled result in the working DataFrame
    pairwise_features_df[numerical_cols_to_scale_pairwise] = scaler_pairwise.fit_transform(raw_pairwise_numeric_df)

    # Persist the fitted scaler so the same transform can be re-applied later
    joblib.dump(scaler_pairwise, "../models/pairwise_features_scaler.joblib")

    print(f"Normalized numerical columns in pairwise_features_df: {list(numerical_cols_to_scale_pairwise)}")
else:
    print("No numerical columns to normalize in pairwise_features_df (excluding 'user1', 'user2', and 'target').")

Normalized numerical columns in pairwise_features_df: ['age_diff', 'height_diff', 'geo_distance_km', 'user1_within_user2_loc_pref', 'user2_within_user1_loc_pref', 'drink_match', 'smoke_match', 'education_match', 'interests_jaccard', 'languages_jaccard', 'user1_wants_learn_lang', 'user2_wants_learn_lang', 'language_interest_match', 'pets_jaccard', 'user_features_cosine_sim', 'user_features_mae_diff']


In [17]:
# Report on the final frame: dtypes and non-null counts first, then a short preview
print("\n--- Information of pairwise_features_df after handling NaNs and normalization ---")
pairwise_features_df.info()
print(pairwise_features_df.head())

# Persist the engineered pairwise features as CSV (row index omitted)
pairwise_features_df.to_csv("../data/pairwise_features_engineered.csv", index=False)

# Closing status messages, one per line (same stdout as four separate prints)
print(
    "Saved pairwise_features_engineered.csv",
    "\n--- Phase 2 Completed ---",
    "Next step: Phase 3 - Model Building and Training",
    "The pairwise_features_df dataset is now ready for model training.",
    sep="\n",
)



--- Information of pairwise_features_df after handling NaNs and normalization ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49758 entries, 0 to 49757
Data columns (total 22 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   user1                                  49758 non-null  int64  
 1   user2                                  49758 non-null  int64  
 2   target                                 49758 non-null  int64  
 3   age_diff                               49758 non-null  float64
 4   height_diff                            49758 non-null  float64
 5   geo_distance_km                        49758 non-null  float64
 6   user1_within_user2_loc_pref            49758 non-null  float64
 7   user2_within_user1_loc_pref            49758 non-null  float64
 8   orientation_compatible_user1_to_user2  49758 non-null  bool   
 9   orientation_compatible_user2_to_user1  49758 non-null  