# 03 – Feature Engineering and Preprocessing

This notebook:

- Loads `cleaned_responses.csv` from `data/intermediate`
- Encodes survey responses into model-ready numeric features
- Builds a user feature matrix `X_users`
- Saves processed data to `data/processed/final_model_dataset.csv`


In [None]:
import pandas as pd
import numpy as np

from pathlib import Path

# Widen pandas' display limits so the wide feature frame stays readable.
pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 140)

# Resolve the project root: if the working directory already contains data/,
# it IS the root; otherwise assume the notebook runs from a subfolder one
# level below the root. (The branches were previously swapped, which pointed
# DATA_DIR one level away from the real data/ folder in both situations.)
PROJECT_ROOT = Path.cwd() if (Path.cwd() / "data").exists() else Path.cwd().parent
DATA_DIR = PROJECT_ROOT / "data"
INTERMEDIATE_DIR = DATA_DIR / "intermediate"
PROCESSED_DIR = DATA_DIR / "processed"

# Make sure the output folder exists before anything is written to it.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Load the cleaned survey responses that this notebook encodes.
clean_file = INTERMEDIATE_DIR / "cleaned_responses.csv"
df = pd.read_csv(clean_file)

df.head()


In [None]:
def encode_age_group(series: pd.Series) -> pd.Series:
    """
    Map age bracket labels to ordinal ranks: "18-24" -> 1 up to "55+" -> 5.

    Values that are not one of the five known labels come back as NaN.
    """
    brackets = ("18-24", "25-34", "35-44", "45-54", "55+")
    rank_by_label = {label: rank for rank, label in enumerate(brackets, start=1)}
    return series.map(rank_by_label)


def one_hot(series: pd.Series, prefix: str) -> pd.DataFrame:
    """
    Simple one hot encoding using pandas get_dummies.

    Returns one 0/1 integer column per distinct non-null value, named
    "<prefix>_<value>". Missing values get no column of their own, so their
    rows are all zeros.
    """
    # Force an integer dtype: get_dummies defaults to bool on pandas >= 2.0,
    # which would write True/False into the saved CSV and disagree with the
    # 0/1 integers produced by multi_select_contains.
    return pd.get_dummies(series, prefix=prefix, dtype=int)


def multi_select_contains(series: pd.Series, options_map: dict) -> pd.DataFrame:
    """
    For multi select questions stored as text (often with commas or semicolons),
    create one binary column per option using case insensitive substring search.

    options_map: dict of {column_name: substring_to_search}

    Returns a DataFrame of 0/1 integers indexed like `series`, with one column
    per key of options_map. Missing answers are treated as empty text and so
    match nothing.
    """
    s = series.fillna("").astype(str).str.lower()
    out = {
        # regex=False: keywords are literal substrings, so characters such as
        # "+", "." or "(" must not be interpreted as regex syntax.
        col_name: s.str.contains(keyword.lower(), regex=False, na=False)
        for col_name, keyword in options_map.items()
    }
    # Passing the index keeps row alignment even when options_map is empty.
    return pd.DataFrame(out, index=series.index).astype(int)


In [None]:
# Start from an empty frame sharing df's index so every encoded block lines
# up row for row when concatenated.
features = pd.DataFrame(index=df.index)

# Age group ordinal. Guarded like the other columns: df.get() returns None
# for a missing column, which would crash inside encode_age_group.
if "age_group" in df.columns:
    features["age_group_ord"] = encode_age_group(df["age_group"])

# Gender one hot
if "gender" in df.columns:
    gender_ohe = one_hot(df["gender"], prefix="gender")
    features = pd.concat([features, gender_ohe], axis=1)

features.head()


In [None]:
# Where the respondent lives now -> one hot columns
if "current_country" in df:
    country_ohe = one_hot(df["current_country"], "current_country")
    features = pd.concat((features, country_ohe), axis="columns")

# Why they live there -> one hot columns
if "reason_current_country" in df:
    reason_ohe = one_hot(df["reason_current_country"], "reason_current")
    features = pd.concat((features, reason_ohe), axis="columns")

features.head()


In [None]:
# Marital status -> one hot columns
if "marital_status" in df:
    marital_ohe = one_hot(df["marital_status"], "marital")
    features = pd.concat((features, marital_ohe), axis="columns")

# Dependents: keep the band as a categorical and add a rough numeric value
if "dependents" in df:
    dep_ohe = one_hot(df["dependents"], "dep_band")
    features = pd.concat((features, dep_ohe), axis="columns")

    # Representative head count per band; labels not listed here map to NaN
    dep_map = dict(zip(("0", "1", "2", "3-4", "5+"), (0, 1, 2, 3.5, 5)))
    features["dependents_estimated"] = df["dependents"].map(dep_map)

features.head()


In [None]:
# Education level: one hot for now (an ordinal encoding is a possible alternative)
if "education_level" in df:
    edu_ohe = one_hot(df["education_level"], "edu")
    features = pd.concat((features, edu_ohe), axis="columns")

# Field of study is multi select: one flag per field, matched by keyword
if "field_of_study" in df:
    field_map = dict(
        field_engineering="engineering",
        field_healthcare="medicine",
        field_business="business",
        field_education="education",
        field_it="computer science",
        field_law="law",
        field_other="other",
    )
    field_df = multi_select_contains(df["field_of_study"], field_map)
    features = pd.concat((features, field_df), axis="columns")

# Employment status -> one hot columns
if "employment_status" in df:
    emp_ohe = one_hot(df["employment_status"], "employment")
    features = pd.concat((features, emp_ohe), axis="columns")

# Years of experience: band as one hot plus an approximate numeric value
if "experience_years" in df:
    exp_ohe = one_hot(df["experience_years"], "exp_years_band")
    features = pd.concat((features, exp_ohe), axis="columns")

    # Representative number of years per band; unlisted labels map to NaN
    exp_map = dict(zip(("0-1", "2-4", "5-7", "8-10", "10+"), (0.5, 3, 6, 9, 12)))
    features["experience_years_est"] = df["experience_years"].map(exp_map)

# Remote capability flag (remote_capable was already created in notebook 1)
if "remote_capable" in df:
    features["remote_capable"] = df["remote_capable"].astype(int)

features.head()


In [None]:
# Prefer the cleaned language column and fall back to the raw text.
lang_col = "languages_clean" if "languages_clean" in df.columns else "languages_raw"

if lang_col in df.columns:
    # One flag per language tracked explicitly (case insensitive keyword match)
    lang_map = {
        "lang_arabic": "arabic",
        "lang_english": "english",
        "lang_french": "french",
        "lang_german": "german",
        "lang_italian": "italian",
        "lang_spanish": "spanish",
    }
    lang_df = multi_select_contains(df[lang_col], lang_map)

    # lang_other: at least one listed entry matches none of the tracked
    # languages. Flagging any answer containing "," was wrong both ways: it
    # fired for "arabic, english" and missed a lone untracked language.
    # Assumes entries are separated by commas or semicolons - confirm against
    # the cleaning notebook.
    tracked_keywords = tuple(lang_map.values())

    def _has_untracked_language(cell: str) -> bool:
        entries = (entry.strip() for entry in cell.replace(";", ",").split(","))
        return any(
            entry and not any(keyword in entry for keyword in tracked_keywords)
            for entry in entries
        )

    lang_text = df[lang_col].fillna("").astype(str).str.lower()
    lang_df["lang_other"] = lang_text.map(_has_untracked_language).astype(int)
    features = pd.concat([features, lang_df], axis=1)

features.head()


In [None]:
# Relocation intent -> one hot columns plus a single yes/no flag
if "relocation_intent" in df:
    intent_ohe = one_hot(df["relocation_intent"], "intent")
    features = pd.concat((features, intent_ohe), axis="columns")

    # 1 only when the answer is exactly "Yes"; anything else (or missing) is 0
    features["actively_seeking"] = (df["relocation_intent"] == "Yes").astype(int)

# Relocation goal -> one hot columns
if "relocation_goal" in df:
    goal_ohe = one_hot(df["relocation_goal"], "goal")
    features = pd.concat((features, goal_ohe), axis="columns")

features.head()


In [None]:
# Numeric budget estimate is carried over unchanged
if "budget_estimated_usd" in df:
    features["budget_estimated_usd"] = df["budget_estimated_usd"]

# Budget band -> one hot columns
if "budget_band" in df:
    budget_ohe = one_hot(df["budget_band"], "budget_band")
    features = pd.concat((features, budget_ohe), axis="columns")

# Ability to pay for a visa as a 0-2 score; unlisted answers map to NaN
if "visa_budget_ability" in df:
    visa_ability_map = dict(zip(("Yes", "Partially able", "No"), (2, 1, 0)))
    features["can_pay_visa_score"] = df["visa_budget_ability"].map(visa_ability_map)

features.head()


In [None]:
# Preferred regions is multi select text: one flag per region keyword
if "preferred_regions" in df:
    pref_map = dict(
        pref_gulf="gulf",
        pref_east_africa="east africa",
        pref_north_africa="north africa",
        pref_europe="europe",
        pref_uk_ireland="uk / ireland",
        pref_canada="canada",
        pref_usa="usa",
        pref_asia="asia",
        pref_anywhere="anywhere",
    )
    pref_df = multi_select_contains(df["preferred_regions"], pref_map)
    features = pd.concat((features, pref_df), axis="columns")

# Cultural preference -> one hot columns
if "cultural_preference" in df:
    cult_ohe = one_hot(df["cultural_preference"], "cult_pref")
    features = pd.concat((features, cult_ohe), axis="columns")

features.head()


In [None]:
# Passport status (cleaned in notebook 1) -> one hot columns
if "passport_status" in df:
    pass_ohe = one_hot(df["passport_status"], "passport")
    features = pd.concat((features, pass_ohe), axis="columns")

# Visa preference -> one hot columns
if "visa_preference" in df:
    visa_pref_ohe = one_hot(df["visa_preference"], "visa_pref")
    features = pd.concat((features, visa_pref_ohe), axis="columns")

features.head()


In [None]:
# Support needed is single choice -> one hot columns
if "support_needed" in df:
    support_ohe = one_hot(df["support_needed"], "support")
    features = pd.concat((features, support_ohe), axis="columns")

# Special needs -> one hot columns
if "special_needs" in df:
    special_ohe = one_hot(df["special_needs"], "special_needs")
    features = pd.concat((features, special_ohe), axis="columns")

features.head()


In [None]:
# Report the final matrix size, then list every feature column
print(f"Features shape: {features.shape}")
features.head()
list(features.columns)

In [None]:
# Write the feature matrix to the processed folder without the row index
out_path = PROCESSED_DIR.joinpath("final_model_dataset.csv")
features.to_csv(out_path, index=False)
out_path
