# Sudanese Relocation Recommender – Data Collection and Overview

This notebook loads the raw survey responses, normalizes the column names, performs
basic cleaning, and creates initial derived features:

- Clean budget estimate (`budget_estimated_usd`)
- Passport status category (`passport_status`)
- Remote work capability flag (`remote_capable`)
- Lower-cased language text (`languages_clean`)

The cleaned dataset will later be used for feature engineering, clustering, and modeling.

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Widen the pandas display so the many survey columns stay readable.
for _option, _value in (("display.max_columns", 100), ("display.width", 120)):
    pd.set_option(_option, _value)

# Resolve the project root relative to the directory the notebook runs from.
# NOTE(review): when ./src exists the *parent* of the working directory is used
# as the root, otherwise the working directory itself — confirm this matches
# the repository layout; the condition reads as if it might be inverted.
_working_dir = Path.cwd()
if (_working_dir / "src").exists():
    PROJECT_ROOT = _working_dir.parent
else:
    PROJECT_ROOT = _working_dir

DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
INTERMEDIATE_DIR = DATA_DIR.joinpath("intermediate")

# Create the data folders up front (no error if they already exist).
for _directory in (RAW_DIR, INTERMEDIATE_DIR):
    _directory.mkdir(parents=True, exist_ok=True)


In [None]:
# Excel export of the survey responses, expected inside the raw data folder.
raw_file = RAW_DIR.joinpath("Sudanese Relocation Insights_ A Study (Responses).xlsx")
df_raw = pd.read_excel(io=raw_file)

# NOTE(review): with default notebook settings only the last expression of a
# cell is rendered, so this preview is evaluated but presumably not shown —
# confirm whether it was meant to live in its own cell.
df_raw.head()

# Exact header strings as read from the file.
df_raw.columns.tolist()


Unnamed: 0,Timestamp,Age Group,Gender,Current Location,Reason for Current Country of Residence,Marital Status,Number of Dependents\Family Members moving with you,Highest Level of Education,Field of Study,Current Employment Status,...,Are you seeking to relocate?,"If you're seeking to relocate, what would your main goal be?",Monthly Budget for Living Expenses (USD),Ability to Pay for Relocation/Visa Fees,Preferred Regions for Relocation,Cultural Preferences,Do you currently have a passport?,Visa Restrictions,Support Needed,Medical or special needs to consider?
0,2025-11-25 20:13:11.070,25-34,Male,United Arab Emirates,For employment opportunities,Single,0,Bachelor's Degree,Engineering,Employed full-time,...,Yes,Employment,$200–500,Yes,"Europe, Asia (Malaysia, China)",Prefer Arabic-speaking countries,"Yes, and it is valid",Easy visa required,Legal/immigration support,No
1,2025-11-25 20:15:43.406,25-34,Male,Uganda,For employment opportunities,Single,0,Bachelor's Degree,Pharmacist,Unemployed (looking for work),...,Yes,Start or expand a business,$200–500,Yes,Canada,Prefer Western countries,"Yes, and it is valid",Willing to go through long visa process,Job placement,No
2,2025-11-25 20:19:44.095,25-34,Male,Egypt,Fled the war / seeking safety,Single,3-4,Bachelor's Degree,Engineering,Employed full-time,...,Yes,Employment,$200–500,No,"Gulf countries (UAE, Saudi, Qatar), Europe, UK...",Prefer Western countries,"Yes, and it is valid",Willing to go through long visa process,Job placement,No
3,2025-11-25 20:21:24.550,25-34,Male,Saudi Arabia,Fled the war / seeking safety,Single,3-4,Bachelor's Degree,Engineering,Unemployed (looking for work),...,Not sure yet,Improve safety and stability,"$500–1,000",Partially able,"Gulf countries (UAE, Saudi, Qatar)",Prefer Arabic-speaking countries,"Yes, and it is valid",Only countries with sponsorship opportunities,Job placement,Prefer not to say
4,2025-11-25 20:25:13.413,25-34,Female,United Arab Emirates,For employment opportunities,Single,2,Bachelor's Degree,Economic &Political Science,Employed full-time,...,No,Not seeking to relocate,"$1,000–2,500",Yes,"Gulf countries (UAE, Saudi, Qatar), Europe, UK...",No strong preference,"Yes, and it is valid",Easy visa required,Medical support,No


In [None]:
# Survey question header -> short snake_case column name.
# The keys are written the way the headers appear in the export (including
# trailing spaces and the "*" marker), but they are compared in normalized form
# below so that stray whitespace cannot make a rename silently fail.
rename_map = {
    "Age Group ": "age_group",
    "Gender ": "gender",
    "Current Location ": "current_country",
    "Reason for Current Country of Residence": "reason_current_country",
    "Marital Status ": "marital_status",
    "Number of Dependents\\Family Members moving with you ": "dependents",
    "Highest Level of Education  ": "education_level",
    "Field of Study ": "field_of_study",
    "Current Employment Status ": "employment_status",
    "Years of Professional Experience  ": "experience_years",
    "Are you able to work remotely?": "remote_work",
    "What Languages do you speak? ": "languages_raw",
    "Are you seeking to relocate? ": "relocation_intent",
    "If you're seeking to relocate, what would your main goal be?": "relocation_goal",
    "Monthly Budget for Living Expenses (USD)  ": "budget_band",
    "Ability to Pay for Relocation/Visa Fees": "visa_budget_ability",
    "Preferred Regions for Relocation ": "preferred_regions",
    "Cultural Preferences  ": "cultural_preference",
    "Do you currently have a passport?": "passport_status_raw",
    "Visa Restrictions  ": "visa_preference",
    "Support Needed *": "support_needed",
    "Medical or special needs to consider?  ": "special_needs",
}


def _normalize_header(name):
    """Return a header with the "*" marker removed and whitespace collapsed."""
    return " ".join(str(name).replace("*", " ").split())


# DataFrame.rename ignores mapping keys that do not exist, so an exact-match
# mapping would skip any header whose spacing differs by a single character.
# Matching on the normalized form avoids that.
_normalized_rename_map = {
    _normalize_header(raw_header): clean_name
    for raw_header, clean_name in rename_map.items()
}

df = df_raw.rename(
    columns=lambda col: _normalized_rename_map.get(_normalize_header(col), col)
).copy()

# Surface schema drift instead of letting it show up later as a KeyError
# (or, worse, as an un-renamed column in the exported CSV).
_unmatched = sorted(set(rename_map.values()) - set(df.columns))
if _unmatched:
    print("WARNING: expected survey columns not found in the raw file:", _unmatched)

df.head()


In [None]:
def clean_budget(val):
    """
    Map a budget band label to an approximate numeric value in USD.

    The label is normalized before matching so that formatting differences do
    not silently produce NaN: all whitespace (including non-breaking spaces)
    is removed, every dash variant is folded to "-", and "$" signs and
    thousands commas are dropped.

    Parameters
    ----------
    val : Any
        Raw budget band, e.g. "$200–500", "<$200" or ">$5,000".

    Returns
    -------
    int or float
        Representative amount for the band (150, 350, 750, 1750, 3750 or
        6000), or NaN when the value is missing or not recognized.
    """
    if pd.isna(val):
        return np.nan

    # str.split() with no argument splits on any Unicode whitespace, so this
    # also removes non-breaking spaces that a plain replace(" ", "") misses.
    text = "".join(str(val).split())
    # en dash, em dash, minus sign, hyphen, non-breaking hyphen, figure dash
    for dash in ("\u2013", "\u2014", "\u2212", "\u2010", "\u2011", "\u2012"):
        text = text.replace(dash, "-")
    text = text.replace("$", "").replace(",", "")

    # Checked in order; each marker is matched as a substring so labels with
    # extra trailing text still resolve.
    bands = (
        ("<200", 150),
        ("200-500", 350),
        ("500-1000", 750),
        ("1000-2500", 1750),
        ("2500-5000", 3750),
        (">5000", 6000),
    )
    for marker, amount in bands:
        if marker in text:
            return amount
    return np.nan


def clean_passport(val):
    """
    Normalize a passport answer to one of four categories:
    "Valid", "ExpiringSoon", "Expired" or "None".

    Matching is done on the lower-cased, stripped text:

    - "Valid": the text mentions "valid" and does not negate it
      ("invalid", "not valid", "no longer valid").
    - "ExpiringSoon": the text mentions expiring soon or "within 6 months".
    - "Expired": the text mentions "expired", or describes the passport as
      not valid (there is no separate bucket for that, and such a passport
      is held but unusable).
    - "None": missing values and everything else (no passport, application
      in progress, unrecognized text).

    Parameters
    ----------
    val : Any
        Raw survey answer.

    Returns
    -------
    str
        One of the four category labels.
    """
    if pd.isna(val):
        return "None"
    s = str(val).strip().lower()

    # A plain substring test for "valid" would also accept "invalid" and
    # "not valid", so negated forms are detected first.
    negated_valid = any(
        phrase in s for phrase in ("invalid", "not valid", "no longer valid")
    )
    if "valid" in s and not negated_valid:
        return "Valid"

    soon_markers = ("expire soon", "expires soon", "expiring", "within 6 months")
    if any(marker in s for marker in soon_markers):
        return "ExpiringSoon"

    if "expired" in s or negated_valid:
        return "Expired"

    # "no" / "do not have" / "in the process" answers, and anything unknown.
    return "None"


def clean_remote(val):
    """
    Map the remote work answer to a boolean flag.

    An answer counts as remote-capable when it mentions one of the keywords
    "laptop", "remotely", "freelance" or "online". Because negative answers
    often repeat the wording of the question ("No, I cannot work remotely"),
    an answer that starts with "no" or states an inability is treated as not
    capable before the keywords are checked.

    This is a keyword heuristic; other negations (e.g. "I don't have a
    laptop") are not detected.

    Parameters
    ----------
    val : Any
        Raw survey answer.

    Returns
    -------
    bool
        True when the answer indicates remote work capability, else False
        (missing values are False).
    """
    if pd.isna(val):
        return False
    s = str(val).strip().lower()

    # Leading "no" as a whole word ("No", "No, ...", "No laptop"), not merely
    # a prefix of another word such as "nowadays".
    tokens = s.replace(",", " ").replace(".", " ").replace("!", " ").split()
    if tokens and tokens[0] == "no":
        return False

    inability = ("cannot", "can't", "can\u2019t", "can not", "unable", "not able")
    if any(phrase in s for phrase in inability):
        return False

    keywords = ("laptop", "remotely", "freelance", "online")
    return any(keyword in s for keyword in keywords)


In [None]:
# Derived columns, listed as (new column, source column, cleaning function).
for derived_col, source_col, cleaner in (
    ("budget_estimated_usd", "budget_band", clean_budget),
    ("passport_status", "passport_status_raw", clean_passport),
    ("remote_capable", "remote_work", clean_remote),
):
    df[derived_col] = df[source_col].apply(cleaner)

# Lower-cased language text; missing answers become an empty string.
# This will later be exploded into per-language flags.
languages_text = df["languages_raw"].fillna("").astype(str)
df["languages_clean"] = languages_text.str.lower()

# Quick look at the raw inputs next to the freshly derived columns.
preview_columns = [
    "age_group",
    "gender",
    "current_country",
    "budget_band",
    "budget_estimated_usd",
    "passport_status",
    "remote_capable",
    "languages_clean",
]
df[preview_columns].head()


In [None]:
print("Number of responses:", len(df))

# Frequency tables (missing values included) for the key categorical columns.
for heading, column in (
    ("\nCurrent country distribution:", "current_country"),
    ("\nRelocation intent distribution:", "relocation_intent"),
    ("\nBudget band distribution:", "budget_band"),
    ("\nPassport status distribution (cleaned):", "passport_status"),
):
    print(heading)
    print(df[column].value_counts(dropna=False))


In [None]:
# Bar chart of the number of responses per current country of residence.
# Drawn on an explicit Axes so the labels are set on the figure we created.
fig, ax = plt.subplots()
df["current_country"].value_counts().plot(kind="bar", ax=ax)

ax.set_title("Current country of residence")
ax.set_xlabel("Current country")
ax.set_ylabel("Number of responses")

# Slanted, right-aligned tick labels keep long country names legible.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

fig.tight_layout()
plt.show()

In [None]:
# Make sure the target folder exists, then persist the cleaned responses
# (without the DataFrame index) for the later notebooks.
INTERMEDIATE_DIR.mkdir(parents=True, exist_ok=True)
out_path = INTERMEDIATE_DIR.joinpath("cleaned_responses.csv")

df.to_csv(path_or_buf=out_path, index=False)
out_path
