# 02 – Exploratory Data Analysis (EDA)

In this notebook we:

- Load the cleaned survey responses
- Explore distributions of key variables
- Look at relationships between current location, relocation intent, budget, and preferences
- Generate figures to be reused in the report and README


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

# Widen pandas' display limits so wide frames are not truncated when shown.
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 120)

# Resolve the project root: if the working directory already contains `data/`
# it IS the root; otherwise assume the kernel was started one level below the
# root and step up to the parent.
PROJECT_ROOT = Path.cwd() if (Path.cwd() / "data").exists() else Path.cwd().parent
DATA_DIR = PROJECT_ROOT / "data"
INTERMEDIATE_DIR = DATA_DIR / "intermediate"
FIGURES_DIR = PROJECT_ROOT / "figures" / "eda"

clean_file = INTERMEDIATE_DIR / "cleaned_responses.csv"

# Fail early, before creating any output folders, when the input cannot be
# found, so a wrong working directory does not leave stray `figures/` trees.
if not clean_file.is_file():
    raise FileNotFoundError(
        f"Expected cleaned survey data at {clean_file}; start the kernel from "
        "the project root or one of its direct subfolders."
    )

FIGURES_DIR.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(clean_file)

df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

# Fraction of missing values per column, showing the 15 columns with the most gaps.
missing_share = df.isna().mean()
missing_share.sort_values(ascending=False).head(15)

In [None]:
# Frequency tables (missing values included) for the demographic columns.
for heading, column in (
    ("Age group distribution:", "age_group"),
    ("\nGender distribution:", "gender"),
):
    print(heading)
    print(df[column].value_counts(dropna=False))

In [None]:
def plot_bar(series, title, filename, dropna=True):
    """Draw a bar chart of the value counts of ``series`` and save it as an image.

    Parameters
    ----------
    series : pandas.Series
        Values to tabulate with ``value_counts``.
    title : str
        Title placed above the chart.
    filename : str
        File name, relative to ``FIGURES_DIR``, that the figure is written to.
    dropna : bool, default True
        Forwarded to ``value_counts``. Pass ``False`` to add a bar for missing
        values; the default keeps the previous behaviour of omitting them.

    Returns
    -------
    The path (``FIGURES_DIR / filename``) the figure was saved to.
    """
    counts = series.value_counts(dropna=dropna)

    # Explicit figure/axes instead of the implicit pyplot "current figure",
    # so the function cannot draw onto a figure left open by another cell.
    fig, ax = plt.subplots()
    counts.plot(kind="bar", ax=ax)
    ax.set_title(title)
    ax.set_ylabel("Count")
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    fig.tight_layout()

    out_path = FIGURES_DIR / filename
    fig.savefig(out_path, dpi=300)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    return out_path

# Save and show the demographic bar charts.
plot_bar(
    series=df["age_group"],
    title="Age group distribution",
    filename="age_group_distribution.png",
)
plot_bar(
    series=df["gender"],
    title="Gender distribution",
    filename="gender_distribution.png",
)


In [None]:
# Frequency tables (missing values included) for current country and relocation intent.
print(
    "Current country distribution:",
    df["current_country"].value_counts(dropna=False),
    sep="\n",
)

print(
    "\nRelocation intent:",
    df["relocation_intent"].value_counts(dropna=False),
    sep="\n",
)


In [None]:
# Bar charts for where respondents live now and whether they intend to relocate.
current_country_col = df["current_country"]
relocation_intent_col = df["relocation_intent"]

plot_bar(current_country_col, "Current country of residence", "current_country_distribution.png")
plot_bar(relocation_intent_col, "Relocation intent", "relocation_intent_distribution.png")


In [None]:
# Share of each relocation-intent answer within every current country
# (normalize="index" makes each row sum to 1).
pivot_intent_by_country = pd.crosstab(
    index=df["current_country"],
    columns=df["relocation_intent"],
    normalize="index",
)
pivot_intent_by_country


In [None]:
# Stacked bars: one bar per country, one segment per relocation-intent answer.
intent_ax = pivot_intent_by_country.plot.bar(stacked=True)
intent_ax.set_title("Relocation intent by current country (row normalised)")
plt.xticks(rotation=45, ha="right")

intent_fig = intent_ax.figure
intent_fig.tight_layout()
intent_fig.savefig(FIGURES_DIR / "relocation_intent_by_country.png", dpi=300)
plt.show()

In [None]:
# Tabulate first, then print, keeping missing values as their own category.
budget_band_counts = df["budget_band"].value_counts(dropna=False)
passport_status_counts = df["passport_status"].value_counts(dropna=False)

print("Budget band distribution:")
print(budget_band_counts)

print("\nPassport status (cleaned):")
print(passport_status_counts)


In [None]:
# Bar charts for the two categorical columns, followed by summary statistics
# of the numeric budget estimate.
for column, chart_title, png_name in (
    ("budget_band", "Monthly budget band (USD)", "budget_band_distribution.png"),
    ("passport_status", "Passport status", "passport_status_distribution.png"),
):
    plot_bar(df[column], chart_title, png_name)

df["budget_estimated_usd"].describe()

In [None]:
# Histogram of the numeric budget estimate (missing values removed, 10 bins).
budget_fig, budget_ax = plt.subplots()
df["budget_estimated_usd"].dropna().hist(bins=10, ax=budget_ax)
budget_ax.set(
    xlabel="Estimated monthly budget (USD)",
    ylabel="Count",
    title="Distribution of estimated monthly budget",
)
budget_fig.tight_layout()
budget_fig.savefig(FIGURES_DIR / "budget_estimated_hist.png", dpi=300)
plt.show()

In [None]:
# Frequency tables (missing values included) for the professional-background columns.
background_tables = {
    "Education level distribution:": "education_level",
    "\nEmployment status:": "employment_status",
    "\nYears of experience:": "experience_years",
}
for heading, column in background_tables.items():
    print(heading)
    print(df[column].value_counts(dropna=False))


In [None]:
# Bar charts for education, employment status and experience.
plot_bar(
    series=df["education_level"],
    title="Highest education level",
    filename="education_level_distribution.png",
)
plot_bar(
    series=df["employment_status"],
    title="Employment status",
    filename="employment_status_distribution.png",
)
plot_bar(
    series=df["experience_years"],
    title="Years of professional experience",
    filename="experience_years_distribution.png",
)

In [None]:
# Box plot of the budget estimate, one box per relocation-intent answer.
# The axes is created up front and handed to pandas: when `by=` is used
# without `ax=`, `DataFrame.boxplot` opens its own figure, which would leave
# a separately created figure empty and rendered as a blank plot.
box_fig, box_ax = plt.subplots()
df.boxplot(column="budget_estimated_usd", by="relocation_intent", ax=box_ax)
box_ax.set_ylabel("Estimated monthly budget (USD)")
box_ax.set_title("Budget by relocation intent")
# pandas adds an automatic "Boxplot grouped by ..." super-title; blank it out.
box_fig.suptitle("")
box_fig.tight_layout()
box_fig.savefig(FIGURES_DIR / "budget_by_relocation_intent.png", dpi=300)
plt.show()


In [None]:
# Frequency table (missing values included) and bar chart for the dependents column.
dependents_col = df["dependents"]

print("Dependents band:")
print(dependents_col.value_counts(dropna=False))
plot_bar(dependents_col, "Dependents / family members moving", "dependents_distribution.png")


In [None]:
# Share of each relocation-intent answer within every dependents category
# (normalize="index" makes each row sum to 1).
intent_share_by_dependents = pd.crosstab(
    index=df["dependents"],
    columns=df["relocation_intent"],
    normalize="index",
)
intent_share_by_dependents

In [None]:
# Number of respondents listing each language, most common first.
# NOTE(review): `get_dummies(sep=",")` splits on the bare comma, so this
# assumes `languages_clean` holds comma-separated values without padding
# spaces; confirm against the step that produced the column.
lang_counts = (
    df["languages_clean"]
    .str.get_dummies(sep=",")
    .sum()
    .sort_values(ascending=False)
)

# A bare expression is only rendered when it is the last statement of a cell,
# so the table is printed explicitly instead of being silently discarded.
print("Language counts:")
print(lang_counts)

print("Preferred regions raw:")
print(df["preferred_regions"].value_counts(dropna=False).head(10))

In [None]:
# Headline numbers for the report.
# Rows with a missing `relocation_intent` compare unequal to "Yes", so they
# stay in the denominator of the share below.
relocation_yes_share = df["relocation_intent"].eq("Yes").mean()

summary = dict(
    n_responses=len(df),
    pct_seeking_relocation=(relocation_yes_share * 100).round(1),
    median_budget=df["budget_estimated_usd"].median(),
)

summary