# Basic data analysis of astronauts' backgrounds (military service, undergraduate major, etc.) and astronaut selection

In [2]:
# Import necessary libraries
import pandas as pd
import unicodedata
import numpy as np

# Read the datasets
# Both CSV files are decoded as latin1.
astronauts_df, international_df = (
    pd.read_csv(csv_path, encoding='latin1')
    for csv_path in (
        'Datasets/astronauts.csv',
        'Datasets/International Astronaut Database.csv',
    )
)

In [3]:
# Normalize names for matching
def normalize_name(s: str):
  """Build a case-, accent- and punctuation-insensitive key for matching names.

  Steps: strip accents (NFKD decomposition, non-ASCII characters dropped),
  lowercase, turn commas, periods and quotes into spaces, drop the
  generational suffix tokens "jr" / "sr", and collapse runs of whitespace.

  Args:
    s: Raw name. Non-string values are converted with ``str()``.

  Returns:
    The normalized key, or ``None`` when ``s`` is missing (``pd.isna``).
    The key can be an empty string if nothing survives normalization.
  """
  if pd.isna(s):
    return None
  ascii_name = unicodedata.normalize("NFKD", str(s)).encode("ascii", "ignore").decode("ascii")
  cleaned = ascii_name.lower().strip()
  for ch in (",", ".", "'", '"'):
    cleaned = cleaned.replace(ch, " ")
  tokens = cleaned.split()
  # Drop "jr"/"sr" only when they are whole words after the first token.
  # A plain substring replace of " jr"/" sr" would also eat the start of a
  # later word, e.g. "kalpana srinath" -> "kalpanainath".
  kept = [tok for pos, tok in enumerate(tokens) if pos == 0 or tok not in ("jr", "sr")]
  return " ".join(kept)

# Convert ddd:hh:mm format to total hours
# Make sure the time data matches for both datasets
def parse_dddhhmm(s):
  """Convert a ``ddd:hh:mm`` duration into total hours.

  Args:
    s: Duration such as ``"012:06:30"``. A value that is not in
      ``ddd:hh:mm`` form falls back to ``float(s)`` and is returned as that
      number unchanged.

  Returns:
    Total hours as a float (``days*24 + hours + minutes/60``), or ``np.nan``
    when the value is missing or cannot be parsed either way.
  """
  if pd.isna(s):
    return np.nan
  try:
    days, hours, minutes = map(int, s.split(":"))
  except (AttributeError, TypeError, ValueError):
    # Not a three-part colon-separated string (or not a string at all):
    # accept a bare number instead. Only parsing errors are caught here so
    # that KeyboardInterrupt / SystemExit still propagate.
    try:
      return float(s)
    except (TypeError, ValueError, OverflowError):
      return np.nan
  return days * 24 + hours + minutes / 60

In [4]:
# Keys and helpers
# Give both tables the same normalized join key.
for frame in (astronauts_df, international_df):
    frame["name_key"] = frame["Name"].map(normalize_name)

# Birth dates that cannot be parsed are coerced to NaT, so their year is missing.
birth_dates = pd.to_datetime(astronauts_df["Birth Date"], errors="coerce")
astronauts_df["birth_year"] = birth_dates.dt.year

# Convert flight time to hours, then to days
flight_hours = international_df["Total Flight Time (ddd:hh:mm)"].map(parse_dddhhmm)
international_df["total_flight_hours"] = flight_hours
international_df["total_flight_days"] = flight_hours / 24.0

In [5]:
# Merge the two datasets
# Keep at most one international record per name key. pandas' merge pairs
# missing keys with each other and repeats a left row for every matching
# right row; either would add or mis-attribute astronaut rows and inflate the
# counts computed from `merged`. When a key repeats, the first record is kept.
iad_columns = ["name_key", "Country", "Gender", "Total Flights", "total_flight_days"]
iad_lookup = (
    international_df[iad_columns]
    .dropna(subset=["name_key"])
    .drop_duplicates(subset="name_key", keep="first")
)

# Left join: every astronauts_df row is kept, matched or not. Column names
# present in both tables get an "_iad" suffix on the international side.
merged = astronauts_df.merge(
    iad_lookup,
    on="name_key",
    how="left",
    suffixes=("", "_iad"),
    validate="many_to_one",  # guard: the lookup side must have unique keys
)

In [6]:
# Engineer the career-path features

# 1. Flag military background
# Missing or whitespace-only branch entries are flagged False.
military_branch = merged["Military Branch"].fillna("").str.strip()
merged["is_military"] = military_branch != ""

# 2. Map undergraduate majors to broad categories
def map_major(major):
    """Bucket a free-text undergraduate major into a broad discipline.

    Matching is a case-insensitive substring test and the first keyword that
    matches wins, so "Biomedical Engineering" maps to "engineering".
    Missing values and majors without any keyword map to "other/unknown".
    """
    if pd.isna(major):
        return "other/unknown"

    # Checked in order: earlier entries take precedence over later ones.
    keyword_to_discipline = (
        ("engineer", "engineering"),
        ("physics", "physical_sciences"),
        ("chemistry", "physical_sciences"),
        ("bio", "life_sciences"),
        ("med", "medical"),
        ("math", "math"),
    )
    text = str(major).lower().strip()
    return next(
        (discipline for keyword, discipline in keyword_to_discipline if keyword in text),
        "other/unknown",
    )

# Broad discipline for each astronaut's undergraduate major
merged["ug_discipline"] = merged["Undergraduate Major"].map(map_major)

# 3. Create broad career pathway categories
def _has_text(column):
    """True where a value is present and is not just whitespace."""
    return column.notna() & (column.astype(str).str.strip() != "")

# np.select uses the first condition that holds for a row, so the order below
# is a priority order: the military flag outranks the other two labels.
# NOTE(review): "STEM" is given to any row with a non-blank major, whatever
# the field, and ug_discipline is not consulted — confirm this is intended.
pathway_rules = [
    (merged["is_military"], "Military"),
    (_has_text(merged["Alma Mater"]), "Academic"),
    (_has_text(merged["Undergraduate Major"]), "STEM"),
]
merged["pathway"] = np.select(
    [mask for mask, label in pathway_rules],
    [label for mask, label in pathway_rules],
    default="Other",
)


# 1. Pathway distribution & outcomes

In [7]:
# A row counts as multi-mission when either flight-count column exceeds one;
# missing counts are treated as zero.
flew_more_than_once = (
    merged["Total Flights"].fillna(0).gt(1)
    | merged["Space Flights"].fillna(0).gt(1)
)

pathway_metrics = {
    "astronauts": ("Name", "count"),  # rows with a non-null Name
    "avg_total_flights": ("Total Flights", "mean"),
    "pct_multi_mission": ("multi_mission", "mean"),  # mean of a boolean: a 0-1 share
    "avg_total_days_in_space": ("total_flight_days", "mean"),
}

# dropna=False keeps a group for rows whose pathway is missing.
pathway_summary = (
    merged.assign(multi_mission=flew_more_than_once)
    .groupby("pathway", dropna=False)
    .agg(**pathway_metrics)
    .reset_index()
)

# 2. Military vs civilian comparison

In [8]:
# One output row per value of the is_military flag.
military_metrics = {
    "astronauts": ("Name", "count"),  # rows with a non-null Name
    "avg_total_flights": ("Total Flights", "mean"),
    "avg_days_in_space": ("total_flight_days", "mean"),
}

by_military_flag = merged.groupby("is_military")
military_vs_civil = by_military_flag.agg(**military_metrics).reset_index()

# 3. Top undergraduate major choices

In [9]:
# Strip stray whitespace so that spelling variants of one major share a group.
# The stripped values must replace the "Undergraduate Major" column itself:
# writing them to a differently named column leaves the groupby below working
# on the raw text. Blank entries become missing, and groupby leaves missing
# keys out of the result.
stripped_major = merged["Undergraduate Major"].str.strip()
stripped_major = stripped_major.mask(stripped_major == "")

major_summary = (
    merged.assign(**{"Undergraduate Major": stripped_major})
          .groupby("Undergraduate Major")
          .agg(astronauts=("Name","count"),
               avg_total_flights=("Total Flights","mean"),
               avg_days_in_space=("total_flight_days","mean"))
          .sort_values("astronauts", ascending=False)
          .head(15)  # keep the 15 most common majors
          .reset_index()
)

# Writing separate CSV files for the outcomes

In [49]:
# Career pathway summary
# index=False leaves the row index out of the file.
pathway_summary.to_csv(path_or_buf="career_pathway_summary.csv", index=False)

In [52]:
# Top Undergraduate Majors
# Assign a fresh 1-based "Rank" index instead of adding 1 to the current one,
# so re-running this cell does not shift the ranks again (2..n+1, 3..n+2, ...).
major_summary.index = pd.RangeIndex(start=1, stop=len(major_summary) + 1, name="Rank")
# The index is written out on purpose: it becomes the "Rank" column.
major_summary.to_csv("top_undergraduate_majors_outcomes.csv")

In [54]:
# Military vs Civilian Outcomes
# index=False leaves the row index out of the file.
military_vs_civil.to_csv(path_or_buf="military_vs_civil_outcomes.csv", index=False)

In [55]:
# Merged Dataset
# Full joined table, including the engineered columns; row index not written.
merged.to_csv(path_or_buf="merged_astronauts_dataset.csv", index=False)