# Basic data analysis of astronauts' backgrounds (military service, academic major, etc.) and of astronaut selection.

In [None]:
# Import necessary libraries
import pandas as pd
import unicodedata
import numpy as np

# Read the datasets
# Both CSV files are decoded as latin1 instead of pandas' default UTF-8
# (presumably the files contain non-UTF-8 bytes, e.g. accented names — TODO confirm).
# Paths are relative, so the files must sit in the notebook's working directory.
astronauts_df = pd.read_csv('astronauts.csv', encoding='latin1')
international_df = pd.read_csv('International Astronaut Database.csv', encoding='latin1')

Unnamed: 0,Name,Year,Group,Status,Birth Date,Birth Place,Gender,Alma Mater,Undergraduate Major,Graduate Major,Military Rank,Military Branch,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr),Missions,Death Date,Death Mission
0,Joseph M. Acaba,2004.0,19.0,Active,5/17/1967,"Inglewood, CA",Male,University of California-Santa Barbara; Univer...,Geology,Geology,,,2,3307,2,13.0,"STS-119 (Discovery), ISS-31/32 (Soyuz)",,
1,Loren W. Acton,,,Retired,3/7/1936,"Lewiston, MT",Male,Montana State University; University of Colorado,Engineering Physics,Solar Physics,,,1,190,0,0.0,STS 51-F (Challenger),,
2,James C. Adamson,1984.0,10.0,Retired,3/3/1946,"Warsaw, NY",Male,US Military Academy; Princeton University,Engineering,Aerospace Engineering,Colonel,US Army (Retired),2,334,0,0.0,"STS-28 (Columbia), STS-43 (Atlantis)",,
3,Thomas D. Akers,1987.0,12.0,Retired,5/20/1951,"St. Louis, MO",Male,University of Missouri-Rolla,Applied Mathematics,Applied Mathematics,Colonel,US Air Force (Retired),4,814,4,29.0,"STS-41 (Discovery), STS-49 (Endeavor), STS-61 ...",,
4,Buzz Aldrin,1963.0,3.0,Retired,1/20/1930,"Montclair, NJ",Male,US Military Academy; MIT,Mechanical Engineering,Astronautics,Colonel,US Air Force (Retired),2,289,2,8.0,"Gemini 12, Apollo 11",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,David A. Wolf,1990.0,13.0,Retired,8/23/1956,"Indianapolis, IN",Male,Purdue University; Indiana University,Electrical Engineering,Medicine,,,3,4044,7,41.0,STS-58 (Columbia). STS-86/89 (Atlantis/Endeavo...,,
353,Neil W. Woodward III,1998.0,17.0,Retired,7/26/1962,"Chicago, IL",Male,MIT; University of Texas-Austin; George Washin...,Physics,Physics; Business Management,Commander,US Navy,0,0,0,0.0,,,
354,Alfred M. Worden,1966.0,5.0,Retired,2/7/1932,"Jackson, MI",Male,US Military Academy; University of Michigan,Military Science,Aeronautical & Astronautical Engineering,Colonel,US Air Force (Retired),1,295,1,0.5,Apollo 15,,
355,John W. Young,1962.0,2.0,Retired,9/24/1930,"San Francisco, CA",Male,Georgia Institute of Technology,Aeronautical Engineering,,Captain,US Navy (Retired),6,835,3,20.0,"Gemini 3, Gemini 10, Apollo 10, Apollo 16, STS...",,


In [21]:
# Normalize names for matching
def normalize_name(s: str):
  """Return a lowercase, ASCII-only, punctuation-free key for matching names.

  Steps: decompose accented characters (NFKD) and drop anything non-ASCII,
  lowercase, turn commas / periods / quotes into spaces, drop the generational
  suffixes "jr" and "sr", and collapse runs of whitespace.

  Parameters:
    s: the raw name; any non-missing value is converted with ``str()``.

  Returns:
    The normalized key, or ``None`` when ``s`` is missing (``pd.isna``).
  """
  if pd.isna(s):
    return None
  ascii_name = unicodedata.normalize("NFKD", str(s)).encode("ascii", "ignore").decode("ascii")
  cleaned = ascii_name.lower()
  for ch in (",", ".", "'", '"'):
    cleaned = cleaned.replace(ch, " ")
  tokens = cleaned.split()
  # Drop "jr"/"sr" only when they are whole tokens after the first word.
  # A plain substring replace of " sr" would also eat the start of surnames
  # such as "Srinivasan" and glue the remainder onto the given name.
  kept = tokens[:1] + [tok for tok in tokens[1:] if tok not in ("jr", "sr")]
  return " ".join(kept)

# Convert ddd:hh:mm format to total hours
# Make sure the time data matches for both datasets
def parse_dddhhmm(s):
  """Convert a ``ddd:hh:mm`` duration into total hours.

  Parameters:
    s: a string such as ``"012:03:30"`` (days:hours:minutes). Values that are
       not in that shape are tried as a plain number of hours via ``float()``.

  Returns:
    Total hours as a number, or ``np.nan`` when ``s`` is missing or cannot be
    interpreted either way.
  """
  if pd.isna(s):
    return np.nan
  try:
    d, h, m = map(int, s.split(":"))
    return d*24 + h + m/60
  except Exception:
    # Not a three-part integer string (or not a string at all): fall through
    # to the plain-number interpretation below.
    pass
  try:
    return float(s)
  except Exception:
    # Catch Exception rather than using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit are not silently turned into NaN.
    return np.nan

In [23]:
# Keys and helpers
# Normalized-name join key, built the same way on both tables.
international_df["name_key"] = international_df["Name"].map(normalize_name)
astronauts_df["name_key"] = astronauts_df["Name"].map(normalize_name)

# Birth year; "Birth Date" values that cannot be parsed are coerced to NaT
# and therefore produce a missing year.
astronauts_df["birth_year"] = (
    pd.to_datetime(astronauts_df["Birth Date"], errors="coerce")
      .dt.year
)

# Convert flight time to hours, then express the same quantity in days.
international_df["total_flight_hours"] = (
    international_df["Total Flight Time (ddd:hh:mm)"].map(parse_dddhhmm)
)
international_df["total_flight_days"] = international_df["total_flight_hours"].div(24.0)

In [24]:
# Merge the two datasets
# The international table is reduced to one row per name_key before joining:
#   * rows with a missing key are dropped, because pandas matches null join
#     keys against each other and normalize_name returns None for missing names;
#   * repeated keys are collapsed (first occurrence kept — an arbitrary choice),
#     otherwise the left join would duplicate astronaut rows and inflate every
#     downstream count and mean.
# validate="many_to_one" makes pandas raise if the right side is ever non-unique.
merged = astronauts_df.merge(
    international_df[["name_key", "Country", "Gender", "Total Flights", "total_flight_days"]]
        .dropna(subset=["name_key"])
        .drop_duplicates(subset="name_key", keep="first"),
    on="name_key",
    how="left",
    suffixes=("", "_iad"),
    validate="many_to_one",
)

merged

Unnamed: 0,Name,Year,Group,Status,Birth Date,Birth Place,Gender,Alma Mater,Undergraduate Major,Graduate Major,...,Space Walks (hr),Missions,Death Date,Death Mission,name_key,birth_year,Country,Gender_iad,Total Flights,total_flight_days
0,Joseph M. Acaba,2004.0,19.0,Active,5/17/1967,"Inglewood, CA",Male,University of California-Santa Barbara; Univer...,Geology,Geology,...,13.0,"STS-119 (Discovery), ISS-31/32 (Soyuz)",,,joseph m acaba,1967,United States,Man,3.0,305.024306
1,Loren W. Acton,,,Retired,3/7/1936,"Lewiston, MT",Male,Montana State University; University of Colorado,Engineering Physics,Solar Physics,...,0.0,STS 51-F (Challenger),,,loren w acton,1936,,,,
2,James C. Adamson,1984.0,10.0,Retired,3/3/1946,"Warsaw, NY",Male,US Military Academy; Princeton University,Engineering,Aerospace Engineering,...,0.0,"STS-28 (Columbia), STS-43 (Atlantis)",,,james c adamson,1946,,,,
3,Thomas D. Akers,1987.0,12.0,Retired,5/20/1951,"St. Louis, MO",Male,University of Missouri-Rolla,Applied Mathematics,Applied Mathematics,...,29.0,"STS-41 (Discovery), STS-49 (Endeavor), STS-61 ...",,,thomas d akers,1951,,,,
4,Buzz Aldrin,1963.0,3.0,Retired,1/20/1930,"Montclair, NJ",Male,US Military Academy; MIT,Mechanical Engineering,Astronautics,...,8.0,"Gemini 12, Apollo 11",,,buzz aldrin,1930,United States,Man,2.0,12.078472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,David A. Wolf,1990.0,13.0,Retired,8/23/1956,"Indianapolis, IN",Male,Purdue University; Indiana University,Electrical Engineering,Medicine,...,41.0,STS-58 (Columbia). STS-86/89 (Atlantis/Endeavo...,,,david a wolf,1956,,,,
353,Neil W. Woodward III,1998.0,17.0,Retired,7/26/1962,"Chicago, IL",Male,MIT; University of Texas-Austin; George Washin...,Physics,Physics; Business Management,...,0.0,,,,neil w woodward iii,1962,,,,
354,Alfred M. Worden,1966.0,5.0,Retired,2/7/1932,"Jackson, MI",Male,US Military Academy; University of Michigan,Military Science,Aeronautical & Astronautical Engineering,...,0.5,Apollo 15,,,alfred m worden,1932,,,,
355,John W. Young,1962.0,2.0,Retired,9/24/1930,"San Francisco, CA",Male,Georgia Institute of Technology,Aeronautical Engineering,,...,20.0,"Gemini 3, Gemini 10, Apollo 10, Apollo 16, STS...",,,john w young,1930,,,,


In [29]:
# Engineer the career-path features

# 1. Flag military background
# True when "Military Branch" is non-blank; missing values are treated as blank.
merged["is_military"] = merged["Military Branch"].fillna("").str.strip() != ""

# 2. Map undergraduate majors to broad categories
def map_major(major):
    """Bucket a free-text undergraduate major into a broad discipline.

    The major is lowercased and checked against keyword substrings in a fixed
    order; the first keyword found decides the bucket (so "Engineering Physics"
    is "engineering", not "physical_sciences"). Missing values and majors that
    match no keyword map to "other/unknown".
    """
    if pd.isna(major):
        return "other/unknown"

    text = str(major).lower().strip()
    # Order matters: earlier keywords take precedence over later ones.
    keyword_buckets = (
        ("engineer", "engineering"),
        ("physics", "physical_sciences"),
        ("chemistry", "physical_sciences"),
        ("bio", "life_sciences"),
        ("med", "medical"),
        ("math", "math"),
    )
    return next(
        (bucket for keyword, bucket in keyword_buckets if keyword in text),
        "other/unknown",
    )

merged["ug_discipline"] = merged["Undergraduate Major"].map(map_major)

# 3. Create broad career pathway categories
# np.select assigns the label of the FIRST condition that holds for each row:
#   military branch present      -> "Military"
#   else alma mater non-blank    -> "Academic"
#   else undergrad major present -> "STEM"
#   otherwise                    -> "Other"
# NOTE(review): because the alma-mater test comes before the major test, "STEM"
# is only reachable for non-military rows with a blank "Alma Mater" — confirm
# that this precedence is intended.
merged["pathway"] = np.select(
    condlist=[
        merged["is_military"],
        ~(merged["Alma Mater"].isna()
          | merged["Alma Mater"].astype(str).str.strip().eq("")),
        ~(merged["Undergraduate Major"].isna()
          | merged["Undergraduate Major"].astype(str).str.strip().eq("")),
    ],
    choicelist=["Military", "Academic", "STEM"],
    default="Other",
)

merged


Unnamed: 0,Name,Year,Group,Status,Birth Date,Birth Place,Gender,Alma Mater,Undergraduate Major,Graduate Major,...,Death Mission,name_key,birth_year,Country,Gender_iad,Total Flights,total_flight_days,is_military,pathway,ug_discipline
0,Joseph M. Acaba,2004.0,19.0,Active,5/17/1967,"Inglewood, CA",Male,University of California-Santa Barbara; Univer...,Geology,Geology,...,,joseph m acaba,1967,United States,Man,3.0,305.024306,False,Academic,other/unknown
1,Loren W. Acton,,,Retired,3/7/1936,"Lewiston, MT",Male,Montana State University; University of Colorado,Engineering Physics,Solar Physics,...,,loren w acton,1936,,,,,False,Academic,engineering
2,James C. Adamson,1984.0,10.0,Retired,3/3/1946,"Warsaw, NY",Male,US Military Academy; Princeton University,Engineering,Aerospace Engineering,...,,james c adamson,1946,,,,,True,Military,engineering
3,Thomas D. Akers,1987.0,12.0,Retired,5/20/1951,"St. Louis, MO",Male,University of Missouri-Rolla,Applied Mathematics,Applied Mathematics,...,,thomas d akers,1951,,,,,True,Military,math
4,Buzz Aldrin,1963.0,3.0,Retired,1/20/1930,"Montclair, NJ",Male,US Military Academy; MIT,Mechanical Engineering,Astronautics,...,,buzz aldrin,1930,United States,Man,2.0,12.078472,True,Military,engineering
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,David A. Wolf,1990.0,13.0,Retired,8/23/1956,"Indianapolis, IN",Male,Purdue University; Indiana University,Electrical Engineering,Medicine,...,,david a wolf,1956,,,,,False,Academic,engineering
353,Neil W. Woodward III,1998.0,17.0,Retired,7/26/1962,"Chicago, IL",Male,MIT; University of Texas-Austin; George Washin...,Physics,Physics; Business Management,...,,neil w woodward iii,1962,,,,,True,Military,physical_sciences
354,Alfred M. Worden,1966.0,5.0,Retired,2/7/1932,"Jackson, MI",Male,US Military Academy; University of Michigan,Military Science,Aeronautical & Astronautical Engineering,...,,alfred m worden,1932,,,,,True,Military,other/unknown
355,John W. Young,1962.0,2.0,Retired,9/24/1930,"San Francisco, CA",Male,Georgia Institute of Technology,Aeronautical Engineering,,...,,john w young,1930,,,,,True,Military,engineering


# 1. Pathway distribution & outcomes

In [31]:
# Per-pathway head count and flight outcomes.
# multi_mission is True when either source reports more than one flight
# (missing counts are treated as 0). pct_multi_mission is the mean of that
# flag, i.e. a fraction between 0 and 1 rather than a percentage.
pathway_summary = (
    merged
    .assign(
        multi_mission=lambda frame: (
            frame["Total Flights"].fillna(0).gt(1)
            | frame["Space Flights"].fillna(0).gt(1)
        )
    )
    .groupby("pathway", dropna=False)
    .agg(
        astronauts=pd.NamedAgg(column="Name", aggfunc="count"),
        avg_total_flights=pd.NamedAgg(column="Total Flights", aggfunc="mean"),
        pct_multi_mission=pd.NamedAgg(column="multi_mission", aggfunc="mean"),
        avg_total_days_in_space=pd.NamedAgg(column="total_flight_days", aggfunc="mean"),
    )
    .reset_index()
)

pathway_summary

Unnamed: 0,pathway,astronauts,avg_total_flights,pct_multi_mission,avg_total_days_in_space
0,Academic,146,2.135135,0.630137,106.139039
1,Military,211,2.568627,0.748815,50.566953


# 2. Military vs civilian comparison

In [33]:
# Head count and mean flight outcomes, split by the is_military flag.
# "Total Flights" and "total_flight_days" come from the international table,
# so their means only reflect rows that found a match in the merge.
military_vs_civil = (
    merged
    .groupby("is_military")
    .agg(
        astronauts=pd.NamedAgg(column="Name", aggfunc="count"),
        avg_total_flights=pd.NamedAgg(column="Total Flights", aggfunc="mean"),
        avg_days_in_space=pd.NamedAgg(column="total_flight_days", aggfunc="mean"),
    )
    .reset_index()
)

military_vs_civil

Unnamed: 0,is_military,astronauts,avg_total_flights,avg_days_in_space
0,False,146,2.135135,106.139039
1,True,211,2.568627,50.566953


# 3. Top undergraduate major choices

In [36]:
# Top 15 undergraduate majors by head count, with mean flight outcomes.
major_summary = (
    merged
    # Strip whitespace on the grouping column itself so that variants such as
    # "Physics " and "Physics" fall into one group. Assigning the stripped
    # values to a differently named column would leave the groupby operating
    # on the raw text. Missing majors stay missing (no astype(str), which would
    # turn them into the literal "nan") and are dropped by groupby.
    .assign(**{"Undergraduate Major": merged["Undergraduate Major"].str.strip()})
    .groupby("Undergraduate Major")
    .agg(astronauts=("Name", "count"),
         avg_total_flights=("Total Flights", "mean"),
         avg_days_in_space=("total_flight_days", "mean"))
    .sort_values("astronauts", ascending=False)
    .head(15)
    .reset_index()
)

# Present a 1-based rank instead of the default 0-based index.
major_summary.index = major_summary.index + 1
major_summary.index.name = "Rank"

major_summary

Unnamed: 0_level_0,Undergraduate Major,astronauts,avg_total_flights,avg_days_in_space
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Physics,35,2.444444,27.698688
2,Aerospace Engineering,33,2.444444,266.232485
3,Mechanical Engineering,30,2.428571,57.228968
4,Aeronautical Engineering,28,2.5,24.446875
5,Electrical Engineering,23,1.666667,15.171065
6,Engineering Science,13,3.5,28.157292
7,Engineering,12,,
8,Mathematics,11,3.333333,35.844676
9,Chemistry,10,2.5,28.703125
10,Chemical Engineering,9,2.5,71.543403
