In [2]:
import os
import requests
import pandas as pd

In [3]:
# NHANES survey cycles to fetch; append further "YYYY-YYYY" labels as needed.
years = ['2015-2016', '2017-2018']

# URL prefix for one cycle; "{year}" is replaced by a label from `years`.
base_url = "https://wwwn.cdc.gov/nchs/nhanes/{year}/"

# Files requested for every cycle, as (dataset key, local file name) pairs.
# The order of the pairs is the order in which files are downloaded and merged.
# NOTE(review): the recorded run of this cell ends in a pandas ParserError when
# the saved files are read as CSV, and a later cell loads `.xpt` files with
# per-cycle suffixes instead -- these names/URLs may not point at real CSV
# files; confirm before relying on them.
datasets = dict([
    ('DEMO', 'DEMO_G.csv'),        # Demographics
    ('DR1TOTX', 'DR1TOTX.csv'),    # Dietary (First Recall)
    ('DR2TOTX', 'DR2TOTX.csv'),    # Dietary (Second Recall)
    ('LAB', 'LAB_G.csv'),          # Laboratory
    ('EXAM', 'EXAM_G.csv'),        # Physical Exam
    ('RXQ', 'RXQ_G.csv'),          # Medications
])

# Function to download files
def download_file(url, filename, timeout=30):
    """Download ``url`` and save the response body to ``filename``.

    Args:
        url: Address to fetch with an HTTP GET.
        filename: Path of the local file to write (overwritten if it exists).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        True if the server answered with status 200 and the file was written,
        otherwise False. Failures are reported with ``print`` instead of being
        raised so that a batch of downloads can carry on with the next file.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts: report and let the caller continue.
        print(f"Failed to download {filename}. ({exc})")
        return False

    if response.status_code != 200:
        # Include the status so a 404 can be told apart from a server error.
        print(f"Failed to download {filename}. (HTTP {response.status_code})")
        return False

    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded: {filename}")
    return True

# Shared by the download and load steps so both agree on where files live.
def _cycle_dir(year):
    """Return the folder under 'nhanes_data' that holds one cycle's files."""
    return os.path.join('nhanes_data', str(year))


# Download the data for each year and dataset
def download_nhanes_data(years, datasets):
    """Download every file in ``datasets`` for every cycle in ``years``.

    Files are saved as ``nhanes_data/<year>/<filename>``. The file names in
    ``datasets`` do not vary by cycle, so writing them all into one folder
    would make each cycle overwrite the previous one.

    Args:
        years: Iterable of cycle labels substituted into ``base_url``.
        datasets: Mapping of dataset key -> local file name.
    """
    os.makedirs('nhanes_data', exist_ok=True)

    for year in years:
        cycle_dir = _cycle_dir(year)
        os.makedirs(cycle_dir, exist_ok=True)

        for dataset, filename in datasets.items():
            # NOTE(review): the URL ends with the dataset key, not the file
            # name -- confirm this is the address the server actually serves.
            url = base_url.format(year=year) + dataset
            download_file(url, os.path.join(cycle_dir, filename))


# Load each cycle's files, merge them per cycle, then stack the cycles
def load_and_merge_data(years, datasets):
    """Merge the downloaded tables of each cycle and stack all cycles.

    For every cycle the files found in ``nhanes_data/<year>/`` are read as CSV
    and outer-merged on ``SEQN``. The per-cycle result is written to
    ``nhanes_data/merged_nhanes_<year>.csv``. Missing files are skipped.

    Args:
        years: Iterable of cycle labels (sub-folder names).
        datasets: Mapping of dataset key -> file name inside the cycle folder.

    Returns:
        One DataFrame with the rows of all cycles stacked and a ``Cycle``
        column naming the cycle each row came from; an empty DataFrame when no
        file was found for any cycle.
    """
    cycle_frames = []

    for year in years:
        # Start from scratch for every cycle so one cycle's columns are never
        # merged into another's; None (not `.empty`) marks "nothing loaded yet"
        # so a legitimately empty first table is not discarded.
        year_df = None

        for dataset, filename in datasets.items():
            file_path = os.path.join(_cycle_dir(year), filename)
            if not os.path.exists(file_path):
                continue

            df = pd.read_csv(file_path)
            if year_df is None:
                year_df = df
            else:
                year_df = pd.merge(year_df, df, on='SEQN', how='outer')

        if year_df is None:
            print(f"No data files found for {year}; nothing merged")
            continue

        year_filename = f"merged_nhanes_{year}.csv"
        year_df.to_csv(os.path.join('nhanes_data', year_filename), index=False)
        print(f"Data for {year} merged and saved as {year_filename}")

        cycle_frames.append(year_df.assign(Cycle=year))

    if not cycle_frames:
        return pd.DataFrame()

    # Cycles are stacked row-wise rather than merged on SEQN, so columns shared
    # between cycles line up instead of being duplicated with suffixes.
    return pd.concat(cycle_frames, ignore_index=True)

# Step 1: Download every file listed in `datasets` for every cycle in `years`
# (network access required; files are written under 'nhanes_data').
download_nhanes_data(years, datasets)

# Step 2: Read the downloaded files back and merge them on SEQN.
# NOTE(review): the recorded run of this cell stops with a pandas ParserError
# ("Error tokenizing data"), so the downloaded files were apparently not valid
# CSV -- confirm the source URLs/format before trusting the steps below.
merged_data = load_and_merge_data(years, datasets)

# Step 3: Inspect the merged data (shape, then the first rows)
print(f"Merged data shape: {merged_data.shape}")
print(merged_data.head())

# Optionally save the final merged dataset (across all years) to the current
# working directory, without the DataFrame index.
merged_data.to_csv('merged_nhanes_all_years.csv', index=False)


Downloaded: nhanes_data\DEMO_G.csv
Downloaded: nhanes_data\DR1TOTX.csv
Downloaded: nhanes_data\DR2TOTX.csv
Downloaded: nhanes_data\LAB_G.csv
Downloaded: nhanes_data\EXAM_G.csv
Downloaded: nhanes_data\RXQ_G.csv
Downloaded: nhanes_data\DEMO_G.csv
Downloaded: nhanes_data\DR1TOTX.csv
Downloaded: nhanes_data\DR2TOTX.csv
Downloaded: nhanes_data\LAB_G.csv
Downloaded: nhanes_data\EXAM_G.csv
Downloaded: nhanes_data\RXQ_G.csv


ParserError: Error tokenizing data. C error: Expected 1 fields in line 10, saw 3


In [1]:
!pip install pyreadstat

Collecting pyreadstat
  Downloading pyreadstat-1.2.8-cp312-cp312-win_amd64.whl.metadata (1.1 kB)
Downloading pyreadstat-1.2.8-cp312-cp312-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
    --------------------------------------- 0.0/2.4 MB 325.1 kB/s eta 0:00:08
   - -------------------------------------- 0.1/2.4 MB 871.5 kB/s eta 0:00:03
   -------- ------------------------------- 0.5/2.4 MB 3.2 MB/s eta 0:00:01
   ------------------------------- -------- 1.9/2.4 MB 9.3 MB/s eta 0:00:01
   ---------------------------------------- 2.4/2.4 MB 10.3 MB/s eta 0:00:00
Installing collected packages: pyreadstat
Successfully installed pyreadstat-1.2.8



[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import requests
from io import BytesIO

def load_nhanes_data(url, timeout=60):
    """Download an NHANES SAS transport (XPORT) file and load it as a DataFrame.

    Args:
        url: Address of the `.xpt` file to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The file's contents as parsed by ``pandas.read_sas(format='xport')``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or a timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an error if the download failed
    # The payload is parsed straight from memory; nothing is written to disk.
    return pd.read_sas(BytesIO(response.content), format='xport')

# Dataset URLs for the two NHANES cycles used below, keyed by the first year of
# the cycle. File names carry a per-cycle suffix (J = 2017-2018 cycle,
# I = 2015-2016 cycle). The DR1TOT files use an upper-case ".XPT" extension,
# exactly as in the addresses this mapping has always contained.
cycles = {
    start_year: {
        key: (
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/"
            f"{start_year}/DataFiles/{stem}_{suffix}.{ext}"
        )
        for key, stem, ext in (
            ("dr1iff", "DR1IFF", "xpt"),
            ("dr1tot", "DR1TOT", "XPT"),
            ("diq", "DIQ", "xpt"),
            ("glu", "GLU", "xpt"),
            ("bmx", "BMX", "xpt"),
            ("demo", "DEMO", "xpt"),
        )
    }
    for start_year, suffix in (("2017", "J"), ("2015", "I"))
}

# Columns kept from each NHANES file, mapped to reader-friendly names.
# Insertion order matters: tables are downloaded and merged in this order,
# which also fixes the column order of the result.
DATASET_COLUMNS = {
    "demo": {
        "SEQN": "ParticipantID",
        "RIDAGEYR": "Age",
        "RIAGENDR": "Gender",            # 1 = Male, 2 = Female
        "RIDRETH1": "RaceEthnicity",
    },
    "dr1tot": {
        "SEQN": "ParticipantID",
        "DR1TKCAL": "TotalCalories",
        "DR1TPROT": "Protein",
        "DR1TCARB": "Carbohydrates",
        "DR1TTFAT": "TotalFat",
        "DR1TSFAT": "SaturatedFat",
    },
    "dr1iff": {
        "SEQN": "ParticipantID",
        "DR1IFDCD": "FoodCode",
    },
    "diq": {
        "SEQN": "ParticipantID",
        "DIQ010": "DiabetesDiagnosis",   # Typically: 1 = Yes, 2 = No
    },
    "glu": {
        "SEQN": "ParticipantID",
        "LBXGLU": "BloodGlucose",
    },
    "bmx": {
        "SEQN": "ParticipantID",
        "BMXWT": "Weight",
        "BMXHT": "Height",
        "BMXBMI": "BMI",
    },
}


def build_cycle_frame(cycle, url_dict):
    """Download, subset, rename and merge the tables of one NHANES cycle.

    Args:
        cycle: Label stored in the ``Cycle`` column of the result.
        url_dict: Mapping of dataset key (the keys of ``DATASET_COLUMNS``) to
            the URL of that dataset's XPT file.

    Returns:
        A DataFrame holding the inner join of all tables on ``ParticipantID``
        plus a ``Cycle`` column. Participants missing from any table are
        dropped. The result has one row per (participant, food code) pair, not
        one row per participant, because the ``dr1iff`` table contributes
        several rows per participant -- the participant-level values are
        repeated on each of those rows.

    Raises:
        KeyError: If ``url_dict`` lacks a dataset key or a file lacks one of
            the expected columns.
    """
    merged = None
    for dataset, column_names in DATASET_COLUMNS.items():
        table = (
            load_nhanes_data(url_dict[dataset])[list(column_names)]
            .rename(columns=column_names)
        )
        if merged is None:
            merged = table
        else:
            merged = merged.merge(table, on="ParticipantID", how="inner")

    # Tag every row with its cycle so cycles stay distinguishable once stacked.
    return merged.assign(Cycle=cycle)


merged_cycles = []

# Process each cycle separately
for cycle, url_dict in cycles.items():
    print(f"Processing NHANES cycle: {cycle}")
    merged_cycles.append(build_cycle_frame(cycle, url_dict))

# Concatenate data from all cycles into a single DataFrame
combined_df = pd.concat(merged_cycles, ignore_index=True)

# Save the combined dataset to CSV
combined_df.to_csv("combined_nhanes_data.csv", index=False)
print("Combined dataset saved to 'combined_nhanes_data.csv'")


Processing NHANES cycle: 2017
Processing NHANES cycle: 2015
Combined dataset saved to 'combined_nhanes_data.csv'


In [5]:
combined_df.shape

(84030, 16)

In [6]:
combined_df.head()

Unnamed: 0,ParticipantID,Age,Gender,RaceEthnicity,TotalCalories,Protein,Carbohydrates,TotalFat,SaturatedFat,FoodCode,DiabetesDiagnosis,BloodGlucose,Weight,Height,BMI,Cycle
0,93708.0,66.0,2.0,5.0,1251.0,50.96,123.71,65.49,17.446,53233060.0,3.0,122.0,53.5,150.2,23.7,2017
1,93708.0,66.0,2.0,5.0,1251.0,50.96,123.71,65.49,17.446,94100100.0,3.0,122.0,53.5,150.2,23.7,2017
2,93708.0,66.0,2.0,5.0,1251.0,50.96,123.71,65.49,17.446,54403054.0,3.0,122.0,53.5,150.2,23.7,2017
3,93708.0,66.0,2.0,5.0,1251.0,50.96,123.71,65.49,17.446,92101000.0,3.0,122.0,53.5,150.2,23.7,2017
4,93708.0,66.0,2.0,5.0,1251.0,50.96,123.71,65.49,17.446,11100000.0,3.0,122.0,53.5,150.2,23.7,2017


In [7]:
combined_df.ParticipantID.nunique()

5707

In [8]:
combined_df.columns

Index(['ParticipantID', 'Age', 'Gender', 'RaceEthnicity', 'TotalCalories',
       'Protein', 'Carbohydrates', 'TotalFat', 'SaturatedFat', 'FoodCode',
       'DiabetesDiagnosis', 'BloodGlucose', 'Weight', 'Height', 'BMI',
       'Cycle'],
      dtype='object')