## Data Acquisation and Cleaning

First, import the libraries and modules used, and use the Path module to create a 'safe' path for the data file, to be recognized in all OS

In [178]:
from pathlib import Path

import numpy as np # gotta check if np is ever used in my code
import pandas as pd
import matplotlib.pyplot as plt

DATA_DIR = Path().resolve()

Data_PATH = DATA_DIR / "raw_data" / "w1inhome_dvn.sav"

if not Data_PATH.exists():
    raise FileNotFoundError(f"Could not find dataset at {Data_PATH}")

plt.style.use("seaborn-v0_8-whitegrid")
pd.options.display.float_format = "{:.2f}".format # might want to change these plotting settings for later

Second, load the raw data into a DataFrame object, to perform cleaning operations

In [179]:
# now time for cleaning into a compact DataFrame:
import pyreadstat # to read the SPSS file
data, meta = pyreadstat.read_sav(Data_PATH) # data contains the data frame. meta contains the metadata and encodings for values     

we have a lot of variables, since this is microdata, from selected variables, I will compute new indices and discard old values used to compute them, this will drastically reduce the number of variables I will be working with.

In [180]:
data.head() # display first 5 entries

Unnamed: 0,AID,IMONTH,IDAY,IYEAR,SCH_YR,BIO_SEX,VERSION,SMP01,SMP03,H1GI1M,...,PD4A,PD4B,PD4C,PD4D,PD4E,PD4F,PD5,PD5A,AH_PVT,AH_RAW
0,57100270,6.0,23.0,95.0,1.0,2.0,4,0.0,1.0,10.0,...,7.0,7.0,7.0,7.0,7.0,7.0,1.0,1.0,86.0,55.0
1,57101310,5.0,5.0,95.0,1.0,2.0,1,1.0,0.0,11.0,...,,,,,,,,,88.0,58.0
2,57103171,6.0,27.0,95.0,0.0,1.0,4,1.0,0.0,10.0,...,7.0,7.0,7.0,7.0,7.0,7.0,1.0,0.0,120.0,79.0
3,57103869,7.0,14.0,95.0,0.0,1.0,4,1.0,0.0,1.0,...,7.0,7.0,7.0,7.0,7.0,7.0,,,85.0,56.0
4,57104553,7.0,14.0,95.0,1.0,2.0,4,1.0,0.0,6.0,...,7.0,7.0,7.0,7.0,7.0,7.0,1.0,0.0,90.0,59.0


From the in-home interview Notebook, I have obtained the codes for the variables that I am interested in, I have compiled them into a list, called ordered_columns.

The following are the codes with the corresponding interview question:


In [181]:
ordered_columns = ['AID', 'H1GI1M', 'H1GI1Y', 'BIO_SEX', 'H1GI9', 'IYEAR', 'H1SU1', 'H1SU2', 'H1RM1', 'H1RF1', 'H1FS1', 'H1FS2',
                    'H1FS3', 'H1FS4', 'H1FS5', 'H1FS6', 'H1FS7', 'H1FS8', 'H1FS9', 'H1FS10', 'H1FS11', 'H1FS12', 'H1FS13',
                      'H1FS14', 'H1FS15', 'H1FS16', 'H1FS17', 'H1FS18', 'H1GH18', 'H1GH21', 'H1PF16', 'H1WP10', 'H1PF1',
                        'H1PF2', 'H1PF3', 'H1PF4', 'H1WP9', 'H1WP14', 'H1PF23', 'H1PF24', 'H1PF25', 'H1WP13', 'H1WP1', 'H1WP2',
                          'H1WP3', 'H1WP4', 'H1WP5', 'H1WP6', 'H1WP7', 'H1FV1', 'H1FV2', 'H1FV3', 'H1FV4', 'H1FV5', 'H1FV6', 'H1FV7']

In [182]:
select_vars_data = data[ordered_columns] # in the new data frame, only include the ordered columns
select_vars_data.head()

Unnamed: 0,AID,H1GI1M,H1GI1Y,BIO_SEX,H1GI9,IYEAR,H1SU1,H1SU2,H1RM1,H1RF1,...,H1WP5,H1WP6,H1WP7,H1FV1,H1FV2,H1FV3,H1FV4,H1FV5,H1FV6,H1FV7
0,57100270,10.0,77.0,2.0,2.0,95.0,0.0,7.0,8.0,8.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
1,57101310,11.0,76.0,2.0,2.0,95.0,0.0,7.0,1.0,7.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,57103171,10.0,79.0,1.0,1.0,95.0,0.0,7.0,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,57103869,1.0,77.0,1.0,2.0,95.0,1.0,0.0,2.0,97.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,57104553,6.0,76.0,2.0,2.0,95.0,1.0,0.0,4.0,97.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Replacing Missingness-corresponding values with NaN:

Now that we have the dataframe with the 56 variables selected, we need to consider missing values.
In the codebook some values that are missing might have several numbers indicating it, so we need 
to replace it with NaN. While looking at the codebook, there were some questions that had different 
values for missingness, so one must accommodate for that fact.


In [183]:
# Common missing value keywords found in survey data labels
missing_keywords = ['refused', 'don\'t know', 'not applicable', 'legitimate skip', 
                    'missing', 'skip', 'na', 'dk', 'rf']

def extract_missing_codes(value_labels):
    """
    Extract codes that represent missing values based on label text.
    
    Parameters:
    -----------
    value_labels : dict
        Dictionary mapping numeric codes to their text labels
        
    Returns:
    --------
    list : List of codes that represent missing values
    """
    missing_codes = []
    for code, label in value_labels.items():
        label_lower = str(label).lower()
        # Check if any missing keyword appears in the label
        if any(keyword in label_lower for keyword in missing_keywords):
            missing_codes.append(code)
    return missing_codes

# Create a copy to avoid modifying the original dataframe
select_vars_data = select_vars_data.copy()

# Iterate through all selected variables
for col in ordered_columns:
    
    # Skip the ID column - we don't want to mark any IDs as missing
    if col == 'AID':
        continue
    
    # --- Method 1: Extract missing codes from value labels ---
    # This handles cases where missing values are defined in the codebook labels
    value_labels = meta.variable_value_labels.get(col, {})
    
    if value_labels:
        # Extract codes that represent missing values
        missing_codes = extract_missing_codes(value_labels)
        
        if missing_codes:
            # Create replacement dictionary mapping missing codes to NaN
            clean_dict = {code: np.nan for code in missing_codes}
            select_vars_data[col] = select_vars_data[col].replace(clean_dict)
    
    # --- Method 2: Exact missing values from metadata ---
    # This handles cases where missing values are explicitly defined in metadata
    missing_vals = meta.missing_user_values.get(col, [])
    
    if missing_vals:
        # Create dictionary mapping each missing value to NaN
        clean_dict = {int(val): np.nan for val in missing_vals}
        # Replace the missing value codes with NaN
        select_vars_data[col] = select_vars_data[col].replace(clean_dict)

    # --- Method 3: Range-based missingness from metadata ---
    # This handles cases where missing values fall within a specified range (e.g., 96-99)
    missing_ranges = meta.missing_ranges.get(col, [])

    for lo, hi in missing_ranges:
        lo, hi = int(lo), int(hi)
        # Mask all values within the range [lo, hi] as NaN
        select_vars_data[col] = select_vars_data[col].mask(
            select_vars_data[col].between(lo, hi), np.nan
        )

# Display summary of missing values after cleaning 
print(f"Total missing values across all columns: {select_vars_data.isna().sum().sum()}")
print(f"\nMissing values by column:")
print(select_vars_data.isna().sum())

Total missing values across all columns: 28341

Missing values by column:
AID           0
H1GI1M        3
H1GI1Y        3
BIO_SEX       1
H1GI9        79
IYEAR         0
H1SU1      5683
H1SU2      6274
H1RM1       427
H1RF1      2010
H1FS1        22
H1FS2        17
H1FS3        24
H1FS4        21
H1FS5        19
H1FS6        20
H1FS7        17
H1FS8        29
H1FS9        27
H1FS10       17
H1FS11       15
H1FS12       19
H1FS13       19
H1FS14       15
H1FS15       18
H1FS16       14
H1FS17       18
H1FS18       20
H1GH18        8
H1GH21        8
H1PF16       62
H1WP10      374
H1PF1       385
H1PF2       386
H1PF3       384
H1PF4       382
H1WP9       375
H1WP14     1957
H1PF23     1963
H1PF24     1965
H1PF25     1965
H1WP13     1957
H1WP1       151
H1WP2       142
H1WP3       141
H1WP4       141
H1WP5       142
H1WP6       141
H1WP7       139
H1FV1        53
H1FV2        49
H1FV3        48
H1FV4        47
H1FV5        49
H1FV6        50
H1FV7        46
dtype: int64


In [184]:
select_vars_data.head()

Unnamed: 0,AID,H1GI1M,H1GI1Y,BIO_SEX,H1GI9,IYEAR,H1SU1,H1SU2,H1RM1,H1RF1,...,H1WP5,H1WP6,H1WP7,H1FV1,H1FV2,H1FV3,H1FV4,H1FV5,H1FV6,H1FV7
0,57100270,10.0,77.0,2.0,2.0,95.0,,,8.0,8.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
1,57101310,11.0,76.0,2.0,2.0,95.0,,,1.0,7.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,57103171,10.0,79.0,1.0,1.0,95.0,,,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,57103869,1.0,77.0,1.0,2.0,95.0,1.0,,2.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,57104553,6.0,76.0,2.0,2.0,95.0,1.0,,4.0,,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now it turns out that I need to do some reverse-coding for some categories:

In [185]:
# Reverse code CES-D items (0–3 scale)
cesd_reverse_items = ["H1FS4", "H1FS8", "H1FS11", "H1FS15"]
for col in cesd_reverse_items:
    select_vars_data[col + "_r"] = 3 - select_vars_data[col]

# Reverse code impulsivity (1–5 scale)
select_vars_data["H1PF16_r"] = 6 - select_vars_data["H1PF16"]


In [186]:
print(select_vars_data[["H1FS4", "H1FS4_r"]].head())
select_vars_data[["H1PF16", "H1PF16_r"]].head()


   H1FS4  H1FS4_r
0   3.00     0.00
1   2.00     1.00
2   3.00     0.00
3   0.00     3.00
4   1.00     2.00


Unnamed: 0,H1PF16,H1PF16_r
0,4.0,2.0
1,2.0,4.0
2,5.0,1.0
3,5.0,1.0
4,2.0,4.0


# Calculation of indices:
I will compress the data corresponding to questions about one category into one index, then I will omitt the categories used for the computations

In [187]:
# 1. CES-D DEPRESSION SCORE (mean of 18 items)

cesd_items = [
    "H1FS1", "H1FS2", "H1FS3", "H1FS4_r", "H1FS5", "H1FS6",
    "H1FS7", "H1FS8_r", "H1FS9", "H1FS10", "H1FS11_r",
    "H1FS12", "H1FS13", "H1FS14", "H1FS15_r", "H1FS16",
    "H1FS17", "H1FS18"
]

select_vars_data["cesd_score"] = select_vars_data[cesd_items].mean(axis=1)


In [188]:
# impulsivity index:
select_vars_data["impulsivity"] = select_vars_data["H1PF16_r"]


In [189]:
# parents-present index:

select_vars_data["mother_present"] = (
    ~select_vars_data["H1RM1"].isin([97, np.nan])
).astype(int)

select_vars_data["father_present"] = (
    ~select_vars_data["H1RF1"].isin([97, 99, np.nan])
).astype(int)

In [190]:
warmth_items = [
    # Mother warmth/connection
    "H1WP10", "H1PF1", "H1PF2", "H1PF3", "H1PF4", "H1WP9",
    
    # Father warmth/connection
    "H1WP14", "H1PF23", "H1PF24", "H1PF25", "H1WP13"
]
# parental warmth index:
select_vars_data["parental_warmth"] = select_vars_data[warmth_items].mean(axis=1)


In [191]:
monitoring_items = [
    "H1WP1", "H1WP2", "H1WP3", "H1WP4",
    "H1WP5", "H1WP6", "H1WP7"
]
# parental monitoring index:
select_vars_data["parental_monitoring"] = select_vars_data[monitoring_items].mean(axis=1)


In [192]:
ace_items_victim = [
    "H1FV1", "H1FV2", "H1FV3", "H1FV4", "H1FV5", "H1FV6"
]
# index showing the vicimization to adverse childhood experiences
select_vars_data["ace_victim"] = (
    select_vars_data[ace_items_victim]
    .replace({1: 1, 2: 1})
    .sum(axis=1)
)


In [193]:
select_vars_data["suicidal_ideation"] = (
    select_vars_data["H1SU1"] == 1
).astype(int)

# binary value --> returns 1 if the respondent has considered suicide, 0 otherwise


In [194]:
select_vars_data["suicide_attempt"] = (
    select_vars_data["H1SU2"] > 0
).astype(int)

# binary value --> returns 1 if the respondent attempted suicide atleast once, 0 otherwise

In [195]:
# Compute simple age (year-level)
select_vars_data["age"] = (
    select_vars_data["IYEAR"] - select_vars_data["H1GI1Y"]
)

Renaming Columns and Annotating Categorical-corresponding Values:

In [None]:
rename_dict = {
    "AID": "id",
    "BIO_SEX": "sex",
    "H1GI9": "race",
    "IYEAR": "interview_year",
    "H1RM1": "mother_edu",
    "H1RF1": "father_edu",
    "H1SU1": "suicidal_thought_raw",
    "H1SU2": "suicide_attempt_raw",
    "H1GI1Y": "birth_year",     
    "H1GI1M": "birth_month"     
}

select_vars_data = select_vars_data.rename(columns=rename_dict)

# annotate sex

select_vars_data["sex_cat"] = select_vars_data["sex"].map({
    1.0: "Male",
    2.0: "Female"
})


# annotate race

select_vars_data["race_cat"] = select_vars_data["race"].map({
    1.0: "White",
    2.0: "Black",
    3.0: "Native American",
    4.0: "Asian/Pacific Islander",
    5.0: "Other"
})


# annnotate parent education


mother_map = {
    1: "8th grade or less",
    2: "Some high school",
    3: "Trade school (no HS)",
    4: "High school graduate",
    5: "GED",
    6: "Trade school (after HS)",
    7: "Some college",
    8: "College graduate",
    9: "Postgraduate",
    10: "Never attended school"
}

select_vars_data["mother_edu_cat"] = select_vars_data["mother_edu"].map(mother_map)

father_map = mother_map.copy()
father_map[10] = "Never attended school"

select_vars_data["father_edu_cat"] = select_vars_data["father_edu"].map(father_map)


# annotate suicidality (raw items)

select_vars_data["suicidal_thought_cat"] = select_vars_data["suicidal_thought_raw"].map({
    0.0: "No",
    1.0: "Yes"
})

select_vars_data["suicide_attempt_cat"] = select_vars_data["suicide_attempt_raw"].map({
    0.0: "None",
    1.0: "1 time",
    2.0: "2-3 times",
    3.0: "4-5 times",
    4.0: "6+ times"
})


Drop columns (Those used for computing indices)

In [197]:
columns_to_drop = [
    # raw demographics (keeping only categorical versions)
    "sex",
    "race",
    "mother_edu",
    "father_edu",

    # raw suicidality
    "suicidal_thought_raw",
    "suicide_attempt_raw",

    # CES-D raw items
    *[f"H1FS{i}" for i in range(1, 19)],

    # CES-D reverse-coded items
    "H1FS4_r", "H1FS8_r", "H1FS11_r", "H1FS15_r",

    # impulsivity raw item
    "H1PF16",

    # parental warmth raw items
    "H1PF1", "H1PF2", "H1PF3", "H1PF4",
    "H1PF23", "H1PF24", "H1PF25",

    # parent closeness/caring (raw)
    "H1WP9", "H1WP10", "H1WP13", "H1WP14",

    # autonomy/monitoring items
    "H1WP1", "H1WP2", "H1WP3", "H1WP4",
    "H1WP5", "H1WP6", "H1WP7",

    # violence raw items
    "H1FV1","H1FV2","H1FV3","H1FV4","H1FV5","H1FV6","H1FV7",

    # remove categorical suicidality if not needed
    "suicidal_thought_cat",
    "suicide_attempt_cat",

    # also variables corresponding to the impulsivity question, insomenia, and crying question
    "H1PF16_r",
    "H1GH18",
    "H1GH21"
]

select_vars_data = select_vars_data.drop(columns=columns_to_drop, errors="ignore")


In [198]:
select_vars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6504 entries, 0 to 6503
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   6504 non-null   object 
 1   birth_month          6501 non-null   float64
 2   birth_year           6501 non-null   float64
 3   interview_year       6504 non-null   float64
 4   cesd_score           6491 non-null   float64
 5   impulsivity          6442 non-null   float64
 6   mother_present       6504 non-null   int64  
 7   father_present       6504 non-null   int64  
 8   parental_warmth      6370 non-null   float64
 9   parental_monitoring  6367 non-null   float64
 10  ace_victim           6504 non-null   float64
 11  suicidal_ideation    6504 non-null   int64  
 12  suicide_attempt      6504 non-null   int64  
 13  age                  6501 non-null   float64
 14  sex_cat              6503 non-null   object 
 15  race_cat             6425 non-null   o

In [199]:
select_vars_data.head()

Unnamed: 0,id,birth_month,birth_year,interview_year,cesd_score,impulsivity,mother_present,father_present,parental_warmth,parental_monitoring,ace_victim,suicidal_ideation,suicide_attempt,age,sex_cat,race_cat,mother_edu_cat,father_edu_cat
0,57100270,10.0,77.0,95.0,0.5,2.0,1,1,2.82,0.71,1.0,0,0,18.0,Female,Black,College graduate,College graduate
1,57101310,11.0,76.0,95.0,0.61,4.0,1,1,2.55,1.0,0.0,0,0,19.0,Female,Black,8th grade or less,Some college
2,57103171,10.0,79.0,95.0,0.06,1.0,1,1,2.55,0.14,0.0,0,0,16.0,Male,White,High school graduate,Some high school
3,57103869,1.0,77.0,95.0,1.22,1.0,1,0,2.83,0.14,2.0,1,0,18.0,Male,Black,Some high school,
4,57104553,6.0,76.0,95.0,0.44,4.0,1,0,2.83,1.0,0.0,1,0,19.0,Female,Black,High school graduate,
