In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load Stata .dta file and convert to CSV
gss_2020 = pd.read_stata('Data/GSS/GSS_panel2020_r1a.dta', convert_categoricals=False)

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  gss_2020 = pd.read_stata('Data/GSS/GSS_panel2020_r1a.dta', convert_categoricals=False)


In [5]:
gss_2020.head()

Unnamed: 0,samptype,yearid,fileversion,panstat,wtssall_1a,wtssall_1b,wtssall_2,wtssnr_1a,wtssnr_1b,wtssnr_2,...,sprtlrgr_2,sprtpurp_2,poltrtblk_2,poltrthsp_2,defund_2,strvbias_2,wrycovid_2,wrypaybills_2,wrygetsick_2,anesid
0,2016,20160001,GSS 2020 Panel Release 1 (May 2021),1,0.956994,,1.085009,1.260478,,1.443929,...,7.0,7.0,2.0,4.0,2.0,2.0,3.0,4.0,3.0,169657.0
1,2016,20160002,GSS 2020 Panel Release 1 (May 2021),1,0.478497,,0.542504,0.630239,,0.721964,...,7.0,7.0,4.0,4.0,1.0,2.0,4.0,4.0,3.0,169664.0
2,2016,20160003,GSS 2020 Panel Release 1 (May 2021),0,0.956994,,,1.260478,,,...,,,,,,,,,,
3,2016,20160004,GSS 2020 Panel Release 1 (May 2021),1,1.913987,,2.170018,2.520956,,2.887858,...,5.0,4.0,4.0,4.0,2.0,1.0,3.0,4.0,4.0,
4,2016,20160005,GSS 2020 Panel Release 1 (May 2021),0,1.43549,,,1.890717,,,...,,,,,,,,,,


In [8]:
# Expect one row per respondent per wave, columns listed below:
BINARY_ITEMS_2016 = ["abdefect_1a","abnomore_1a","abany_1a","abhlth_1a","abpoor_1a","abrape_1a","absingle_1a"]
BINARY_ITEMS_2018 = ["abdefect_1b","abnomore_1b","abany_1b","abhlth_1b","abpoor_1b","abrape_1b","absingle_1b"]
BINARY_ITEMS_2020 = ["abdefect_2","abnomore_2","abany_2","abhlth_2","abpoor_2","abrape_2","absingle_2"]

# Map raw responses → 1/0 (edit as needed to match your GSS coding)
YES_VALUES = {"yes","Yes","YES",1, "1", True}
NO_VALUES  = {"no","No","NO",2, "2", False}

def to_binary(s):
    if pd.isna(s): return np.nan
    return 1 if s in YES_VALUES else (0 if s in NO_VALUES else np.nan)

def prepare_binary_2016(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for col in BINARY_ITEMS_2016:
        if col in out.columns:
            out[col+"_bin"] = out[col].apply(to_binary).astype("float")
    return out

def prepare_binary_2018(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for col in BINARY_ITEMS_2018:
        if col in out.columns:
            out[col+"_bin"] = out[col].apply(to_binary).astype("float")
    return out

def prepare_binary_2020(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for col in BINARY_ITEMS_2020:
        if col in out.columns:
            out[col+"_bin"] = out[col].apply(to_binary).astype("float")
    return out

def collapse_abortion_attitudes(df: pd.DataFrame, id_col="yearid", wave_col="samptype", binary_items=None):
    """
    Collapse 7 binary GSS abortion items into a single 5-level categorical variable.
    
    - Input: df with columns abdefect ... absingle coded 0/1 (NaN allowed).
    - Output: df with new columns:
        'n_yes'  (count of yes across 7 items)
        'att5'   (collapsed attitude category)
    """
    out = df.copy()

    # Count number of "yes" responses (ignore NaN)
    out["n_yes"] = out[binary_items].sum(axis=1, skipna=True)

    # Map counts into categories
    def map_to_cat(n_yes):
        if pd.isna(n_yes):
            return np.nan
        n_yes = int(n_yes)
        if n_yes <= 1:
            return "strong_anti"
        elif n_yes <= 3:
            return "anti"
        elif n_yes == 4:
            return "neutral"
        elif n_yes <= 6:
            return "pro"
        else:  # n_yes == 7
            return "strong_pro"

    out["att5"] = out["n_yes"].apply(map_to_cat)

    return out[[id_col, wave_col, "n_yes", "att5"]]

# def composite_score_binary(gss_bin: pd.DataFrame, id_col="yearid", wave_col="samptype"):
#     # compute per-row mean of available items
#     item_cols = [c for c in gss_bin.columns if c.endswith("_bin")]
#     df = gss_bin[[id_col, wave_col] + item_cols].copy()
#     df["prop_yes"] = df[item_cols].mean(axis=1, skipna=True)
#     # z-score within wave to stabilize across waves
#     df["prop_yes_z"] = df.groupby(wave_col)["prop_yes"].transform(lambda s: (s - s.mean())/(s.std(ddof=0)+1e-8))
#     return df[[id_col, wave_col, "prop_yes_z"]]

# def bin_prop_to_5(df_prop: pd.DataFrame, wave_col="samptype"):
#     # quantile bins by wave
#     out = df_prop.copy()
#     labels = ["strong_anti","anti","neutral","pro","strong_pro"]
#     out["att5"] = np.nan
#     for w, g in out.groupby(wave_col):
#         q = g["prop_yes_z"].quantile([0.1,0.3,0.7,0.9]).tolist()
#         def f(x):
#             if pd.isna(x): return np.nan
#             if x <= q[0]: return "strong_anti"
#             if x <= q[1]: return "anti"
#             if x <= q[2]: return "neutral"
#             if x <= q[3]: return "pro"
#             return "strong_pro"
#         out.loc[g.index, "att5"] = g["prop_yes_z"].apply(f)
#     return out


In [9]:
gss_2016 = gss_2020[gss_2020['samptype'] == 2016]
gss_2018 = gss_2020[gss_2020['samptype'] == 2018]

gss_2016_bin = prepare_binary_2016(gss_2016)
gss_2018_bin = prepare_binary_2018(gss_2018)
gss_2020_bin = prepare_binary_2020(gss_2020)

gss_2016_bin = collapse_abortion_attitudes(gss_2016_bin, wave_col="samptype", binary_items=[col+"_bin" for col in BINARY_ITEMS_2016])
gss_2018_bin = collapse_abortion_attitudes(gss_2018_bin, wave_col="samptype", binary_items=[col+"_bin" for col in BINARY_ITEMS_2018])
gss_2020_bin = collapse_abortion_attitudes(gss_2020_bin, wave_col="samptype", binary_items=[col+"_bin" for col in BINARY_ITEMS_2020])

# gss_2016_bin = composite_score_binary(gss_2016_bin)
# gss_2018_bin = composite_score_binary(gss_2018_bin)
# gss_2020_bin = composite_score_binary(gss_2020_bin)

# gss_2016_bin = bin_prop_to_5(gss_2016_bin)
# gss_2018_bin = bin_prop_to_5(gss_2018_bin)
# gss_2020_bin = bin_prop_to_5(gss_2020_bin)

In [10]:
# Rename columns for clarity
gss_2016_bin = gss_2016_bin.rename(columns={"n_yes":"n_yes_2016", "att5":"att5_2016"})
gss_2018_bin = gss_2018_bin.rename(columns={"n_yes":"n_yes_2018", "att5":"att5_2018"})
gss_2020_bin = gss_2020_bin.rename(columns={"n_yes":"n_yes_2020", "att5":"att5_2020"})

print(gss_2016_bin.shape)
print(gss_2018_bin.shape)
print(gss_2020_bin.shape)

# Merge all years into one DataFrame
gss_panel_2016 = gss_2016_bin.merge(gss_2020_bin, on="yearid", how="left")
gss_panel_2018 = gss_2018_bin.merge(gss_2020_bin, on="yearid", how="left")

(2867, 4)
(2348, 4)
(5215, 4)


In [11]:
# Required columns (demographic) from gss_2020
required_columns_2016 = ["cohort_1a", "degree_1a", "race_1a", "gender1_1a"]
required_columns_2018 = ["cohort_1b", "degree_1b", "race_1b", "gender1_1b"]

# Merge demographic columns
gss_panel_2016 = gss_panel_2016.merge(gss_2020[required_columns_2016 + ["yearid"]], on="yearid", how="left")
gss_panel_2018 = gss_panel_2018.merge(gss_2020[required_columns_2018 + ["yearid"]], on="yearid", how="left")

In [12]:
gss_panel_2016.head()

Unnamed: 0,yearid,samptype_x,n_yes_2016,att5_2016,samptype_y,n_yes_2020,att5_2020,cohort_1a,degree_1a,race_1a,gender1_1a
0,20160001,2016,7.0,strong_pro,2016,7.0,strong_pro,1969.0,3.0,1.0,1.0
1,20160002,2016,0.0,strong_anti,2016,0.0,strong_anti,1955.0,1.0,1.0,1.0
2,20160003,2016,2.0,anti,2016,0.0,strong_anti,1944.0,3.0,1.0,1.0
3,20160004,2016,2.0,anti,2016,2.0,anti,1973.0,1.0,1.0,1.0
4,20160005,2016,7.0,strong_pro,2016,0.0,strong_anti,1961.0,4.0,1.0,1.0


In [13]:
# Create generation column based on birth year
def determine_generation(year):
    if pd.isna(year):
        return None
    year = int(year)
    if year <= 1945:
        return "Silent Generation"
    elif 1946 <= year <= 1964:
        return "Baby Boomer"
    elif 1965 <= year <= 1980:
        return "Generation X"
    elif 1981 <= year <= 1996:
        return "Millennial"
    elif 1997 <= year <= 2012:
        return "Generation Z"
    else:
        return "Generation Alpha"

gss_panel_2016['generation'] = gss_panel_2016['cohort_1a'].apply(determine_generation)
gss_panel_2018['generation'] = gss_panel_2018['cohort_1b'].apply(determine_generation)

# Create education level column
def categorize_education(edu):
    if pd.isna(edu):
        return None
    elif edu == 0:
        return "Less than High School"
    elif edu == 1 or edu == 2:
        return "High School to Associate Degree"
    elif edu == 3:
        return "Bachelor's Degree"
    elif edu == 4:
        return "Graduate Degree"
    else:
        return None

gss_panel_2016['edu_level'] = gss_panel_2016['degree_1a'].apply(categorize_education)
gss_panel_2018['edu_level'] = gss_panel_2018['degree_1b'].apply(categorize_education)

# Create gender column
def categorize_gender(gender):
    if pd.isna(gender):
        return None
    elif gender == 1:
        return "Male"
    elif gender == 2:
        return "Female"
    else:
        return "Other"

gss_panel_2016['gender'] = gss_panel_2016['gender1_1a'].apply(categorize_gender)
gss_panel_2018['gender'] = gss_panel_2018['gender1_1b'].apply(categorize_gender)

# Create race column
def categorize_race(race):
    if pd.isna(race):
        return None
    elif race == 1:
        return "White"
    elif race == 2:
        return "Black"
    elif race == 3:
        return "Other"
    else:
        return "Unknown"

gss_panel_2016['race'] = gss_panel_2016['race_1a'].apply(categorize_race)
gss_panel_2018['race'] = gss_panel_2018['race_1b'].apply(categorize_race)


In [14]:
# Filter out unnecessary columns
gss_panel_2016 = gss_panel_2016[["yearid", "att5_2016", "att5_2020", "generation", "edu_level", "gender", "race"]]
gss_panel_2018 = gss_panel_2018[["yearid", "att5_2018", "att5_2020", "generation", "edu_level", "gender", "race"]]

# Convert to long format for analysis
gss_panel_2016_long = pd.melt(gss_panel_2016, id_vars=["yearid", "generation", "edu_level", "gender", "race"], var_name="year", value_name="att5")
gss_panel_2018_long = pd.melt(gss_panel_2018, id_vars=["yearid", "generation", "edu_level", "gender", "race"], var_name="year", value_name="att5")

# Remove "att5_" from the year column
gss_panel_2016_long["year"] = gss_panel_2016_long["year"].str.replace("att5_", "")
gss_panel_2018_long["year"] = gss_panel_2018_long["year"].str.replace("att5_", "")


In [15]:
# Save to CSV
gss_panel_2016_long.to_csv("data/GSS/gss_panel_2016_long.csv", index=False)
gss_panel_2018_long.to_csv("data/GSS/gss_panel_2018_long.csv", index=False)