In [2]:
import os

import numpy as np
import openpyxl
import pandas as pd

In [3]:
# Cross-sectional GSS data processing
# Path to the cumulative GSS workbook. Set the GSS_XLSX_PATH environment variable to run
# the notebook on another machine; the default is the original author's local copy.
GSS_XLSX_PATH = os.environ.get(
    "GSS_XLSX_PATH",
    "/Users/wooyongjung/WJ_Projects/LLM_POC_Study_2025_v2/data/GSS/GSS.xlsx",
)

# Load GSS overall data
gss = pd.read_excel(GSS_XLSX_PATH)

# Create yearid by combining year and id
gss['yearid'] = gss['year'].astype(str) + "_" + gss['id_'].astype(str)

# Expect one row per respondent per wave, columns listed below:
# These are the seven abortion items that get recoded to "<item>_bin" columns and are
# then summed into `n_yes` by collapse_abortion_attitudes.
BINARY_ITEMS = ["abdefect","abnomore","abany","abhlth","abpoor","abrape","absingle"]

# Map raw responses → 1/0 (edit as needed to match your GSS coding)
# Membership is tested with `in`, so values that compare equal collapse together:
# True == 1 == 1.0 and False == 0. Because False is listed in NO_VALUES, a numeric 0
# is therefore also recoded as "no".
# NOTE(review): confirm 0 is never used as a missing/inapplicable code in the raw data.
YES_VALUES = {"yes","Yes","YES",1, "1", True}
NO_VALUES  = {"no","No","NO",2, "2", False}

def to_binary(s):
    """Recode one raw response: 1 if it is in YES_VALUES, 0 if in NO_VALUES, else NaN.

    Missing inputs (anything `pd.isna` reports as missing) also return NaN.
    """
    if pd.isna(s):
        return np.nan
    if s in YES_VALUES:
        return 1
    if s in NO_VALUES:
        return 0
    # Anything that is neither a recognised yes nor a recognised no is treated as missing.
    return np.nan

def prepare_binary(df: pd.DataFrame, items=None) -> pd.DataFrame:
    """Return a copy of `df` with a float "<item>_bin" column for each listed item.

    Parameters
    ----------
    df : frame holding the raw item columns.
    items : iterable of raw column names to recode. Defaults to BINARY_ITEMS, so
        existing calls `prepare_binary(df)` behave exactly as before. Passing another
        list (for example wave-suffixed names) reuses the same recoding without a
        copy of this function.

    Returns
    -------
    A new DataFrame; `df` itself is not modified. Items that are not columns of `df`
    are skipped silently, so no "<item>_bin" column is created for them.
    """
    if items is None:
        items = BINARY_ITEMS
    out = df.copy()
    for col in items:
        if col in out.columns:
            # to_binary yields 1 / 0 / NaN; cast to float so NaN is representable.
            out[col+"_bin"] = out[col].apply(to_binary).astype("float")
    return out

# Following Rossi Scale

def collapse_abortion_attitudes(df: pd.DataFrame, id_col="yearid", binary_items=None,
                                wave_col="year", min_answered=1):
    """
    Collapse the binary GSS abortion items (Rossi Scale) into a single 4-level category.

    Parameters
    ----------
    df : DataFrame with the recoded item columns (1 = yes, 0 = no, NaN = missing)
        plus the `id_col` and `wave_col` columns.
    id_col : name of the respondent identifier column copied into the result.
    binary_items : list of recoded item columns to count. When None, the seven
        Rossi items with the "_bin" suffix (abdefect_bin ... absingle_bin) are used.
    wave_col : name of the survey-year / wave column copied into the result
        (defaults to "year", the column the function always returned before).
    min_answered : minimum number of non-missing items a row needs in order to be
        scored. With the default of 1, a row whose items are all missing gets
        NaN for both outputs instead of being counted as zero "yes" answers.

    Returns
    -------
    DataFrame with columns [id_col, wave_col, "n_yes", "abortion_att4"]:
        'n_yes'          count of yes answers across the items (NaN if not scored)
        'abortion_att4'  "strong_anti" (0-1 yes), "anti" (2-3), "pro" (4-6),
                         "strong_pro" (more than 6), NaN if not scored
    """
    if binary_items is None:
        binary_items = [
            "abdefect_bin", "abnomore_bin", "abany_bin", "abhlth_bin",
            "abpoor_bin", "abrape_bin", "absingle_bin",
        ]

    out = df.copy()

    # Count "yes" responses, ignoring NaN. min_count makes the sum NaN when fewer than
    # `min_answered` items are present; a plain skipna sum would return 0 for a row
    # with no answers at all and mislabel that respondent as "strong_anti".
    out["n_yes"] = out[list(binary_items)].sum(axis=1, skipna=True, min_count=min_answered)

    # Map counts into categories
    def map_to_cat(n_yes):
        if pd.isna(n_yes):
            return np.nan
        n_yes = int(n_yes)
        if n_yes <= 1:
            return "strong_anti"
        elif n_yes <= 3:
            return "anti"
        elif n_yes <= 6:
            return "pro"
        else:  # n_yes == 7
            return "strong_pro"

    out["abortion_att4"] = out["n_yes"].apply(map_to_cat)

    return out[[id_col, wave_col, "n_yes", "abortion_att4"]]


# Construct binary items and collapse into attitude categories
# After the second call gss_abt_bin holds only the id, year, n_yes and abortion_att4 columns.
gss_abt_bin = prepare_binary(gss)
gss_abt_bin = collapse_abortion_attitudes(gss_abt_bin, binary_items=[col+"_bin" for col in BINARY_ITEMS])

# Columns to bring back from the cross-sectional `gss` frame (not from gss_2020)
required_columns = ["cohort", "degree", "race", "sex", "wtssps"]

# Merge demographic columns
# Left join on yearid keeps every row of gss_abt_bin.
# NOTE(review): assumes yearid is unique in `gss`; duplicate keys would multiply rows — confirm.
gss_abt_cs = gss_abt_bin.merge(gss[required_columns + ["yearid"]], on="yearid", how="left")

# Create generation column based on birth year
def determine_generation(year):
    """Map a birth year to a generation label.

    Parameters
    ----------
    year : birth year as an int, float, or string (e.g. 1950, 1950.0, "1950").
        Missing values and strings that start with "." (dot-prefixed missing codes)
        yield None.

    Returns
    -------
    One of "Silent Generation" (<= 1945), "Baby Boomer" (1946-1964),
    "Generation X" (1965-1980), "Millennial" (1981-1996), "Generation Z" (1997-2012),
    "Generation Alpha" (> 2012), or None when the year is missing.

    Raises
    ------
    ValueError if a non-missing string cannot be parsed as a number.
    """
    if pd.isna(year):
        return None
    if isinstance(year, str):
        text = year.strip()
        if text.startswith('.'):
            return None
        # Go through float so that "1950.0" parses as well as "1950".
        year = float(text)
    # Numeric inputs skip the string checks entirely; calling .startswith on an
    # int/float would raise AttributeError.
    birth_year = int(year)

    if birth_year <= 1945:
        return "Silent Generation"
    if birth_year <= 1964:
        return "Baby Boomer"
    if birth_year <= 1980:
        return "Generation X"
    if birth_year <= 1996:
        return "Millennial"
    if birth_year <= 2012:
        return "Generation Z"
    return "Generation Alpha"

# Derive the generation label row by row from the `cohort` column
gss_abt_cs['generation'] = gss_abt_cs['cohort'].apply(determine_generation)


# Create education level column
def categorize_education(edu):
    """Collapse a `degree` label into one of three education levels.

    "Less than high school" / "High school"    -> "Less or equal to high school"
    "Associate/junior college" / "Bachelor's"  -> "Associate or Bachelor's Degree"
    "Graduate"                                 -> "Graduate Degree"

    Missing values and any label not listed above return None.
    """
    if pd.isna(edu):
        return None
    level_by_degree = {
        "Less than high school": "Less or equal to high school",
        "High school": "Less or equal to high school",
        "Associate/junior college": "Associate or Bachelor's Degree",
        "Bachelor's": "Associate or Bachelor's Degree",
        "Graduate": "Graduate Degree",
    }
    return level_by_degree.get(edu)

# Collapse the `degree` labels into the three-level `edu_level` column
gss_abt_cs['edu_level'] = gss_abt_cs['degree'].apply(categorize_education)

# Create gender column
def categorize_gender(gender):
    """Normalise a `sex` label to "Male", "Female" or "Other".

    The comparison ignores letter case and surrounding whitespace, so "MALE",
    "Male" and " male " all map to "Male" (previously only the exact upper-case
    spelling matched and every other spelling fell through to "Other").

    Returns None for missing values and "Other" for any non-missing value that is
    neither male nor female.
    """
    if pd.isna(gender):
        return None
    # str() keeps non-string inputs (e.g. numeric codes) on the "Other" path
    # instead of raising on .strip().
    label = str(gender).strip().upper()
    if label == "MALE":
        return "Male"
    if label == "FEMALE":
        return "Female"
    return "Other"

# Normalise the `sex` labels into the `gender` column
gss_abt_cs['gender'] = gss_abt_cs['sex'].apply(categorize_gender)

# Create race column
def categorize_race(race):
    """Keep the race labels "White", "Black" and "Other"; map everything else to None.

    Missing values also return None.
    """
    if pd.isna(race):
        return None
    kept_labels = ("White", "Black", "Other")
    return race if race in kept_labels else None

# Overwrites the merged `race` column in place. Re-running this line is harmless because
# the three kept labels map to themselves and None stays None.
gss_abt_cs['race'] = gss_abt_cs['race'].apply(categorize_race)

In [4]:
# Panel GSS data processing
# Load the GSS 2020 panel from its Stata .dta file (nothing is written to CSV here).
# convert_categoricals=False keeps the stored codes instead of converting to value labels.
# NOTE(review): this path starts with "Data/" while the other panel loads use "data/";
# the two spellings name the same directory only on a case-insensitive filesystem — confirm.
gss_2020 = pd.read_stata('Data/GSS/GSS_panel2020_r1a.dta', convert_categoricals=False)

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  gss_2020 = pd.read_stata('Data/GSS/GSS_panel2020_r1a.dta', convert_categoricals=False)


In [None]:
# Load panel data
# Three panel files (names contain 2010, 2008 and 2006), read with stored codes rather
# than value labels (convert_categoricals=False).
# NOTE(review): the 2008 filename has a space before ".dta" ('..._r6 .dta') — confirm it
# matches the file on disk.
# NOTE(review): this cell has no execution count, yet later cells use gss_2010; make sure
# it runs before them on a fresh kernel.
gss_2010 = pd.read_stata('data/GSS/GSS_panel2010w123_R6.dta', convert_categoricals=False)
gss_2008 = pd.read_stata('data/GSS/GSS_panel2008w123_r6 .dta', convert_categoricals=False)
gss_2006 = pd.read_stata('data/GSS/GSS_panel2006w123_r6a.dta', convert_categoricals=False)

In [7]:
# Preview the first rows of the 2010 panel frame
gss_2010.head()

Unnamed: 0,oversamp,sampcode,sample,form,formwt,vpsu,vstrat,samptype,wtpan12,WTPAN123,...,spsei10inc_3,pasei10inc_1,pasei10inc_2,pasei10inc_3,masei10inc_1,masei10inc_2,masei10inc_3,letin1a_1,letin1a_2,letin1a_3
0,1,501,9,2,1,1,2240,2010,0.865083,0.869498,...,,4.1,4.1,4.1,21.6,56.9,5.5,3.0,4.0,1.0
1,1,501,9,1,1,1,2240,2010,0.475796,0.478224,...,,66.2,91.9,90.5,82.0,,60.8,2.0,2.0,2.0
2,1,501,9,1,1,2,2240,2010,0.475796,0.478224,...,,,,,,,4.5,4.0,4.0,4.0
3,1,501,9,2,1,2,2240,2010,0.475796,,...,,7.7,5.5,,,,,3.0,3.0,
4,1,501,9,1,1,1,2241,2010,,,...,,40.3,,,,,,,,


In [9]:
# List the gss_2010 columns whose name contains "year"
for col in gss_2010.columns:
    if "year" in col:
        print(col)

vetyears_1
vetyears_2
vetyears_3
year_1
year_2
year_3
yearsjob_1
yearsjob_2
yearsjob_3
yearval_1
yearval_2
yearval_3


In [16]:
# Create a "yearid" column for each panel by combining its year ("2010", "2008", "2006")
# with the row index.
# NOTE(review): this uses the DataFrame index, whereas the cross-sectional `yearid` is
# built from the `year` and `id_` columns — confirm the index is the intended identifier.
gss_2010['yearid'] = '2010_' + gss_2010.index.astype(str)
gss_2008['yearid'] = '2008_' + gss_2008.index.astype(str)
gss_2006['yearid'] = '2006_' + gss_2006.index.astype(str)

  gss_2010['yearid'] = '2010_' + gss_2010.index.astype(str)
  gss_2008['yearid'] = '2008_' + gss_2008.index.astype(str)
  gss_2006['yearid'] = '2006_' + gss_2006.index.astype(str)


In [18]:
# Inspect the three sex_* columns of the 2010 panel side by side
gss_2010[['sex_1', 'sex_2', 'sex_3']]

Unnamed: 0,sex_1,sex_2,sex_3
0,1,1.0,1.0
1,2,2.0,2.0
2,2,2.0,2.0
3,2,2.0,
4,2,,
...,...,...,...
2039,1,1.0,
2040,2,2.0,2.0
2041,2,2.0,2.0
2042,2,2.0,2.0


In [None]:
# Select relevant columns for analysis
# NOTE(review): this rebinds `required_columns`, which the cross-sectional cell also
# defines with different contents. "gender1_1a" does not follow the _1/_2/_3 suffix of
# the other names — confirm it exists in the panel frame this list is meant for.
required_columns = ["cohort_1", "degree_1", "degree_2", "degree_3", "race_1", "gender1_1a"]


# Inspect the three degree_* columns of the 2010 panel side by side
gss_2010[['degree_1', 'degree_2', 'degree_3']]

Unnamed: 0,degree_1,degree_2,degree_3
0,3,1.0,3.0
1,3,3.0,4.0
2,0,0.0,0.0
3,0,0.0,
4,0,,
...,...,...,...
2039,3,3.0,
2040,1,1.0,1.0
2041,1,2.0,1.0
2042,3,3.0,3.0


In [5]:
# Preview the first rows of the 2020 panel frame
gss_2020.head()

Unnamed: 0,samptype,yearid,fileversion,panstat,wtssall_1a,wtssall_1b,wtssall_2,wtssnr_1a,wtssnr_1b,wtssnr_2,...,sprtlrgr_2,sprtpurp_2,poltrtblk_2,poltrthsp_2,defund_2,strvbias_2,wrycovid_2,wrypaybills_2,wrygetsick_2,anesid
0,2016,20160001,GSS 2020 Panel Release 1 (May 2021),1,0.956994,,1.085009,1.260478,,1.443929,...,7.0,7.0,2.0,4.0,2.0,2.0,3.0,4.0,3.0,169657.0
1,2016,20160002,GSS 2020 Panel Release 1 (May 2021),1,0.478497,,0.542504,0.630239,,0.721964,...,7.0,7.0,4.0,4.0,1.0,2.0,4.0,4.0,3.0,169664.0
2,2016,20160003,GSS 2020 Panel Release 1 (May 2021),0,0.956994,,,1.260478,,,...,,,,,,,,,,
3,2016,20160004,GSS 2020 Panel Release 1 (May 2021),1,1.913987,,2.170018,2.520956,,2.887858,...,5.0,4.0,4.0,4.0,2.0,1.0,3.0,4.0,4.0,
4,2016,20160005,GSS 2020 Panel Release 1 (May 2021),0,1.43549,,,1.890717,,,...,,,,,,,,,,


In [None]:
# Expect one row per respondent per wave, columns listed below:
# Same seven abortion items, with the panel suffixes _1a (list named 2016),
# _1b (list named 2018) and _2 (list named 2020).
BINARY_ITEMS_2016 = ["abdefect_1a","abnomore_1a","abany_1a","abhlth_1a","abpoor_1a","abrape_1a","absingle_1a"]
BINARY_ITEMS_2018 = ["abdefect_1b","abnomore_1b","abany_1b","abhlth_1b","abpoor_1b","abrape_1b","absingle_1b"]
BINARY_ITEMS_2020 = ["abdefect_2","abnomore_2","abany_2","abhlth_2","abpoor_2","abrape_2","absingle_2"]

# Map raw responses → 1/0 (edit as needed to match your GSS coding)
# NOTE: these two sets repeat the definitions from the cross-sectional cell verbatim.
# As there, False == 0, so a numeric 0 is recoded as "no".
YES_VALUES = {"yes","Yes","YES",1, "1", True}
NO_VALUES  = {"no","No","NO",2, "2", False}

def to_binary(s):
    """Recode one raw response: 1 if it is in YES_VALUES, 0 if in NO_VALUES, else NaN.

    This redefines the identical `to_binary` from the cross-sectional cell.
    """
    if pd.isna(s):
        return np.nan
    if s in YES_VALUES:
        return 1
    if s in NO_VALUES:
        return 0
    # Unrecognised responses are treated as missing.
    return np.nan

def _recode_items(df: pd.DataFrame, items) -> pd.DataFrame:
    """Return a copy of `df` with a float "<item>_bin" column for each listed item present."""
    out = df.copy()
    for col in items:
        if col in out.columns:
            # to_binary yields 1 / 0 / NaN; cast to float so NaN is representable.
            out[col+"_bin"] = out[col].apply(to_binary).astype("float")
    return out


def prepare_binary_2016(df: pd.DataFrame) -> pd.DataFrame:
    """Recode the BINARY_ITEMS_2016 columns (suffix _1a) into 0/1 "_bin" columns on a copy."""
    return _recode_items(df, BINARY_ITEMS_2016)


def prepare_binary_2018(df: pd.DataFrame) -> pd.DataFrame:
    """Recode the BINARY_ITEMS_2018 columns (suffix _1b) into 0/1 "_bin" columns on a copy."""
    return _recode_items(df, BINARY_ITEMS_2018)


def prepare_binary_2020(df: pd.DataFrame) -> pd.DataFrame:
    """Recode the BINARY_ITEMS_2020 columns (suffix _2) into 0/1 "_bin" columns on a copy."""
    return _recode_items(df, BINARY_ITEMS_2020)

# Split data by wave
# The filter is on `samptype`, so each frame holds the rows whose samptype is 2016 / 2018.
# NOTE(review): samptype presumably identifies the sample a respondent was drawn from
# rather than an interview wave — confirm this is the intended split.
gss_2016 = gss_2020[gss_2020['samptype'] == 2016]
gss_2018 = gss_2020[gss_2020['samptype'] == 2018]

# Construct binary items and collapse into attitude categories
# The 2020 items are recoded on the full, unfiltered panel frame.
gss_2016_bin = prepare_binary_2016(gss_2016)
gss_2018_bin = prepare_binary_2018(gss_2018)
gss_2020_bin = prepare_binary_2020(gss_2020)

# These calls pass `wave_col`, so collapse_abortion_attitudes must accept that keyword and
# return the named column in place of "year".
# NOTE(review): verify the definition in the cross-sectional cell has a `wave_col`
# parameter; without it these calls raise TypeError.
gss_2016_bin = collapse_abortion_attitudes(gss_2016_bin, wave_col="samptype", binary_items=[col+"_bin" for col in BINARY_ITEMS_2016])
gss_2018_bin = collapse_abortion_attitudes(gss_2018_bin, wave_col="samptype", binary_items=[col+"_bin" for col in BINARY_ITEMS_2018])
gss_2020_bin = collapse_abortion_attitudes(gss_2020_bin, wave_col="samptype", binary_items=[col+"_bin" for col in BINARY_ITEMS_2020])