In [2]:
import pandas as pd
import numpy as np
import openpyxl

In [3]:
# Cross-sectional GSS data processing
# Load GSS overall data
gss = pd.read_excel("/Users/wooyongjung/WJ_Projects/LLM_POC_Study_2025_v2/data/GSS/GSS.xlsx")

# Create yearid by combining year and id
gss['yearid'] = gss['year'].astype(str) + "_" + gss['id_'].astype(str)

# Expect one row per respondent per wave, columns listed below:
BINARY_ITEMS = ["abdefect","abnomore","abany","abhlth","abpoor","abrape","absingle"]

# Map raw responses → 1/0 (edit as needed to match your GSS coding)
YES_VALUES = {"yes","Yes","YES",1, "1", True}
NO_VALUES  = {"no","No","NO",2, "2", False}

def to_binary(s):
    if pd.isna(s): return np.nan
    return 1 if s in YES_VALUES else (0 if s in NO_VALUES else np.nan)

def prepare_binary(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for col in BINARY_ITEMS:
        if col in out.columns:
            out[col+"_bin"] = out[col].apply(to_binary).astype("float")
    return out

# Following Rossi Scale

def collapse_abortion_attitudes(df: pd.DataFrame, id_col="yearid", binary_items=None):
    """
    Collapse 7 binary GSS abortion items (Rossi Scale) into a single 4-level categorical variable.
    
    - Input: df with columns abdefect ... absingle coded 0/1 (NaN allowed).
    - Output: df with new columns:
        'n_yes'  (count of yes across 7 items)
        'att4'   (collapsed attitude category)
    """
    out = df.copy()

    # Count number of "yes" responses (ignore NaN)
    out["n_yes"] = out[binary_items].sum(axis=1, skipna=True)

    # Map counts into categories
    def map_to_cat(n_yes):
        if pd.isna(n_yes):
            return np.nan
        n_yes = int(n_yes)
        if n_yes <= 1:
            return "strong_anti"
        elif n_yes <= 3:
            return "anti"
        elif n_yes <= 6:
            return "pro"
        else:  # n_yes == 7
            return "strong_pro"

    out["abortion_att4"] = out["n_yes"].apply(map_to_cat)

    return out[[id_col, "year", "n_yes", "abortion_att4"]]


# Construct binary items and collapse into attitude categories
gss_abt_bin = prepare_binary(gss)
gss_abt_bin = collapse_abortion_attitudes(gss_abt_bin, binary_items=[col+"_bin" for col in BINARY_ITEMS])

# Required columns (demographic) from gss_2020
required_columns = ["cohort", "degree", "race", "sex", "wtssps"]

# Merge demographic columns
gss_abt_cs = gss_abt_bin.merge(gss[required_columns + ["yearid"]], on="yearid", how="left")

# Create generation column based on birth year
def determine_generation(year):
    if pd.isna(year):
        return None
    elif year.startswith('.'):
        return None
    year = int(year)
    if year <= 1945:
        return "Silent Generation"
    elif 1946 <= year <= 1964:
        return "Baby Boomer"
    elif 1965 <= year <= 1980:
        return "Generation X"
    elif 1981 <= year <= 1996:
        return "Millennial"
    elif 1997 <= year <= 2012:
        return "Generation Z"
    else:
        return "Generation Alpha"

gss_abt_cs['generation'] = gss_abt_cs['cohort'].apply(determine_generation)


# Create education level column
def categorize_education(edu):
    if pd.isna(edu):
        return None
    # if edu in ["Less than high school", "High school"], return "Less or equal to high school"
    # if edu in ["Associate/junior college", "Bachelor's"], return "Associate or Bachelor's Degree"
    # if edu in ["Graduate"], return "Graduate Degree"
    elif edu in ["Less than high school", "High school"]:
        return "Less or equal to high school"
    elif edu in ["Associate/junior college", "Bachelor's"]:
        return "Associate or Bachelor's Degree"
    elif edu in ["Graduate"]:
        return "Graduate Degree"
    else:
        return None 

gss_abt_cs['edu_level'] = gss_abt_cs['degree'].apply(categorize_education)

# Create gender column
def categorize_gender(gender):
    if pd.isna(gender):
        return None
    elif gender == "MALE":
        return "Male"
    elif gender == "FEMALE":
        return "Female"
    else:
        return "Other"

gss_abt_cs['gender'] = gss_abt_cs['sex'].apply(categorize_gender)

# Create race column
def categorize_race(race):
    if pd.isna(race):
        return None
    elif race == "White":
        return "White"
    elif race == "Black":
        return "Black"
    elif race == "Other":
        return "Other"
    else:
        return None

gss_abt_cs['race'] = gss_abt_cs['race'].apply(categorize_race)

In [None]:
# Panel data processing
# Load panel data (three-wave rollings)
gss_2010 = pd.read_stata('data/GSS/GSS_panel2010w123_R6.dta', convert_categoricals=False)
gss_2008 = pd.read_stata('data/GSS/GSS_panel2008w123_r6 .dta', convert_categoricals=False)
gss_2006 = pd.read_stata('data/GSS/GSS_panel2006w123_r6a.dta', convert_categoricals=False)

# Create a "yearid" column by combining "2010" and index
gss_2010['yearid'] = '2010_' + gss_2010.index.astype(str)
gss_2008['yearid'] = '2008_' + gss_2008.index.astype(str)
gss_2006['yearid'] = '2006_' + gss_2006.index.astype(str)

# Select relevant columns for analysis
required_columns = ["cohort_1", "degree_1", "degree_2", "degree_3", "race_1", "sex_1"]
abortion_items = ["abdefect_1", "abnomore_1", "abany_1", "abhlth_1", "abpoor_1", "abrape_1", "absingle_1",
                 "abdefect_2", "abnomore_2", "abany_2", "abhlth_2", "abpoor_2", "abrape_2", "absingle_2",
                 "abdefect_3", "abnomore_3", "abany_3", "abhlth_3", "abpoor_3", "abrape_3", "absingle_3"]

gss_2010 = gss_2010[['yearid'] +required_columns + abortion_items]
gss_2008 = gss_2008[['yearid'] + required_columns + abortion_items]
gss_2006 = gss_2006[['yearid'] + required_columns + abortion_items]

# Rename columns for consistency
gss_2010.rename(columns={"cohort_1": "cohort", "race_1": "race", "sex_1": "gender"}, inplace=True)
gss_2008.rename(columns={"cohort_1": "cohort", "race_1": "race", "sex_1": "gender"}, inplace=True)
gss_2006.rename(columns={"cohort_1": "cohort", "race_1": "race", "sex_1": "gender"}, inplace=True)

  gss_2010['yearid'] = '2010_' + gss_2010.index.astype(str)
  gss_2008['yearid'] = '2008_' + gss_2008.index.astype(str)
  gss_2006['yearid'] = '2006_' + gss_2006.index.astype(str)


In [43]:
gss_2010

Unnamed: 0,yearid,cohort,degree_1,degree_2,degree_3,race,gender,abdefect_1,abnomore_1,abany_1,...,abpoor_2,abrape_2,absingle_2,abdefect_3,abnomore_3,abany_3,abhlth_3,abpoor_3,abrape_3,absingle_3
0,2010_0,1979.0,3,1.0,3.0,3,1,,,,...,,,,,,,,,,
1,2010_1,1987.0,3,3.0,4.0,1,2,,,,...,,,,,,,,,,
2,2010_2,1939.0,0,0.0,0.0,2,2,1.0,2.0,2.0,...,2.0,1.0,1.0,1.0,1.0,2.0,,,1.0,2.0
3,2010_3,1928.0,0,0.0,,1,2,,,,...,,,,,,,,,,
4,2010_4,1932.0,0,,,2,2,,2.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2039,2010_2039,1948.0,3,3.0,,1,1,2.0,2.0,2.0,...,2.0,1.0,2.0,,,,,,,
2040,2010_2040,1944.0,1,1.0,1.0,1,2,,,,...,,,,,,,,,,
2041,2010_2041,1956.0,1,2.0,1.0,1,2,,,,...,,,,,,,,,,
2042,2010_2042,1953.0,3,3.0,3.0,1,2,1.0,2.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [64]:
# Convert dfs to long format: _1 → 2010, _2 → 2012, _3 → 2014
gss_2010_long = pd.wide_to_long(gss_2010, stubnames=['abdefect', 'abnomore', 'abany', 'abhlth', 'abpoor', 'abrape', 'absingle','degree'], 
                                i='yearid', j='wave', suffix='_\\d+').reset_index()
gss_2008_long = pd.wide_to_long(gss_2008, stubnames=['abdefect', 'abnomore', 'abany', 'abhlth', 'abpoor', 'abrape', 'absingle','degree'], 
                                i='yearid', j='wave', suffix='_\\d+').reset_index()
gss_2006_long = pd.wide_to_long(gss_2006, stubnames=['abdefect', 'abnomore', 'abany', 'abhlth', 'abpoor', 'abrape', 'absingle','degree'], 
                                i='yearid', j='wave', suffix='_\\d+').reset_index()

# Create year column based on wave
def map_wave_to_year(wave, base_year):
    if wave == "_1":
        return base_year
    elif wave == "_2":
        return base_year + 2
    elif wave == "_3":
        return base_year + 4
    else:
        return None

gss_2010_long['year'] = gss_2010_long['wave'].apply(lambda w: map_wave_to_year(w, 2010))
gss_2008_long['year'] = gss_2008_long['wave'].apply(lambda w: map_wave_to_year(w, 2008))
gss_2006_long['year'] = gss_2006_long['wave'].apply(lambda w: map_wave_to_year(w, 2006))

In [65]:
gss_2010_long[gss_2010_long['yearid'] == '2010_2']

Unnamed: 0,yearid,wave,cohort,gender,race,abdefect,abnomore,abany,abhlth,abpoor,abrape,absingle,degree,year
2,2010_2,_1,1939.0,2,2,1.0,2.0,2.0,,2.0,1.0,2.0,0.0,2010
2046,2010_2,_2,1939.0,2,2,1.0,2.0,2.0,2.0,2.0,1.0,1.0,0.0,2012
4090,2010_2,_3,1939.0,2,2,1.0,1.0,2.0,,,1.0,2.0,0.0,2014


In [67]:
# Combine long dfs
gss_abt_panel = pd.concat([gss_2010_long, gss_2008_long, gss_2006_long], ignore_index=True)
gss_abt_panel.shape

(18201, 14)

In [68]:
# Load 2020 panel data
gss_2020 = pd.read_stata('data/GSS/GSS_panel2020_r1a.dta', convert_categoricals=False)
gss_2020.head()

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  gss_2020 = pd.read_stata('data/GSS/GSS_panel2020_r1a.dta', convert_categoricals=False)


Unnamed: 0,samptype,yearid,fileversion,panstat,wtssall_1a,wtssall_1b,wtssall_2,wtssnr_1a,wtssnr_1b,wtssnr_2,...,sprtlrgr_2,sprtpurp_2,poltrtblk_2,poltrthsp_2,defund_2,strvbias_2,wrycovid_2,wrypaybills_2,wrygetsick_2,anesid
0,2016,20160001,GSS 2020 Panel Release 1 (May 2021),1,0.956994,,1.085009,1.260478,,1.443929,...,7.0,7.0,2.0,4.0,2.0,2.0,3.0,4.0,3.0,169657.0
1,2016,20160002,GSS 2020 Panel Release 1 (May 2021),1,0.478497,,0.542504,0.630239,,0.721964,...,7.0,7.0,4.0,4.0,1.0,2.0,4.0,4.0,3.0,169664.0
2,2016,20160003,GSS 2020 Panel Release 1 (May 2021),0,0.956994,,,1.260478,,,...,,,,,,,,,,
3,2016,20160004,GSS 2020 Panel Release 1 (May 2021),1,1.913987,,2.170018,2.520956,,2.887858,...,5.0,4.0,4.0,4.0,2.0,1.0,3.0,4.0,4.0,
4,2016,20160005,GSS 2020 Panel Release 1 (May 2021),0,1.43549,,,1.890717,,,...,,,,,,,,,,


In [71]:
gss_2020[['degree_1a', 'degree_1b', 'degree_2']]

Unnamed: 0,degree_1a,degree_1b,degree_2
0,3.0,,3.0
1,1.0,,1.0
2,3.0,,
3,1.0,,1.0
4,4.0,,
...,...,...,...
5210,,1.0,
5211,,1.0,1.0
5212,,1.0,1.0
5213,,1.0,


In [79]:
# Expect one row per respondent per wave, columns listed below:
BINARY_ITEMS_2016 = ["abdefect_1a","abnomore_1a","abany_1a","abhlth_1a","abpoor_1a","abrape_1a","absingle_1a"]
BINARY_ITEMS_2018 = ["abdefect_1b","abnomore_1b","abany_1b","abhlth_1b","abpoor_1b","abrape_1b","absingle_1b"]
BINARY_ITEMS_2020 = ["abdefect_2","abnomore_2","abany_2","abhlth_2","abpoor_2","abrape_2","absingle_2"]

# Map raw responses → 1/0 (edit as needed to match your GSS coding)
YES_VALUES = {"yes","Yes","YES",1, "1", True}
NO_VALUES  = {"no","No","NO",2, "2", False}

def to_binary(s):
    if pd.isna(s): return np.nan
    return 1 if s in YES_VALUES else (0 if s in NO_VALUES else np.nan)

def prepare_binary_2016(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for col in BINARY_ITEMS_2016:
        if col in out.columns:
            out[col+"_bin"] = out[col].apply(to_binary).astype("float")
    return out

def prepare_binary_2018(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for col in BINARY_ITEMS_2018:
        if col in out.columns:
            out[col+"_bin"] = out[col].apply(to_binary).astype("float")
    return out

def prepare_binary_2020(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for col in BINARY_ITEMS_2020:
        if col in out.columns:
            out[col+"_bin"] = out[col].apply(to_binary).astype("float")
    return out

def collapse_abortion_attitudes(df: pd.DataFrame, id_col="yearid", wave_col="samptype", binary_items=None):
    """
    Collapse 7 binary GSS abortion items (Rossi Scale) into a single 4-level categorical variable.
    
    - Input: df with columns abdefect ... absingle coded 0/1 (NaN allowed).
    - Output: df with new columns:
        'n_yes'  (count of yes across 7 items)
        'att4'   (collapsed attitude category)
    """
    out = df.copy()

    # Count number of "yes" responses (ignore NaN)
    out["n_yes"] = out[binary_items].sum(axis=1, skipna=True)

    # Map counts into categories
    def map_to_cat(n_yes):
        if pd.isna(n_yes):
            return np.nan
        n_yes = int(n_yes)
        if n_yes <= 1:
            return "strong_anti"
        elif n_yes <= 3:
            return "anti"
        elif n_yes <= 6:
            return "pro"
        else:  # n_yes == 7
            return "strong_pro"

    out["abortion_att4"] = out["n_yes"].apply(map_to_cat)

    return out[[id_col, wave_col, "n_yes", "abortion_att4"]]


In [80]:
gss_2016 = gss_2020[gss_2020['samptype'] == 2016]
gss_2018 = gss_2020[gss_2020['samptype'] == 2018]

gss_2016_bin = prepare_binary_2016(gss_2016)
gss_2018_bin = prepare_binary_2018(gss_2018)
gss_2020_bin = prepare_binary_2020(gss_2020)

gss_2016_bin = collapse_abortion_attitudes(gss_2016_bin, binary_items=[col+"_bin" for col in BINARY_ITEMS_2016])
gss_2018_bin = collapse_abortion_attitudes(gss_2018_bin, binary_items=[col+"_bin" for col in BINARY_ITEMS_2018])
gss_2020_bin = collapse_abortion_attitudes(gss_2020_bin, binary_items=[col+"_bin" for col in BINARY_ITEMS_2020])

In [82]:
gss_2020_bin

Unnamed: 0,yearid,samptype,n_yes,abortion_att4
0,20160001,2016,7.0,strong_pro
1,20160002,2016,0.0,strong_anti
2,20160003,2016,0.0,strong_anti
3,20160004,2016,2.0,anti
4,20160005,2016,0.0,strong_anti
...,...,...,...,...
5210,20182344,2018,0.0,strong_anti
5211,20182345,2018,3.0,anti
5212,20182346,2018,3.0,anti
5213,20182347,2018,0.0,strong_anti


In [83]:
# Rename columns for clarity
gss_2016_bin = gss_2016_bin.rename(columns={"n_yes":"n_yes_2016", "abortion_att4":"att5_2016"})
gss_2018_bin = gss_2018_bin.rename(columns={"n_yes":"n_yes_2018", "abortion_att4":"att5_2018"})
gss_2020_bin = gss_2020_bin.rename(columns={"n_yes":"n_yes_2020", "abortion_att4":"att5_2020"})

print(gss_2016_bin.shape)
print(gss_2018_bin.shape)
print(gss_2020_bin.shape)

# Merge all years into one DataFrame
gss_panel_2016 = gss_2016_bin.merge(gss_2020_bin, on="yearid", how="left")
gss_panel_2018 = gss_2018_bin.merge(gss_2020_bin, on="yearid", how="left")

(2867, 4)
(2348, 4)
(5215, 4)


In [None]:
# Create generation column based on birth year
def determine_generation(year):
    if pd.isna(year):
        return None
    year = int(year)
    if year <= 1945:
        return "Silent Generation"
    elif 1946 <= year <= 1964:
        return "Baby Boomer"
    elif 1965 <= year <= 1980:
        return "Generation X"
    elif 1981 <= year <= 1996:
        return "Millennial"
    elif 1997 <= year <= 2012:
        return "Generation Z"
    else:
        return "Generation Alpha"

gss_2010['generation'] = gss_2010['cohort_1'].apply(determine_generation)
gss_2008['generation'] = gss_2008['cohort_1'].apply(determine_generation)
gss_2006['generation'] = gss_2006['cohort_1'].apply(determine_generation)


# Create education level column
def categorize_education(edu):
    if pd.isna(edu):
        return None
    elif edu == 0:
        return "Less than High School"
    elif edu == 1 or edu == 2:
        return "High School to Associate Degree"
    elif edu == 3:
        return "Bachelor's Degree"
    elif edu == 4:
        return "Graduate Degree"
    else:
        return None

gss_2010['edu_level_1'] = gss_2010['degree_1'].apply(categorize_education)
gss_panel_2018['edu_level'] = gss_panel_2018['degree_1b'].apply(categorize_education)

# Create gender column
def categorize_gender(gender):
    if pd.isna(gender):
        return None
    elif gender == 1:
        return "Male"
    elif gender == 2:
        return "Female"
    else:
        return "Other"

gss_panel_2016['gender'] = gss_panel_2016['gender1_1a'].apply(categorize_gender)
gss_panel_2018['gender'] = gss_panel_2018['gender1_1b'].apply(categorize_gender)

# Create race column
def categorize_race(race):
    if pd.isna(race):
        return None
    elif race == 1:
        return "White"
    elif race == 2:
        return "Black"
    elif race == 3:
        return "Other"
    else:
        return "Unknown"

gss_panel_2016['race'] = gss_panel_2016['race_1a'].apply(categorize_race)
gss_panel_2018['race'] = gss_panel_2018['race_1b'].apply(categorize_race)


In [None]:
# Rename columns for consistency
gss_2014.rename(columns={
    "cohort_1": "cohort",
    "race_1": "race",
    "sex_1": "sex"}, inplace=True)

In [32]:
gss_2014_long

Unnamed: 0,yearid,wave,cohort,race,sex,degree,abdefect,abnomore,abany,abhlth,abpoor,abrape,absingle,year
0,2010_0,1,1979.0,3,1,3.0,,,,,,,,2006
1,2010_1,1,1987.0,1,2,3.0,,,,,,,,2006
2,2010_2,1,1939.0,2,2,0.0,1.0,2.0,2.0,,2.0,1.0,2.0,2006
3,2010_3,1,1928.0,1,2,0.0,,,,,,,,2006
4,2010_4,1,1932.0,2,2,0.0,,2.0,,2.0,2.0,2.0,2.0,2006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18196,2006_1995,3,1959.0,1,2,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2010
18197,2006_1996,3,1945.0,3,2,,,,,,,,,2010
18198,2006_1997,3,1983.0,1,2,,,,,,,,,2010
18199,2006_1998,3,1974.0,3,2,0.0,,,,,,,,2010


In [26]:
gss_2014['abany_1'].value_counts()

abany_1
2.0    2237
1.0    1615
Name: count, dtype: int64

In [74]:
# Map raw responses → 1/0 (edit as needed to match your GSS coding)
BINARY_ITEMS = ["abdefect_1", "abnomore_1", "abany_1", "abhlth_1", "abpoor_1", "abrape_1", "absingle_1",
                 "abdefect_2", "abnomore_2", "abany_2", "abhlth_2", "abpoor_2", "abrape_2", "absingle_2",
                 "abdefect_3", "abnomore_3", "abany_3", "abhlth_3", "abpoor_3", "abrape_3", "absingle_3"]

# Construct binary items and collapse into attitude categories
gss_2014_bin = prepare_binary(gss_2014)

def collapse_abortion_attitudes(df: pd.DataFrame, id_col="yearid", binary_items=None):
    """
    Collapse 7 binary GSS abortion items (Rossi Scale) into a single 4-level categorical variable.
    
    - Input: df with columns abdefect ... absingle coded 0/1 (NaN allowed).
    - Output: df with new columns:
        'n_yes'  (count of yes across 7 items)
        'att4'   (collapsed attitude category)
    """
    out = df.copy()

    # Count number of "yes" responses (ignore NaN)
    out["n_yes"] = out[binary_items].sum(axis=1, skipna=True)

    # Map counts into categories
    def map_to_cat(n_yes):
        if pd.isna(n_yes):
            return np.nan
        n_yes = int(n_yes)
        if n_yes <= 1:
            return "strong_anti"
        elif n_yes <= 3:
            return "anti"
        elif n_yes <= 6:
            return "pro"
        else:  # n_yes == 7
            return "strong_pro"

    out["abortion_att4"] = out["n_yes"].apply(map_to_cat)

    return out[[id_col, wave_col, "n_yes", "abortion_att4"]]

In [None]:
gss_2016 = gss_2020[gss_2020['samptype'] == 2016]
gss_2018 = gss_2020[gss_2020['samptype'] == 2018]

gss_2016_bin = prepare_binary_2016(gss_2016)
gss_2018_bin = prepare_binary_2018(gss_2018)
gss_2020_bin = prepare_binary_2020(gss_2020)

gss_2016_bin = collapse_abortion_attitudes(gss_2016_bin, wave_col="samptype", binary_items=[col+"_bin" for col in BINARY_ITEMS_2016])
gss_2018_bin = collapse_abortion_attitudes(gss_2018_bin, wave_col="samptype", binary_items=[col+"_bin" for col in BINARY_ITEMS_2018])
gss_2020_bin = collapse_abortion_attitudes(gss_2020_bin, wave_col="samptype", binary_items=[col+"_bin" for col in BINARY_ITEMS_2020])

# gss_2016_bin = composite_score_binary(gss_2016_bin)
# gss_2018_bin = composite_score_binary(gss_2018_bin)
# gss_2020_bin = composite_score_binary(gss_2020_bin)

# gss_2016_bin = bin_prop_to_5(gss_2016_bin)
# gss_2018_bin = bin_prop_to_5(gss_2018_bin)
# gss_2020_bin = bin_prop_to_5(gss_2020_bin)