In [2]:
import pandas as pd
import numpy as np
import openpyxl

In [3]:
# Cross-sectional GSS data processing
# Load GSS overall data
gss = pd.read_excel("/Users/wooyongjung/WJ_Projects/LLM_POC_Study_2025_v2/data/GSS/GSS.xlsx")

# Create yearid by combining year and id
gss['yearid'] = gss['year'].astype(str) + "_" + gss['id_'].astype(str)

# Expect one row per respondent per wave, columns listed below:
BINARY_ITEMS = ["abdefect","abnomore","abany","abhlth","abpoor","abrape","absingle"]

# Map raw responses → 1/0 (edit as needed to match your GSS coding)
YES_VALUES = {"yes","Yes","YES",1, "1", True}
NO_VALUES  = {"no","No","NO",2, "2", False}

def to_binary(s):
    if pd.isna(s): return np.nan
    return 1 if s in YES_VALUES else (0 if s in NO_VALUES else np.nan)

def prepare_binary(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for col in BINARY_ITEMS:
        if col in out.columns:
            out[col+"_bin"] = out[col].apply(to_binary).astype("float")
    return out

# Following Rossi Scale

def collapse_abortion_attitudes(df: pd.DataFrame, id_col="yearid", binary_items=None):
    """
    Collapse 7 binary GSS abortion items (Rossi Scale) into a single 4-level categorical variable.
    
    - Input: df with columns abdefect ... absingle coded 0/1 (NaN allowed).
    - Output: df with new columns:
        'n_yes'  (count of yes across 7 items)
        'att4'   (collapsed attitude category)
    """
    out = df.copy()

    # Count number of "yes" responses (ignore NaN)
    out["n_yes"] = out[binary_items].sum(axis=1, skipna=True)

    # Count number of non-missing responses
    out["n_nonmiss"] = out[binary_items].notna().sum(axis=1)

    # Map counts into categories
    def map_to_cat(n_yes):
        if pd.isna(n_yes):
            return np.nan
        n_yes = int(n_yes)
        if n_yes <= 1:
            return "strong_anti"
        elif n_yes <= 3:
            return "anti"
        elif n_yes <= 6:
            return "pro"
        else:  # n_yes == 7
            return "strong_pro"

    out["abortion_att4"] = out["n_yes"].apply(map_to_cat)

    return out[[id_col, "year", "n_yes", "n_nonmiss", "abortion_att4"]]


# Construct binary items and collapse into attitude categories
gss_abt_bin = prepare_binary(gss)
gss_abt_bin = collapse_abortion_attitudes(gss_abt_bin, binary_items=[col+"_bin" for col in BINARY_ITEMS])

# Required columns (demographic) from gss_2020
required_columns = ["cohort", "degree", "race", "sex", "wtssps"]

# Merge demographic columns
gss_abt_cs = gss_abt_bin.merge(gss[required_columns + ["yearid"]], on="yearid", how="left")

# Create generation column based on birth year
def determine_generation(year):
    if pd.isna(year):
        return None
    elif year.startswith('.'):
        return None
    year = int(year)
    if year <= 1945:
        return "Silent Generation"
    elif 1946 <= year <= 1964:
        return "Baby Boomer"
    elif 1965 <= year <= 1980:
        return "Generation X"
    elif 1981 <= year <= 1996:
        return "Millennial"
    elif 1997 <= year <= 2012:
        return "Generation Z"
    else:
        return "Generation Alpha"

gss_abt_cs['generation'] = gss_abt_cs['cohort'].apply(determine_generation)


# Create education level column
def categorize_education(edu):
    if pd.isna(edu):
        return None
    elif edu in ["Less than high school", "High school"]:
        return "Less or equal to high school"
    elif edu in ["Associate/junior college", "Bachelor's"]:
        return "Associate or Bachelor's Degree"
    elif edu in ["Graduate"]:
        return "Graduate Degree"
    else:
        return None 

gss_abt_cs['edu_level'] = gss_abt_cs['degree'].apply(categorize_education)

# Create gender column
def categorize_gender(gender):
    if pd.isna(gender):
        return None
    elif gender == "MALE":
        return "Male"
    elif gender == "FEMALE":
        return "Female"
    else:
        return "Other"

gss_abt_cs['gender'] = gss_abt_cs['sex'].apply(categorize_gender)

# Create race column
def categorize_race(race):
    if pd.isna(race):
        return None
    elif race == "White":
        return "White"
    elif race == "Black":
        return "Black"
    elif race == "Other":
        return "Other"
    else:
        return None

gss_abt_cs['race'] = gss_abt_cs['race'].apply(categorize_race)

In [4]:
# Panel data processing
# Load panel data (three-wave rollings)
gss_2010 = pd.read_stata('data/GSS/GSS_panel2010w123_R6.dta', convert_categoricals=False)
gss_2008 = pd.read_stata('data/GSS/GSS_panel2008w123_r6 .dta', convert_categoricals=False)
gss_2006 = pd.read_stata('data/GSS/GSS_panel2006w123_r6a.dta', convert_categoricals=False)

# Create a "yearid" column by combining "2010" and index
gss_2010['yearid'] = '2010_' + gss_2010.index.astype(str)
gss_2008['yearid'] = '2008_' + gss_2008.index.astype(str)
gss_2006['yearid'] = '2006_' + gss_2006.index.astype(str)

# Select relevant columns for analysis
required_columns = ["cohort_1", "degree_1", "degree_2", "degree_3", "race_1", "sex_1"]
abortion_items = ["abdefect_1", "abnomore_1", "abany_1", "abhlth_1", "abpoor_1", "abrape_1", "absingle_1",
                 "abdefect_2", "abnomore_2", "abany_2", "abhlth_2", "abpoor_2", "abrape_2", "absingle_2",
                 "abdefect_3", "abnomore_3", "abany_3", "abhlth_3", "abpoor_3", "abrape_3", "absingle_3"]

gss_2010 = gss_2010[['yearid'] +required_columns + abortion_items]
gss_2008 = gss_2008[['yearid'] + required_columns + abortion_items]
gss_2006 = gss_2006[['yearid'] + required_columns + abortion_items]

# Rename columns for consistency
gss_2010.rename(columns={"cohort_1": "cohort", "race_1": "race", "sex_1": "gender"}, inplace=True)
gss_2008.rename(columns={"cohort_1": "cohort", "race_1": "race", "sex_1": "gender"}, inplace=True)
gss_2006.rename(columns={"cohort_1": "cohort", "race_1": "race", "sex_1": "gender"}, inplace=True)

  gss_2010['yearid'] = '2010_' + gss_2010.index.astype(str)
  gss_2008['yearid'] = '2008_' + gss_2008.index.astype(str)
  gss_2006['yearid'] = '2006_' + gss_2006.index.astype(str)


In [5]:
# Convert dfs to long format: _1 → 2010, _2 → 2012, _3 → 2014
gss_2010_long = pd.wide_to_long(gss_2010, stubnames=['abdefect', 'abnomore', 'abany', 'abhlth', 'abpoor', 'abrape', 'absingle','degree'], 
                                i='yearid', j='wave', suffix='_\\d+').reset_index()
gss_2008_long = pd.wide_to_long(gss_2008, stubnames=['abdefect', 'abnomore', 'abany', 'abhlth', 'abpoor', 'abrape', 'absingle','degree'], 
                                i='yearid', j='wave', suffix='_\\d+').reset_index()
gss_2006_long = pd.wide_to_long(gss_2006, stubnames=['abdefect', 'abnomore', 'abany', 'abhlth', 'abpoor', 'abrape', 'absingle','degree'], 
                                i='yearid', j='wave', suffix='_\\d+').reset_index()

# Create year column based on wave
def map_wave_to_year(wave, base_year):
    if wave == "_1":
        return base_year
    elif wave == "_2":
        return base_year + 2
    elif wave == "_3":
        return base_year + 4
    else:
        return None

gss_2010_long['year'] = gss_2010_long['wave'].apply(lambda w: map_wave_to_year(w, 2010))
gss_2008_long['year'] = gss_2008_long['wave'].apply(lambda w: map_wave_to_year(w, 2008))
gss_2006_long['year'] = gss_2006_long['wave'].apply(lambda w: map_wave_to_year(w, 2006))

In [6]:
# Combine long dfs
gss_abt_panel = pd.concat([gss_2010_long, gss_2008_long, gss_2006_long], ignore_index=True)
gss_abt_panel.shape

(18201, 14)

In [7]:
# Load 2020 panel data
gss_2020 = pd.read_stata('data/GSS/GSS_panel2020_r1a.dta', convert_categoricals=False)

# Expect one row per respondent per wave, columns listed below:
BINARY_ITEMS_2016 = ["abdefect_1a","abnomore_1a","abany_1a","abhlth_1a","abpoor_1a","abrape_1a","absingle_1a"]
BINARY_ITEMS_2018 = ["abdefect_1b","abnomore_1b","abany_1b","abhlth_1b","abpoor_1b","abrape_1b","absingle_1b"]
BINARY_ITEMS_2020 = ["abdefect_2","abnomore_2","abany_2","abhlth_2","abpoor_2","abrape_2","absingle_2"]

# Split gss_2020 into 2016->2020 and 2018->2020 waves
gss_2016 = gss_2020[gss_2020['samptype'] == 2016]
gss_2018 = gss_2020[gss_2020['samptype'] == 2018]

# Required columns (demographic) from gss_2020
required_columns_2016 = ["cohort_1a", "degree_1a", "race_1a", "gender1_1a", "degree_2"]
required_columns_2018 = ["cohort_1b", "degree_1b", "race_1b", "gender1_1b", "degree_2"]

gss_2016 = gss_2016[['yearid'] + required_columns_2016 + BINARY_ITEMS_2016 + BINARY_ITEMS_2020]
gss_2018 = gss_2018[['yearid'] + required_columns_2018 + BINARY_ITEMS_2018 + BINARY_ITEMS_2020]

# Rename 1a and 1b
gss_2016.rename(columns={"gender1_1a": "gender", "race_1a": "race", "degree_1a": "degree_1", "cohort_1a": "cohort",
"abdefect_1a": "abdefect_1", "abnomore_1a": "abnomore_1", "abany_1a": "abany_1", "abhlth_1a": "abhlth_1", "abpoor_1a": "abpoor_1", "abrape_1a": "abrape_1", "absingle_1a": "absingle_1"}, inplace=True)
gss_2018.rename(columns={"gender1_1b": "gender", "race_1b": "race", "degree_1b": "degree_1", "cohort_1b": "cohort1",
"abdefect_1b": "abdefect_1", "abnomore_1b": "abnomore_1", "abany_1b": "abany_1", "abhlth_1b": "abhlth_1", "abpoor_1b": "abpoor_1", "abrape_1b": "abrape_1", "absingle_1b": "absingle_1"}, inplace=True)

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  gss_2020 = pd.read_stata('data/GSS/GSS_panel2020_r1a.dta', convert_categoricals=False)


In [8]:
# Convert to long format for analysis
gss_2016_long = pd.wide_to_long(gss_2016, stubnames=['abdefect', 'abnomore', 'abany', 'abhlth', 'abpoor', 'abrape', 'absingle','degree'], 
                                i='yearid', j='wave', suffix='_\\d+').reset_index()
gss_2018_long = pd.wide_to_long(gss_2018, stubnames=['abdefect', 'abnomore', 'abany', 'abhlth', 'abpoor', 'abrape', 'absingle','degree'],
                                i='yearid', j='wave', suffix='_\\d+').reset_index()

# Create year column based on wave
def map_wave_to_year(wave, last_year, gap):
    if wave == "_1":
        return last_year - gap
    elif wave == "_2":
        return last_year
    else:
        return None

gss_2016_long['year'] = gss_2016_long['wave'].apply(lambda w: map_wave_to_year(w, 2020, 4))
gss_2018_long['year'] = gss_2018_long['wave'].apply(lambda w: map_wave_to_year(w, 2020, 2))


In [9]:
# Combine long dfs
gss_abt_panel = pd.concat([gss_2016_long, gss_2018_long, gss_abt_panel], ignore_index=True)
gss_abt_panel.shape

(28631, 15)

In [10]:
# Expect one row per respondent per wave, columns listed below:
BINARY_ITEMS = ["abdefect","abnomore","abany","abhlth","abpoor","abrape","absingle"]

# Map raw responses → 1/0 (edit as needed to match your GSS coding)
YES_VALUES = {"yes","Yes","YES",1, "1", True}
NO_VALUES  = {"no","No","NO",2, "2", False}

def to_binary(s):
    if pd.isna(s): return np.nan
    return 1 if s in YES_VALUES else (0 if s in NO_VALUES else np.nan)

def prepare_binary(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for col in BINARY_ITEMS:
        if col in out.columns:
            out[col+"_bin"] = out[col].apply(to_binary).astype("float")
    return out

# Following Rossi Scale

def collapse_abortion_attitudes(df: pd.DataFrame, id_col="yearid", binary_items=None):
    """
    Collapse 7 binary GSS abortion items (Rossi Scale) into a single 4-level categorical variable.
    
    - Input: df with columns abdefect ... absingle coded 0/1 (NaN allowed).
    - Output: df with new columns:
        'n_yes'  (count of yes across 7 items)
        'att4'   (collapsed attitude category)
    """
    out = df.copy()

    # Count number of "yes" responses (ignore NaN)
    out["n_yes"] = out[binary_items].sum(axis=1, skipna=True)

    # Count number of non-missing responses (for reference)
    out["n_nonmiss"] = out[binary_items].notna().sum(axis=1)

    # Map counts into categories
    def map_to_cat(n_yes):
        if pd.isna(n_yes):
            return np.nan
        n_yes = int(n_yes)
        if n_yes <= 1:
            return "strong_anti"
        elif n_yes <= 3:
            return "anti"
        elif n_yes <= 6:
            return "pro"
        else:  # n_yes == 7
            return "strong_pro"

    out["abortion_att4"] = out["n_yes"].apply(map_to_cat)

    return out[[id_col, "year", "n_yes", "n_nonmiss", "abortion_att4", "cohort", "degree", "race", "gender"]]


# Construct binary items and collapse into attitude categories
gss_abt_panel = prepare_binary(gss_abt_panel)
gss_abt_panel = collapse_abortion_attitudes(gss_abt_panel, binary_items=[col+"_bin" for col in BINARY_ITEMS])

In [11]:
# If degree is missing in a wave, fill it with the most recent non-missing degree
gss_abt_panel = gss_abt_panel.sort_values(by=['yearid', 'year'])
gss_abt_panel['degree'] = gss_abt_panel.groupby('yearid')['degree'].ffill().bfill()

In [12]:
# Create generation column based on birth year
def determine_generation(year):
    if pd.isna(year):
        return None
    year = int(year)
    if year <= 1945:
        return "Silent Generation"
    elif 1946 <= year <= 1964:
        return "Baby Boomer"
    elif 1965 <= year <= 1980:
        return "Generation X"
    elif 1981 <= year <= 1996:
        return "Millennial"
    elif 1997 <= year <= 2012:
        return "Generation Z"
    else:
        return "Generation Alpha"
        
gss_abt_panel['generation'] = gss_abt_panel['cohort'].apply(determine_generation)

# Create education level column
def categorize_education(edu):
    if pd.isna(edu):
        return None
    elif edu <= 1:
        return "Less or equal to high school"
    elif edu == 2 or edu == 3:
        return "Associate or Bachelor's Degree"
    elif edu == 4:
        return "Graduate Degree"
    else:
        return None

gss_abt_panel['edu_level'] = gss_abt_panel['degree'].apply(categorize_education)

# Create gender column
def categorize_gender(gender):
    if pd.isna(gender):
        return None
    elif gender == 1:
        return "Male"
    elif gender == 2:
        return "Female"
    else:
        return "Other"

gss_abt_panel['gender'] = gss_abt_panel['gender'].apply(categorize_gender)

# Create race column
def categorize_race(race):
    if pd.isna(race):
        return None
    elif race == 1:
        return "White"
    elif race == 2:
        return "Black"
    elif race == 3:
        return "Other"
    else:
        return "Unknown"

gss_abt_panel['race'] = gss_abt_panel['race'].apply(categorize_race)

In [13]:
# Drop rows with n_nonmiss < 7
gss_abt_cs2 = gss_abt_cs[gss_abt_cs['n_nonmiss'] == 7]
gss_abt_panel2 = gss_abt_panel[gss_abt_panel['n_nonmiss'] == 7]

# Finally add needed columns and save
gss_abt_cs2[['yearid', 'year', 'cohort', 'edu_level', 'gender', 'race', 'abortion_att4', 'wtssps']].to_csv('data/GSS/gss_abt_cs.csv', index=False)
gss_abt_panel2[['yearid', 'year', 'cohort', 'edu_level', 'gender', 'race', 'abortion_att4']].to_csv('data/GSS/gss_abt_panel.csv', index=False)

In [14]:
print(gss_abt_cs2.shape)
print(gss_abt_panel2.shape)

(13351, 13)
(12610, 11)


In [15]:
gss_abt_cs2['abortion_att4'].value_counts()

abortion_att4
strong_pro     5593
anti           3482
pro            2190
strong_anti    2086
Name: count, dtype: int64

In [171]:
gss_abt_cs2['abortion_att4'].value_counts()

abortion_att4
strong_pro     5593
anti           3482
pro            2190
strong_anti    2086
Name: count, dtype: int64

In [16]:
gss_abt_panel2['abortion_att4'].value_counts()

abortion_att4
strong_pro     4994
anti           3491
strong_anti    2072
pro            2053
Name: count, dtype: int64

In [172]:
gss_abt_panel2['abortion_att4'].value_counts()

abortion_att4
strong_pro     4994
anti           3491
strong_anti    2072
pro            2053
Name: count, dtype: int64

In [17]:
# Save processed data
gss_abt_cs2.to_csv('data/GSS/gss_abt_cs_full.csv', index=False)
gss_abt_panel2.to_csv('data/GSS/gss_abt_panel_full.csv', index=False)

In [19]:
gss_abt_panel2.head()

Unnamed: 0,yearid,year,n_yes,n_nonmiss,abortion_att4,cohort,degree,race,gender,generation,edu_level
0,20160001,2016,7.0,7,strong_pro,1969.0,3.0,White,Male,Generation X,Associate or Bachelor's Degree
2867,20160001,2020,7.0,7,strong_pro,1969.0,3.0,White,Male,Generation X,Associate or Bachelor's Degree
2,20160003,2016,2.0,7,anti,1944.0,3.0,White,Male,Silent Generation,Associate or Bachelor's Degree
3,20160004,2016,2.0,7,anti,1973.0,1.0,White,Male,Generation X,Less or equal to high school
2870,20160004,2020,2.0,7,anti,1973.0,1.0,White,Male,Generation X,Less or equal to high school


In [18]:
gss_abt_cs2.head()

Unnamed: 0,yearid,year,n_yes,n_nonmiss,abortion_att4,cohort,degree,race,sex,wtssps,generation,edu_level,gender
3,2006_4,2006,7.0,7,strong_pro,1956,High school,Black,FEMALE,0.385372,Baby Boomer,Less or equal to high school,Female
4,2006_5,2006,7.0,7,strong_pro,1986,High school,Black,MALE,1.387133,Millennial,Less or equal to high school,Male
8,2006_9,2006,7.0,7,strong_pro,1983,Bachelor's,Black,FEMALE,0.421749,Millennial,Associate or Bachelor's Degree,Female
9,2006_10,2006,7.0,7,strong_pro,1974,Graduate,Other,FEMALE,0.527395,Generation X,Graduate Degree,Female
11,2006_12,2006,0.0,7,strong_anti,1959,High school,Black,MALE,2.423198,Baby Boomer,Less or equal to high school,Male
