# Import Required Libraries

In [None]:
import pandas as pd # type: ignore
import config
import zipfile

# Load the cleaned ActiveMembersExtra.csv file

In [2]:
# Read the cleaned members extract; its location is taken from the config module
active_members_extra_clean_df = pd.read_csv(
    filepath_or_buffer=config.ACTIVE_MEMBERS_EXTRA_CLEAN,
)
# Uncomment to preview the loaded rows
# active_members_extra_clean_df.head()

# Load the UCI Gender by Name Dataset

In [3]:
# Open the archive and parse its first listed member as CSV
with zipfile.ZipFile(config.GENDER_BY_NAME_ZIP) as archive:
    first_member = archive.namelist()[0]
    with archive.open(first_member) as csv_stream:
        raw_uci_gender_df = pd.read_csv(csv_stream)

# Uncomment to preview the loaded rows
# raw_uci_gender_df.head()

# Adapt the UCI Gender by Name Dataset
- Drop Count/Probability columns
- Rename column "Name" to "first_name"
- Lowercase column names

In [4]:
# The raw dataset may list the same name under more than one gender. Keep only
# the row with the largest Count for each name so that first_name is a unique
# key: duplicate keys would make the later left merge repeat member rows with
# conflicting genders.
# Assumes the Count column is numeric -- TODO confirm against the raw CSV.
most_common_rows = (
    raw_uci_gender_df
    .sort_values('Count', ascending=False, kind='stable')
    .drop_duplicates(subset='Name', keep='first')
    .sort_index()  # restore the original row order after the sort
)

# Filter out Count/Probability columns
uci_gender_df = most_common_rows[['Name', 'Gender']].copy()

# Rename the Name column to first_name
uci_gender_df.rename(columns={'Name': 'first_name'}, inplace=True)

# Lowercase column names
uci_gender_df.columns = uci_gender_df.columns.str.lower()

# Display the first few rows of the modified dataframe to verify changes
# uci_gender_df.head()

# Add Corrections to the UCI Gender by Name Dataset

In [5]:
# Load the 02-AllMembers-InsertGenderCorrections.csv file using pandas
gender_corrections_df = pd.read_csv(config.GENDER_BY_NAME_MISSING_RECORDS)

# Remove lookup rows whose name also appears in the corrections, then append
# the corrections. A plain append would leave both the old row and its
# correction in place, so the merge on first_name would repeat the member row
# with two different genders. Filtering first also makes this cell safe to
# re-run: corrections appended by a previous run are replaced, not duplicated.
# Assumes the corrections file has a first_name column like uci_gender_df
# -- TODO confirm against the CSV.
is_corrected = uci_gender_df['first_name'].isin(gender_corrections_df['first_name'])
uci_gender_df = pd.concat(
    [uci_gender_df[~is_corrected], gender_corrections_df],
    ignore_index=True,
)

# Display the last few rows of the merged dataframe to verify the merge
# uci_gender_df.tail()



# Parse and Merge Datasets
Left-join the cleaned ActiveMembersExtra data with the UCI Gender by Name lookup (including the corrections) on the `first_name` column.

In [6]:
# Left join on 'first_name': every member row is kept, and members without a
# matching name get NaN in the columns coming from uci_gender_df.
# Overlapping column names keep the lookup's name as-is and get '_left'
# appended on the members side.
# NOTE: if uci_gender_df holds several rows for one first_name, the member row
# is repeated once per matching row.
merged_df = active_members_extra_clean_df.merge(
    uci_gender_df,
    how='left',
    on='first_name',
    suffixes=('_left', ''),
)

# Uncomment to preview the joined rows
# merged_df.head()

# Save the New Dataset

In [7]:
# Write the gender-enriched members table to CSV, leaving out the index column
merged_df.to_csv(
    path_or_buf=config.ACTIVE_MEMBERS_EXTRA_GENDER,
    index=False,
)