In [1]:
import pandas as pd

In [2]:
# Load the CSV file
def load_data(filepath, name):
    """Read a header-less CSV into a DataFrame and report its dimensions.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    name : str
        Label for the dataset. It is only used in the printed status message.

    Returns
    -------
    pandas.DataFrame
        The parsed file. Because ``header=None`` is used, the first line of
        the file is kept as data row 0 and the columns get integer labels.
    """
    frame = pd.read_csv(filepath, header=None)
    row_count, column_count = frame.shape
    print(f"{name} data loaded successfully with {row_count} rows and {column_count} columns.")
    return frame

# Read the raw file; the second argument is only a label for the status message.
df = load_data(filepath="../data/raw/queen-charlotte.csv", name="Bridgerton")

Bridgerton data loaded successfully with 101 rows and 1 columns.


In [3]:
# 101 rows but only a single column looks wrong, so preview the
# first five rows to see what each cell actually contains.
print(df.head(n=5))

                                                   0
0  ID,Age,Gender,Social_Standing,Wealth_Level,Edu...
1  1,22,Female,Upper,High,College,Strong,8,7,Char...
2  2,30,Male,Middle,Medium,High School,Moderate,5...
3  3,28,Female,Lower,Low,High School,Weak,6,5,Out...
4  4,25,Female,Upper,High,College,Strong,7,6,Char...


## Data Restructuring

### Properly Format the CSV File

In [4]:
# Every row was read as one comma-joined string, so break it into separate fields.
# NOTE(review): a plain split does not honour CSV quoting; a field that itself
# contains a comma would be split as well — confirm the data has none.
split_rows = df[0].str.split(',', expand=True)

# Row 0 holds the header text. Turn it into a plain list before using it as the
# column labels: assigning the row Series directly would carry its label (0)
# over as the name of the columns index, which then prints as a stray "0" in
# front of the headers of every displayed frame.
header_row = split_rows.iloc[0].tolist()

# Keep only the data rows, renumber them from 0, and attach the headers.
df_split = split_rows.iloc[1:].reset_index(drop=True)
df_split.columns = header_row

# Inspect the first few rows of the split df
print(f"Data is split into {df_split.shape[1]} columns.\n")
print(df_split.head())

Data is split into 28 columns.

0 ID Age  Gender Social_Standing Wealth_Level Education_Level  \
0  1  22  Female           Upper         High         College   
1  2  30    Male          Middle       Medium     High School   
2  3  28  Female           Lower          Low     High School   
3  4  25  Female           Upper         High         College   
4  5  35    Male           Upper    Very High         College   

0 Family_Connections Attractiveness Health Personality_Traits  ...  \
0             Strong              8      7           Charming  ...   
1           Moderate              5      8           Reserved  ...   
2               Weak              6      5           Outgoing  ...   
3             Strong              7      6        Charismatic  ...   
4             Strong              4      9            Serious  ...   

0 Parents_Marital_Status Dowry_Size Land_Ownership Political_Influence  \
0                Married      Large            Yes            Moderate   
1       

### Convert Column Headers to Lowercase

In [5]:
# Lowercase headers keep column names consistent and avoid
# lookups failing because of a difference in letter case.
lowercase_headers = df_split.columns.astype(str).str.lower()
df_split.columns = lowercase_headers
print("Column headers converted to lowercase.")
print(df_split.head())

Column headers converted to lowercase.
0 id age  gender social_standing wealth_level education_level  \
0  1  22  Female           Upper         High         College   
1  2  30    Male          Middle       Medium     High School   
2  3  28  Female           Lower          Low     High School   
3  4  25  Female           Upper         High         College   
4  5  35    Male           Upper    Very High         College   

0 family_connections attractiveness health personality_traits  ...  \
0             Strong              8      7           Charming  ...   
1           Moderate              5      8           Reserved  ...   
2               Weak              6      5           Outgoing  ...   
3             Strong              7      6        Charismatic  ...   
4             Strong              4      9            Serious  ...   

0 parents_marital_status dowry_size land_ownership political_influence  \
0                Married      Large            Yes            Moderate   
1

### Rename the Column from `gender` to `sex`

In [6]:
# The column is treated as recording biological categories (Male/Female) rather
# than the social or cultural notion of 'gender', so 'sex' is the clearer label.
column_renames = {'gender': 'sex'}
df_split.rename(columns=column_renames, inplace=True)

### Save the Restructured Data

In [8]:
# Destination inside the interim data folder, written with single path separators.
output_path = "../data/interim/queen-charlotte-restructured.csv"

# index=False leaves the row numbers out of the file, so only the data columns are written.
df_split.to_csv(output_path, index=False)
print("Restructured CSV saved as 'queen-charlotte-restructured.csv'")

Restructured CSV saved as 'queen-charlotte-restructured.csv'
