In [1]:
# Import necessary libraries

import pandas as pd
import sweetviz as sv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# Load your dataset into a Pandas DataFrame (replace 'C:\Users\Shardul\OneDrive - London Business School\coding\anya\data\SBRI P2 PPIE survey responses.xlsx' with your filepath)
df = pd.read_excel(r'C:\Users\Shardul\OneDrive - London Business School\coding\anya\data\SBRI P2 PPIE survey responses.xlsx')


In [3]:
# Delete rows 1 and 3
df = df.drop([1, 3])

# Make the second row the header
new_header = df.iloc[0] # Grab the first row for the header
df = df[1:] # Take the data excluding the header row
df.columns = new_header # Set the header row as the DataFrame header

# Reset index
df.reset_index(drop=True, inplace=True)



In [4]:
# Check for duplicate column names
duplicate_columns = df.columns[df.columns.duplicated()]
if len(duplicate_columns) > 0:
    print("Duplicate column names detected:")
    for col_name in duplicate_columns:
        print(col_name)
else:
    print("No duplicate column names found.")

No duplicate column names found.


In [5]:
# # Generate the Sweetviz report
# report = sv.analyze(df, pairwise_analysis='off')
# # Display the report
# report.show_html(r'C:\Users\Shardul\OneDrive - London Business School\coding\anya\output\report_renamed.html')

In [6]:
# Identify columns with 100% missing values
columns_to_drop = df.columns[df.isnull().all()]

# Print the names of columns that will be dropped
print("Columns with 100% missing values:", columns_to_drop.tolist())

# Drop columns with 100% missing values
df.drop(columns=columns_to_drop, inplace=True)

Columns with 100% missing values: []


We want to rename columns from Long Questions to Precise Variable names. 

In [7]:
# Dictionary mapping original column names to concise ones
column_mapping = {
    "1 - In this survey, we want to hear views from parents who gave birth, and partners. Are you the birthing parent or the partner?": "birthing_parent_or_partner",
    "2 - Which best describes your gender?": "gender",
    "3 - Which of these describes you best?": "pregnancy_stage",
    "4 - Roughly how many weeks pregnant were you when you (or your partner) notified this pregnancy to a health professional?": "anc_weeks_pregnant_notified",
    "5 - Where did you primarily get information about your options for where to give birth?": "anc_birth_info_source",
    "6 - How satisfied were you with the information you got from your healthcare provider about your birth options?": "anc_satisfaction_birth_info",
    "7 - At your antenatal appointments, how often did you see or speak to the same health professional?": "anc_repeat_provider_frequency",
    "8 - How did the majority of your antenatal appointments take place?": "anc_antenatal_appointment_mode",
    "9 - How easy was it for you to attend your antenatal appointments?": "anc_antenatal_attendance_ease",
    "10 - Please tell us why it was not easy to access your appointments.": "anc_antenatal_access_issues",
    "11 - During your antenatal appointments, did your health professional appear to be aware of your medical history?": "anc_provider_aware_medical_history",
    "12 - During your antenatal appointments, were you given enough time to ask questions about pregnancy, birth and parenting?": "anc_antenatal_question_time",
    "13 - During your antenatal appointments, did you feel involved in decisions about your care?": "anc_antenatal_decision_involvement",
    "14 - During your antenatal care, did you receive help you needed with your mental health?": "anc_mental_health_support_antenatal",
    "15 - If you have a long-term health condition, did you receive information on how this would affect your pregnancy?": "anc_long_term_condition_info",
    "16 - If you had a pregnancy related health condition, did you receive information on how this would affect your pregnancy?": "anc_pregnancy_condition_info",
    "17 - During your pregnancy, did you receive helpful information about feeding your baby?": "anc_baby_feeding_info_pregnancy",
    "18 - Did you have confidence and trust in the staff caring for you during your antenatal care?": "anc_antenatal_staff_trust",
    "19 - If you raised a concern during your antenatal care, did you feel that it was taken seriously?": "anc_antenatal_concern_serious",
    "20 - How easy was it for you to attend antenatal appointments with your partner?": "anc_partner_antenatal_difficulty",
    "21 - Please tell us why it was not easy to access your appointments.": "anc_antenatal_access_difficulty_reason",
    "22 - During the antenatal period, did you receive help you needed with your mental health?": "anc_mental_health_support_antenatal_period",
    "23 - Did you receive helpful information about feeding your baby?": "anc_baby_feeding_info",
    "24 - My health professional made assumptions about me": "anc_provider_assumptions",
    "25 - I encountered challenges in asking my health professional for help.": "anc_provider_help_challenges",
    "26 - It was difficult to find resources that represented me": "anc_resource_represent_difficulty",
    "27 - It was difficult to find the information I needed": "anc_info_find_difficulty",
    "28 - I experienced discrimination while accessing healthcare services.": "anc_healthcare_discrimination",
    "29 - It was difficult for me to get to routine appointments": "anc_routine_appointment_difficulty",
    "30 - I moved house during this time and this made it difficult to get the care I needed.": "anc_moved_house_care_difficulty",
    "31 - It was difficult balancing pregnancy whilst caring for my older children.": "anc_pregnancy_balance_care_children",
    "32 - My health professional didn't respect me.": "anc_provider_disrespect",
    "33 - My friends or family didn't support me.": "anc_lack_family_support",
    "34 - I had relationship challenges.": "anc_relationship_challenges",
    "35 - It was difficult balancing pregnancy alongside work or study.": "anc_pregnancy_balance_work_study",
    "36 - I felt financially insecure.": "anc_financial_insecurity",
    "37 - I was worried about returning to work or study after parental leave": "anc_work_study_return_worry",
    "38 - Negative views about pregnancy from my peers affected me.": "anc_peer_pregnancy_negative_views",
    "39 - Family views and pressures about pregnancy affected me": "anc_family_pregnancy_pressures",
    "40 - I experienced mental health challenges.": "anc_mental_health_challenges",
    "41 - I experienced physical health challenges.": "anc_physical_health_challenges",
    "42 - I was worried about my baby's health or birth complications": "anc_baby_health_birth_worry",
    "43 - Please share any other challenges or experiences not covered above.": "anc_additional_challenges_experiences",
    "44 - Shortly before the birth of your baby, how empowered did you feel in your ability to take care of your child?": "anc_birth_empowerment",
    "45 - Thinking about your care during labour and birth, did you feel involved in decisions about your care?": "anc_labour_decision_involvement",
    "46 - After your baby was born, did you have the opportunity to ask questions about your labour and birth?": "anc_birth_question_opportunity",
    "47 - One a scale of 1-5, how well do you think your questions were answered?": "anc_question_response_scale",
    "48 - At what gestational age was your baby born? (in weeks)": "gestational_age_weeks",
    "49 - How was your baby born?": "birth_type",
   "50 - Thinking about the care you received immediately after the birth of your baby, were you given the information you needed about postnatal care and recovery?": "birth_postnatal_info_sufficiency",
    "51 - Thinking about the care you received immediately after the birth of your baby, were you treated with kindness and understanding?": "birth_postnatal_kindness",
    "52 - In the first few days of life, how was your baby fed?": "birth_early_life_feeding",
    "53 - Were your feelings about how you wanted to feed your baby respected by healthcare professionals?": "birth_feeding_preferences_respected",
    "54 - Thinking about labour and birth, did you have the knowledge you needed, to understand decisions being made about your partner's or baby's care?": "birth_labour_birth_knowledge",
    "55 - After your baby was born, did you have the opportunity to ask questions about the birth of your baby?": "birth_baby_birth_question_opportunity",
    "56 - One a scale of 1-5, how well do you think your questions were answered?": "birth_question_response_quality",
    "57 - At what gestational age was your baby born? (in weeks)": "partner_gestational_age_weeks",
    "58 - How was your baby born?": "partner_birth_type",
    "59 - Thinking about the care you and your partner received immediately after the birth of your baby, were you given the information you needed about postnatal care and recovery for your partner?": "partner_postnatal_info",
    "60 - Thinking about the care you and your partner received immediately after the birth of your baby, were you treated with kindness and understanding?": "partner_postnatal_kindness",
    "61 - In the first few days of life, how was your baby fed?": "partner_early_life_feeding",
   "62 - Thinking about your postnatal care, did you feel involved in decisions about your care?": "postnatal_decision_involvement",
    "63 - If your baby needed medical care after birth, did you feel involved in decisions about their care?": "postnatal_baby_medical_care_decisions",
    "64 - If you contacted a health professional for support, were you given the help you needed?": "postnatal_professional_support_help",
    "65 - Since your baby’s birth have you been visited at home by a health professional?": "postnatal_home_professional_visit",
    "66 - At your routine appointments, or during your baby's care, how often did you see or speak to the same health professional?": "postnatal_routine_appointments_provider",
    "67 - Would you have preferred to have seen or spoken to a health professional…": "postnatal_provider_preference",
    "68 - Did the health professional(s) that you saw or spoke to appear to be aware of the medical history of you and your baby?": "postnatal_provider_medical_history_awareness",
    "69 - Did you have confidence and trust in the health professionals you saw or spoke to after going home?": "postnatal_home_provider_trust",
    "70 - Were you given information about any changes you might experience to your mental health after having your baby?": "postnatal_mental_health_info_postpartum",
    "71 - Did you receive care and support you needed with regards to your mental health?": "postnatal_mental_health_support_needed",
    "72 - Were you given information about your own physical recovery after the birth?": "postnatal_physical_recovery_info",
    "73 - How useful was this information?": "postnatal_physical_recovery_info_useful",
    "74 - When it came to feeding your baby, where did you access support and information?": "postnatal_feeding_support_source",
    "75 - How useful was this information?": "postnatal_feeding_support_info_useful",
    "76 - Have you ever attended a breastfeeding or infant feeding class?": "postnatal_feeding_class_attendance",
    "77 - Was this breastfeeding or infant feeding class useful?": "postnatal_feeding_class_usefulness",
    "78 - How helpful was this information?": "postnatal_info_helpfulness",
    "79 - Did the health professional(s) that you saw or spoke to appear to be aware of the medical history of you and your baby?": "postnatal_provider_medical_history_awareness_2",
   "80 - Were you given information about any changes you might experience to your mental health after having your baby?": "partner_pnc_postnatal_mental_health_info_postpartum_2",
    "81 - Did you receive care and support you needed with regards to your mental health?": "partner_pnc_mental_health_care_support",
    "82 - Were you given information about your own physical recovery after the birth?": "partner_pnc_physical_recovery_info_2",
    "83 - How useful was this information?": "partner_pnc_physical_recovery_info_useful_2",
    "84 - When it came to feeding your baby, where did you access support and information?": "partner_pnc_feeding_support_info",
    "85 - How useful was this information?": "partner_pnc_feeding_support_info_usefulness",
    "86 - Have you ever attended a breastfeeding or infant feeding class?": "partner_pnc_feeding_class_attendance",
    "87 - Was this breastfeeding or infant feeding class useful?": "partner_pnc_feeding_class_usefulness",
    "88 - Please tell us about any challenges or experiences that weren't covered here": "partner_pnc_additional_challenges_experiences_2",
    "89 - Were you given information about any changes you might experience to your mental health after having your baby?": "partner_pnc_postnatal_mental_health_info",
    "90 - When it came to feeding your baby, where did you access support and information?": "partner_pnc_feeding_support_info",
    "91 - How useful was this information?": "partner_pnc_feeding_info_usefulness",
    "92 - If, during evenings, nights, or weekends, you needed support or advice about feeding your baby, where did you access this out-of-hours support?": "partner_pnc_out_of_hours_feeding_support",
    "93 - How useful was this information?": "partner_pnc_out_of_hours_info_usefulness",
    "94 - In the six weeks after the birth of your baby did you receive advice from health professionals about your baby’s development?": "partner_pnc_postnatal_baby_development_advice",
    "95 - How useful was this information?": "partner_pnc_baby_development_advice_usefulness",
    "141 - Token": "token",
    "142 - Submitted at": "submitted_at"
}




In [8]:
# Extended dictionary mapping original column names to concise ones
column_mapping.update({
    "96 - Did you provide any breastmilk to your baby within the first 48 hours after birth?": "initial_breastfeeding",
    "97 - Can you tell us about why you didn't breastfeed?": "non_breastfeeding_reason",
    "98 - How long after birth did you start breastfeeding?": "breastfeeding_start_time",
    "99 - Are you still providing your baby with breastmilk (either direct feeding or expressing)?": "current_breastfeeding_status",
    "100 - For how long did you breastfeed?": "breastfeeding_duration",
    "101 - Could you please tell us about how you came to stop breastfeeding?": "breastfeeding_stop_reason",
    "102 - Did your baby receive any breastmilk within the first 48 hours after birth?": "baby_initial_breastfeeding",
    "103 - Can you tell us about why your partner didn't breastfeed?": "partner_non_breastfeeding_reason",
    "104 - Is your baby still receiving breastmilk (either direct feeding or expressing)?": "baby_current_breastfeeding_status",
    "105 - Could you please tell us about how your partner came to stop breastfeeding?": "partner_breastfeeding_stop_reason",
    "106 - My health professional made assumptions about me": "pnc_provider_assumptions",
    "107 - I found it difficult to ask my health professional for help.": "pnc_provider_help_difficulty",
    "108 - It was difficult to find resources that represented me": "pnc_resource_representation_difficulty",
    "109 - It was difficult to find the information I needed": "pnc_info_access_difficulty",
    "110 - I experienced discrimination while accessing healthcare services.": "pnc_healthcare_discrimination",
    "111 - Attending appointments was difficult for me.": "pnc_appointment_difficulty",
    "112 - I moved house and this made it difficult to get the care I needed.": "pnc_house_move_care_difficulty",
    "113 - It was difficult caring for my baby whilst also caring for my older children.": "pnc_baby_care_difficulty_with_older_children",
    "114 - My health professional didn't respect me.": "pnc_provider_disrespect",
    "115 - My friends or family didn't support me.": "pnc_family_support_lack",
    "116 - I had relationship challenges.": "pnc_relationship_issues",
    "117 - It was difficult balancing parenting alongside work or study.": "pnc_parenting_work_study_balance_difficulty",
    "118 - I felt financially insecure during the time my baby was 0-2 years old.": "pnc_financial_insecurity",
    "119 - Negative views from peers about my parenting choices affected me.": "pnc_peer_parenting_negative_views",
    "120 - Negative family views and pressures about my parenting choices affected me": "pnc_family_parenting_views_pressure",
    "121 - I experienced mental health challenges": "pnc_mental_health_challenges",
    "122 - I experienced physical health challenges": "pnc_physical_health_challenges",
    "123 - I was worried about my baby's health": "pnc_baby_health_worry",
    "124 - Please share any other challenges or experiences not covered above.": "pnc_additional_experiences",
    "125 - What year were you born?": "demo_birth_year",
    "126 - How many pregnancies have you or your partner had before this one?": "demo_previous_pregnancies_count",
    "127 - We understand that some families are sadly affected by pregnancy or infant loss. How many children do you have?": "demo_current_children_count",
    "128 - Do you (or your partner, if they were the one giving birth) have any of the following physical or mental health conditions, disabilities or illnesses that have lasted or are expected to last 12 months or more and negatively impact on day-to-day l...": "demo_long_term_health_issues",
    "129 - Did you (or your partner, if they were pregnant) have any of the following pregnancy-related health conditions during this pregnancy?": "demo_pregnancy_related_conditions",
    "130 - What is your religion?": "demo_religion",
    "131 - Is English your main language?": "demo_main_language_english",
    "132 - Which languages do you most frequently use for conversation during the course of your day?": "demo_frequent_conversation_languages",
    "133 - Which languages can you read well?": "demo_readable_languages",
    "134 - What is your post code?": "demo_post_code",
    "135 - What is your ethnic group?": "demo_ethnic_group",
    "136 - Are you a UK Citizen?": "demo_uk_citizenship",
    "137 - Have you used the Anya app during your pregnancy or after childbirth?": "demo_anya_app_usage",
    "138 - Is there anything else you'd like to share about your maternity experience?": "demo_additional_maternity_experience",
    "139 - As we progress with this and other exciting projects, we are keen to keep hearing from our users. Would you be happy for us to contact you to support Anya's future activities?": "demo_anya_future_contact",
    "140 - What is your email address": "demo_email_address"
})

# Use this dictionary to rename columns
df_encoded = df.rename(columns=column_mapping)


We'll store this mapping in an excel

In [9]:
# Initialize lists to hold the extracted data
question_numbers = []
questions = []
variables = []

# Iterate through the dictionary to split keys (questions) and save the values (variables)
for key, value in column_mapping.items():
    # Split the key into the question number and the question text
    parts = key.split(" - ", 1)
    question_numbers.append(parts[0])
    questions.append(parts[1])
    variables.append(value)


# Create a DataFrame from the lists
df_questions = pd.DataFrame({
    "Question Number": question_numbers,
    "Question": questions,
    "Variable": variables
})

df_questions.to_csv(r"C:\Users\Shardul\OneDrive - London Business School\coding\anya\data\ppie-questions.csv")

Let's add some replacement rules to make our life easier when plotting graphs or frequency tables. 

In [10]:
# Define the replacement rules
replacements = {
    'Yes, always': 'Yes',
    'Yes, definitely': 'Yes',
    'Yes, sometimes': 'Yes, sometimes',
    'Yes, partly': 'Yes, sometimes',
    'Yes, to some extent': 'Yes, sometimes'
}

# Replace "Not Applicable/ Don't want to answer" with "Other" across the entire DataFrame
df_encoded.replace("Not Applicable/ Don't want to answer", "Other", inplace=True)
df_encoded.replace("Not applicable/rather not say", "Other", inplace=True)
df_encoded.replace("I did not want/need to be involved", "Other", inplace=True)

df_encoded.replace("Strongly disagree", "Strongly Disagree", inplace=True)
df_encoded.replace("Strongly agree", "Strongly Agree", inplace=True)

# Apply the replacements across the entire DataFrame
df_encoded.replace(replacements, inplace=True)

In [11]:
print(df_encoded.columns.tolist())


['birthing_parent_or_partner', 'gender', 'pregnancy_stage', 'anc_weeks_pregnant_notified', 'anc_birth_info_source', 'anc_satisfaction_birth_info', 'anc_repeat_provider_frequency', 'anc_antenatal_appointment_mode', 'anc_antenatal_attendance_ease', 'anc_antenatal_access_issues', 'anc_provider_aware_medical_history', 'anc_antenatal_question_time', 'anc_antenatal_decision_involvement', 'anc_mental_health_support_antenatal', 'anc_long_term_condition_info', 'anc_pregnancy_condition_info', 'anc_baby_feeding_info_pregnancy', 'anc_antenatal_staff_trust', 'anc_antenatal_concern_serious', 'anc_partner_antenatal_difficulty', 'anc_antenatal_access_difficulty_reason', 'anc_mental_health_support_antenatal_period', 'anc_baby_feeding_info', 'anc_provider_assumptions', 'anc_provider_help_challenges', 'anc_resource_represent_difficulty', 'anc_info_find_difficulty', 'anc_healthcare_discrimination', 'anc_routine_appointment_difficulty', 'anc_moved_house_care_difficulty', 'anc_pregnancy_balance_care_childre

We'll make some variables that will be binary ( 0 or 1) depending on whether they indicate membership in one of our target groups.

1. Ethnic minority: is_ethnic_minority --> demo_ethnic_group
2. Parents living in IMD1&2: imd_1_2 --> demo_post_code 
3. Aged under 25: age_less_than_25 --> demo_birth_year
4. Parents of premature babies (defined as less than 37 weeks gestation): premature_birth --> 
5. Non-native English speakers: non_native_english_speaker --> demo_main_language_english
6. Partners: partner --> birthing_parent_or_partner
7. Obesity: comorbidity_obesity --> demo_long_term_health_issues


In [12]:
# Descriptive summaries for the original variables
original_columns = ['demo_ethnic_group', 'demo_post_code', 'demo_birth_year', 'gestational_age_weeks', 'demo_main_language_english', 'birthing_parent_or_partner', 'demo_long_term_health_issues']

def descriptive_summary(df, column):
    description = {
        'Column Name': column,
        'Data Type': df[column].dtype,
        'Number of Unique Levels': df[column].nunique(),
        'Levels': df[column].unique(),
        'Frequency': df[column].value_counts(dropna=False)
    }
    return description

print("Descriptive Summaries for Original Variables:\n")
for col in original_columns:
    summary = descriptive_summary(df_encoded, col)
    print(f"Descriptive Summary for {col}:\n")
    for key, value in summary.items():
        print(f"{key}: {value}")
    print("\n")

Descriptive Summaries for Original Variables:

Descriptive Summary for demo_ethnic_group:

Column Name: demo_ethnic_group
Data Type: object
Number of Unique Levels: 17
Levels: ['Arab' 'White: English / Welsh / Scottish / Northern Irish / British'
 'Asian/Asian British: Indian' 'White: Any other White background'
 'I would prefer not to say'
 'Mixed/multiple ethnic groups: Any other Mixed / multiple ethnic background'
 'Asian/Asian British: Chinese'
 'Black/African/Caribbean/Black British: African background'
 'Asian/Asian British: Pakistani'
 'Asian/Asian British: Any other Asian background'
 'Mixed/multiple ethnic groups: White and Asian'
 'Mixed/multiple ethnic groups: White and Black African'
 'Asian/Asian British: Bangladeshi'
 'Mixed/multiple ethnic groups: White and Black Caribbean'
 'Any other ethnic group' 'White: Irish'
 'Black/African/Caribbean/Black British: Any other Black / Black British / Caribbean background']
Frequency: demo_ethnic_group
White: English / Welsh / Scottis

In [13]:
## Ethnic Minority

In [14]:
# Create a binary variable where 0 indicates the person is White (specifically starting with "White: ")
df_encoded['is_ethnic_minority'] = np.where(
    df_encoded['demo_ethnic_group'].str.startswith("White:"),
    0,  # Person is White, specifically starting with "White: "
    1   # Person is not White
)

# Print to verify that the new column is correct
print(df_encoded[['demo_ethnic_group', 'is_ethnic_minority']])

0                                    demo_ethnic_group  is_ethnic_minority
0                                                 Arab                   1
1    White: English / Welsh / Scottish / Northern I...                   0
2    White: English / Welsh / Scottish / Northern I...                   0
3    White: English / Welsh / Scottish / Northern I...                   0
4    White: English / Welsh / Scottish / Northern I...                   0
..                                                 ...                 ...
186  Black/African/Caribbean/Black British: Any oth...                   1
187  White: English / Welsh / Scottish / Northern I...                   0
188  White: English / Welsh / Scottish / Northern I...                   0
189  White: English / Welsh / Scottish / Northern I...                   0
190  White: English / Welsh / Scottish / Northern I...                   0

[191 rows x 2 columns]


In [15]:
## Age

In [16]:
df_encoded['age'] = 2024 - df_encoded['demo_birth_year']

df_encoded = df_encoded[(df_encoded['age'] >= 18) & (df_encoded['age'] < 100)]

df_encoded['age_less_than_25'] = df_encoded['age'].apply(lambda x: 1 if x < 25 else 0)

In [17]:
df_encoded['age'].value_counts()

age
36    19
39    17
32    14
33    13
35    13
34    11
37    10
31    10
28     8
30     8
27     8
25     6
40     6
29     6
42     5
43     5
23     4
22     4
24     4
41     4
26     3
45     2
21     2
38     2
47     1
18     1
44     1
Name: count, dtype: int64

In [18]:
## IMD

In [19]:
# Step 1: Read the CSV file
csv_path = r'C:\Users\Shardul\OneDrive - London Business School\coding\anya\data\2019-deprivation-by-postcode.csv'
## Write instructions


imd_data = pd.read_csv(csv_path)

# Display the first few rows of the CSV data to understand its structure
print(imd_data.head())

# Step 2: Extract relevant columns (Postcode and IMD Decile)
imd_data = imd_data[['Postcode', 'Index of Multiple Deprivation Decile']]

# Step 3: Ensure the postcode column names match in both DataFrames
imd_data.rename(columns={'Postcode': 'demo_post_code', 'Index of Multiple Deprivation Decile': 'IMD_decile'}, inplace=True)

# Assuming df_encoded is your existing DataFrame, display its first few rows
print(df_encoded.head())

# Step 4: Merge the data based on the postcode
df_encoded = df_encoded.merge(imd_data, on='demo_post_code', how='left')

# Display the updated DataFrame with the new IMD_decile column
print(df_encoded.head())


     Postcode Postcode Status  LSOA code                       LSOA Name  \
0  "'SO172NR"            Live  E01017233      Southampton 013C E01017233   
1  "'Wn8 6hh"            Live  E01025481  West Lancashire 010C E01025481   
2  "'FY2 0QQ"            Live  E01012690        Blackpool 004A E01012690   
3   "'Wa87dx"            Live  E01012424           Halton 007C E01012424   
4  "'Fy2 0pd"            Live  E01012692        Blackpool 004C E01012692   

   Index of Multiple Deprivation Rank  Index of Multiple Deprivation Decile  \
0                             11587.0                                   4.0   
1                              4491.0                                   2.0   
2                             12907.0                                   4.0   
3                               690.0                                   1.0   
4                              3815.0                                   2.0   

   Income Rank  Income Decile  Income Score  Employment Rank  ...  \

In [20]:
df_encoded['IMD_decile'].describe()

count    155.000000
mean       4.470968
std        2.796820
min        1.000000
25%        2.000000
50%        4.000000
75%        7.000000
max       10.000000
Name: IMD_decile, dtype: float64

In [21]:
df_encoded['imd_1_2'] = df_encoded['IMD_decile'].apply(lambda x: 1 if x < 3 else 0)

In [22]:
## Premature Birth
df_encoded['premature_birth'] = df_encoded['gestational_age_weeks'].apply(lambda x: 1 if x < 37 else 0)

df_encoded['premature_birth'].value_counts()


premature_birth
0    171
1     24
Name: count, dtype: int64

In [23]:
## Is Partner
df_encoded['is_partner'] = df_encoded['birthing_parent_or_partner'].apply(lambda x: 1 if x == 'Partner' else 0)

df_encoded['is_partner'].value_counts()


is_partner
0    184
1     11
Name: count, dtype: int64

In [24]:
## Is Non-Native English Speaker

df_encoded['non_native_english_speaker'] = df_encoded['demo_main_language_english'].apply(lambda x: 0 if x == 'Yes' else 1)


df_encoded['non_native_english_speaker'].value_counts()


non_native_english_speaker
0    169
1     26
Name: count, dtype: int64

In [25]:
## Has Obesity

# Create a function to check if 'obesity' is present in the health issues string
def has_obesity(issue):
    if isinstance(issue, str) and 'obesity' in issue.lower():
        return 1
    else:
        return 0

# Apply the function to create the binary variable
df_encoded['comorbidity_obesity'] = df_encoded['demo_long_term_health_issues'].apply(has_obesity)

# Display the first few rows to verify
df_encoded['comorbidity_obesity'].value_counts()

comorbidity_obesity
0    176
1     19
Name: count, dtype: int64

We'll create a variable for antenatal and postnatal parents as well

In [26]:

# Define the conditions for the new variable
conditions = [
    df_encoded['pregnancy_stage'].isin(["I have a baby aged less than 3 months", "I have a baby aged 3-6 months"]),
    df_encoded['pregnancy_stage'].isin(["I have a baby aged 6-12 months", "I have a baby aged 12-24 months"])
]

# Define the values corresponding to the conditions
values = [
    '0-6 months',  # Group for babies less than 6 months old
    'Older than 6 months'  # Group for babies older than 6 months
]

# Use numpy.select to apply these conditions and values
df_encoded['baby_age_group'] = np.select(conditions, values, default=np.nan)

# Display the DataFrame to see the result
print(df_encoded['baby_age_group'].value_counts())

baby_age_group
0-6 months             90
Older than 6 months    72
nan                    33
Name: count, dtype: int64


In [27]:
# Create a binary variable where 1 indicates the baby's age is greater than 6 months
df_encoded['baby_age_over_6_months'] = np.where(
    df_encoded['pregnancy_stage'].isin(["I have a baby aged 6-12 months", "I have a baby aged 12-24 months"]),
    1,  # Baby age greater than 6 months
    0,   # Baby age 6 months or less, or other situations
)



In [28]:
df_encoded['birth_type'].value_counts()

def categorize_birth_type(birth_type):
    if pd.isna(birth_type):
        return "Unknown"  # or any other category you prefer for NaN values
    elif "planned cesarean" in str(birth_type).lower():
        return "Planned Cesarean Section"
    elif "emergency cesarean" in str(birth_type).lower():
        return "Emergency Cesarean Section"
    elif "vaginal delivery" in str(birth_type).lower():
        return "Vaginal Delivery"
    elif "assisted vaginal birth" in str(birth_type).lower():
        return "Assisted Vaginal Birth"
    else:
        return "Other"

# Apply categorization to create a new column
df_encoded['birth_type_category'] = df_encoded['birth_type'].apply(categorize_birth_type)

# Check the distribution of the new categories
print(df_encoded['birth_type_category'].value_counts())



birth_type_category
Vaginal Delivery              53
Emergency Cesarean Section    52
Assisted Vaginal Birth        34
Planned Cesarean Section      32
Unknown                       24
Name: count, dtype: int64


In [29]:
df_encoded['birth_early_life_feeding'].value_counts()
def categorize_feeding_method(feeding_method):
    if pd.isna(feeding_method):
        return "Missing Data"  # or any other category you prefer for NaN values
    elif "Breastfed or expressed breastmilk only" in feeding_method:
        return "Exclusive Breastfeeding"
    elif "Both breastfed and formula-fed" in feeding_method:
        return "Combination Feeding"
    elif "My baby was in a neonatal unit" in feeding_method or "feeding tube" in feeding_method:
        return "Special Circumstances"
    elif "Formula-fed only" in feeding_method:
        return "Exclusive Formula Feeding"
    else:
        return "Other"

# Apply categorization to create a new column
df_encoded['feeding_category'] = df_encoded['birth_early_life_feeding'].apply(categorize_feeding_method)

# Check the distribution of the new categories
print(df_encoded['feeding_category'].value_counts())


feeding_category
Exclusive Breastfeeding      110
Combination Feeding           38
Missing Data                  24
Special Circumstances         13
Exclusive Formula Feeding      7
Other                          3
Name: count, dtype: int64


In [30]:
## Is Anys User

df_encoded['anya_user'] = df_encoded['demo_anya_app_usage'].apply(lambda x: 1 if x == 'Yes' else 0)


df_encoded['anya_user'].value_counts()


anya_user
1    116
0     79
Name: count, dtype: int64

In [31]:
df_encoded.to_excel(r'C:\Users\Shardul\OneDrive - London Business School\coding\anya\data\ppie-cleaned-v3.xlsx', engine='openpyxl')