# Wellmum Clinic Data Analysis
*Lizemarie Wium*

## Introduction
The Wellmum Clinic was established to identify XXX. The clinic provides follow-up care for patients who have given birth in the past XX months.

This document aims to answer the following questions:

1. Descriptive statistics
   1. Population description: Age, race, parity
   2. Describe Creatinine > 80
   3. Describe eGFR <90
   4. Describe ALT >40
   5. Describe Hb <120
   6. Describe ACR >3
   7. Describe HbA1C <39, 39-47, >=48
2. Correlations
   1. Correlation between MUAC and current BMI and waist circumference
   2. Correlation between random glucose and HbA1C
3. Relationships
   1. Relationship between eGFR >90 and ACR >3
   2. Relationship between ALT >40 and HbA1C >39
   3. Relationship between eGFR <90 and HbA1C >39


## Prepare data

In [14]:
# Import libraries
# General 
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.express as px

# Load the raw clinic export into a DataFrame
df = pd.read_excel("WELLMUM_20230609.xlsx")

# Rename the verbose spreadsheet headers to short identifiers.
# NOTE: the trailing spaces in some keys ('ID ', 'Mid upper arm circumstance ')
# are matched literally against the spreadsheet headers - do not tidy them here.
df = df.rename(columns={'Gravidity (how many pregnancies have you had?)':'Gravidity',
                        'Parity (number of live births)': 'Parity',
                        'random glucose': 'RandomGlucose',
                        'AGE': 'Age',
                        'Weight (kg)': 'Weight',
                        'Mid upper arm circumstance ': 'MUAC',
                        'ID ': 'Id',
                        'Current BMI': 'BMI',
                        'GDM control': 'GDMControl',
                        'Maternal date of birth': 'MDOB',
                        'Problems during pregnancy': 'Problems'})

# Select the subset of columns used in this analysis.
# The explicit .copy() makes `df` an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
cols = ['Id', 'Age', 'Gravidity', 'Parity', 'BMI', 'Weight', 'Creatinine', 
        'HbA1C', 'ACR', 'GDMControl', 'Race', 'RandomGlucose', 'eGFR', 'ALT',
        'MUAC', 'Hb', 'MDOB']
df = df[cols].copy()

# Clean Race: normalise case and surrounding whitespace, then fold the
# Asian sub-groups into the single category 'ASIAN'
df['Race'] = df['Race'].str.upper().str.strip()
df.loc[df['Race'].isin(['EAST ASIAN', 'SOUTH ASIAN']), 'Race'] = 'ASIAN'

# Round BMI to two decimals and make sure HbA1C is numeric
df['BMI'] = df['BMI'].round(2)
df['HbA1C'] = df['HbA1C'].astype(float)

# Preview the first rows of the cleaned data
print(df.head(5))



      Id  Age Gravidity  Parity    BMI  Weight  Creatinine  HbA1C   ACR  \
0  WM001   34         2       1  24.85    59.7        69.0   37.0   7.4   
1  WM002   44         6       1  22.31    66.0        66.0   39.0   9.3   
2  WM003   30         1       1  22.73    58.2        63.0   40.0  91.3   
3  WM004   29         1       1  33.52   106.8        47.0   35.0   NaN   
4  WM005   36         3       3  28.04    76.8        59.0   37.0   6.6   

  GDMControl   Race  RandomGlucose        eGFR  ALT  MUAC     Hb  \
0  Metformin  WHITE            3.5  132.714169  NaN  30.0  132.0   
1    Insulin  MIXED            4.6  118.080000  NaN  27.0   70.0   
2  Metformin  BLACK            4.4  124.991429  NaN  28.0  139.0   
3  Metformin  WHITE            6.1  310.242638  NaN  29.0  124.0   
4  Metformin  WHITE            4.8  166.512814  NaN  32.0  141.0   

                  MDOB  
0  1988-08-10 00:00:00  
1  1978-05-30 00:00:00  
2  1992-05-18 00:00:00  
3  1993-06-23 00:00:00  
4  1986-06-12 0

## Descriptive statistics

### Population age

In [91]:
# Age bands in display order. The lower bound of '35-40' is inclusive so that
# a patient aged exactly 35 is categorised; previously 35 matched neither
# '< 35' nor '> 35' and was silently dropped from the chart.
age_order = ['< 35', '35-40', '40-45', '> 45']

df.loc[df['Age'] < 35, 'AgeCat'] = '< 35'
df.loc[(df['Age'] >= 35) & (df['Age'] <= 40), 'AgeCat'] = '35-40'
df.loc[(df['Age'] > 40) & (df['Age'] <= 45), 'AgeCat'] = '40-45'
df.loc[df['Age'] > 45, 'AgeCat'] = '> 45'

# Count patients per (age band, race); rows with a missing age or race are
# left out by groupby
age = df.groupby(['AgeCat', 'Race']).size().reset_index(name='Counts')

# Numeric sort key so the bands are ordered by age rather than alphabetically
age['Sort'] = age['AgeCat'].map({label: 10 * (pos + 1)
                                 for pos, label in enumerate(age_order)})
age = age.sort_values(by=['Sort'])

# Percentages are relative to the whole cohort, not to each band
age['Percentage'] = (age['Counts'] / len(df) * 100).round(2)

fig = go.Figure()

# One bar trace per race; the traces are stacked within each age band
for race in df.Race.unique():
    aget = age[age['Race'] == race]
    fig.add_trace(go.Bar(x=aget['AgeCat'], 
                         y=aget['Percentage'], 
                         text=aget['Percentage'],
                         name=race))

fig.update_layout(
    title=go.layout.Title(
        text=f"Age distribution<br><sup><i>(n={len(df)})</i></sup>",
        xref="paper",
        x=0
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text="Age<br><sup><i>years</i></sup>"
            ),
        # Pin the band order explicitly; otherwise it follows whichever
        # categories the first trace happens to contain
        categoryorder='array',
        categoryarray=age_order
        ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text="Percentage<br><sup><i>%</i></sup>"
            )
        ),
    barmode='stack'
    )
fig.show()

### Population race

In [101]:
# Head-count per race, smallest group first
race = (
    df.groupby('Race')
      .size()
      .reset_index(name='Counts')
      .sort_values(by='Counts')
)

# Single bar series with the count printed on each bar
fig = go.Figure(data=[go.Bar(x=race['Race'], y=race['Counts'], text=race['Counts'])])
fig.update_layout(
    title={
        'text': f"Race distribution<br><sup><i>(n={len(df)})</i></sup>",
        'xref': "paper",
        'x': 0,
    },
    xaxis={'title': {'text': "Race"}},
    yaxis={'title': {'text': "Count"}},
)
fig.show()

### Population parity

In [136]:
# Number of patients per parity value (rows with a missing parity are left
# out by groupby)
par = df.groupby(['Parity']).size().reset_index(name='Counts')

# Pie chart: each slice is one parity value, labelled with its share
fig = go.Figure([go.Pie(labels=par['Parity'], values=par['Counts'])])
fig.update_traces(hoverinfo='label+percent', textinfo='label+percent')

# Title in the same format as the other figures. n is the number of patients
# actually counted in the pie, so the slice percentages refer to it.
fig.update_layout(
    title=go.layout.Title(
        text=f"Parity distribution<br><sup><i>(n={par['Counts'].sum()})</i></sup>",
        xref="paper",
        x=0
    )
)
fig.show()

### Creatinine > 80

In [144]:

# Flag patients whose creatinine exceeds 80 (1 = above, 0 = otherwise).
# A missing creatinine compares False and is therefore flagged 0.
df["Creat80"] = (df['Creatinine'] > 80).astype(int)
sumc = df["Creat80"].sum()
print(f"Creatinine > 80: {round(sumc / len(df), 3)}")

# Variables shown in the scatter-plot matrix, in display order
splom_cols = ['RandomGlucose', 'HbA1C', 'eGFR', 'Hb', 'ALT',
              'Creatinine', 'ACR', 'Age']
# Legend entry for each flag value
splom_names = {0: 'Creatinine <= 80', 1: 'Creatinine >  80'}

fig = go.Figure()

# One SPLOM trace per flag value so the two groups get separate legend entries
for tra, name in splom_names.items():
    dft = df[df['Creat80'] == tra]
    fig.add_trace(go.Splom(
        dimensions=[dict(label=col, values=dft[col]) for col in splom_cols],
        diagonal_visible=False,  # no plots on the diagonal
        text=dft['Id'],          # hover text: patient id
        name=name,
        marker={
            'color': tra,
            'showscale': False,
            'line': {'color': 'white', 'width': 0.5},
        },
    ))

fig.update_layout(
    title=f"Creatinine > 80<br><sup><i>(n={sumc}, {round(sumc / len(df) * 100, 2)}%)</i></sup>",
    showlegend=True,
    width=1000,
    height=1000,
)

fig.show()


Creatinine > 80: 0.117


#### Creatinine by race

In [145]:

fig = go.Figure()

# One notched box per race, in the order the races first appear in the data;
# the group size is shown under each race label
for race in df.Race.unique():
    dft = df.loc[df['Race'] == race]
    name = f"{race}<br><sup><i>(n={len(dft)})</i></sup>"
    box = go.Box(y=dft['Creatinine'], name=name, notched=True)
    fig.add_trace(box)

# The x-axis labels already identify each box, so the legend is hidden
fig.update_layout(
    showlegend=False,
    title=f"Creatinine spread by race<br><sup><i>(n={len(df)})</i></sup>",
)

fig.show()

### eGFR < 90

In [146]:
# Flag patients whose eGFR is below 90 (1 = below, 0 = otherwise).
# A missing eGFR compares False and is therefore flagged 0.
df["eGFR90"] = (df['eGFR'] < 90).astype(int)
sumc = df["eGFR90"].sum()
print(f"eGFR < 90: {round(sumc / len(df), 3)}")


def _egfr_dimensions(frame):
    """Build the SPLOM dimension list (label + values) for one subset."""
    shown = ('RandomGlucose', 'HbA1C', 'eGFR', 'Age',
             'Hb', 'ALT', 'Creatinine', 'ACR')
    return [dict(label=col, values=frame[col]) for col in shown]


fig = go.Figure()

# One SPLOM trace per flag value so the two groups get separate legend entries
for tra in (0, 1):
    dft = df[df['eGFR90'] == tra]
    name = 'eGFR < 90' if tra else 'eGFR >= 90'

    fig.add_trace(go.Splom(
        dimensions=_egfr_dimensions(dft),
        diagonal_visible=False,  # no plots on the diagonal
        text=dft['Id'],          # hover text: patient id
        name=name,
        marker={
            'color': tra,
            'showscale': False,
            'line': {'color': 'white', 'width': 0.5},
        },
    ))

fig.update_layout(
    title=f"eGFR < 90<br><sup><i>(n={sumc}, {round(sumc / len(df) * 100, 2)}%)</i></sup>",
    showlegend=True,
    width=1000,
    height=1000,
)

fig.show()

eGFR < 90: 0.027


#### eGFR by race

In [147]:
fig = go.Figure()

# One notched box of eGFR values per race, labelled with the group size
for race in df.Race.unique():
    dft = df.loc[df['Race'] == race]
    name = f"{race}<br><sup><i>(n={len(dft)})</i></sup>"
    box = go.Box(y=dft['eGFR'], name=name, notched=True)
    fig.add_trace(box)

# The x-axis labels already identify each box, so the legend is hidden
fig.update_layout(
    showlegend=False,
    title=f"eGFR spread by race<br><sup><i>(n={len(df)})</i></sup>",
)

fig.show()

## Correlations


In [148]:
# Variables included in the correlation matrix (rows and columns, same order)
columns = ['HbA1C', 'BMI', 'MUAC', 'Creatinine', 'ALT', 'eGFR']

# Pairwise Pearson correlations, rounded to 4 decimals. DataFrame.corr drops
# missing values pair by pair, like the per-pair Series.corr calls it replaces.
# Plotly gets a plain list of rows.
values = df[columns].corr().round(4).values.tolist()

# Heatmap with the coefficient printed in every cell
fig = go.Figure(data=go.Heatmap(
                   z=values,
                   x=columns,
                   y=columns,
                   text=values,
                   texttemplate="%{text}",
                   hoverongaps = True, ))

fig.update_layout(
    title=f"Correlation matrix<br><sup><i>(n={len(df)})</i></sup>",
    showlegend=False,
)

fig.show()

### MUAC vs BMI by race

In [149]:
# Overall BMI/MUAC correlation across all patients (shown in the title)
corra = round(df['BMI'].corr(df['MUAC']), 4)

fig = go.Figure()

# One marker series per race, each labelled with its own within-race correlation
for race in df.Race.unique():
    dft = df.loc[df['Race'] == race]
    corr = round(dft['BMI'].corr(dft['MUAC']), 4)
    name = f"{race}<br><sup><i>(correlation={corr})</i></sup>"
    fig.add_trace(go.Scatter(x=dft['BMI'], y=dft['MUAC'], name=name, mode='markers'))

# Marker styling shared by every trace
fig.update_traces(mode='markers', marker={'size': 10, 'line': {'width': 1}})

fig.update_layout(
    title={
        'text': f"MUAC vs BMI<br><sup><i>(correlation={corra})</i></sup>",
        'xref': "paper",
        'x': 0,
    },
    xaxis={'title': {'text': "BMI<br><sup><i>Body Mass Index</i></sup>"}},
    yaxis={'title': {'text': "MUAC<br><sup><i>Mid Upper Arm Circumference</i></sup>"}},
)

fig.show()

### Random Glucose vs HbA1C

In [150]:
fig = go.Figure()

# Only HbA1C readings in [18, 50] are plotted (presumably to exclude
# outliers - confirm the intended range). The filter is applied once, so the
# headline correlation in the title covers exactly the points on the plot;
# it was previously computed on the unfiltered data.
dfp = df[df['HbA1C'].between(18, 50)]
corra = round(dfp['RandomGlucose'].corr(dfp['HbA1C']),4)

# One marker series per race, each labelled with its own within-race correlation
for race in df.Race.unique():
    dft = dfp[dfp['Race']==race]
    corr = round(dft['RandomGlucose'].corr(dft['HbA1C']),4)
    name = f"{race}<br><sup><i>(correlation={corr})</i></sup>"
    
    fig.add_trace(go.Scatter(
        x=dft['RandomGlucose'], y=dft['HbA1C'],
        name=name,
        mode='markers',
    ))

# Set options common to all traces with fig.update_traces
fig.update_traces(mode='markers', marker_line_width=1, marker_size=10)

fig.update_layout(
    title=go.layout.Title(
        text=f"HbA1C vs Random Glucose<br><sup><i>(correlation={corra})</i></sup>",
        xref="paper",
        x=0
    ),
        xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text="Random Glucose<br><sup><i> </i></sup>"
            )
        ),
        yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text="HbA1C<br><sup><i> </i></sup>"
            )
        )
    )


fig.show()