# Cardiometabolic Correlate and Maternal Health Group 4

### Omar Barabandi, Ayham Elayan, Zahra Izzi, Yara Yaghi

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

### 1. Read the Data

In [11]:
dataset = pd.read_csv('synth_New_Diagnosis.csv')

In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1227375 entries, 0 to 1227374
Data columns (total 10 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   DIAG_ID        1227375 non-null  int64  
 1   PERSON_ID      1227375 non-null  int64  
 2   VISIT_ID       1227375 non-null  int64  
 3   RECORDED_DATE  1227375 non-null  int64  
 4   RECORDED_TIME  1227375 non-null  object 
 5   DIAG_DATE      1227375 non-null  float64
 6   DIAG_TIME      1227375 non-null  object 
 7   SUBCATEGORY    1227375 non-null  object 
 8   EVENT_DESC     1227375 non-null  object 
 9   VALUE          1227375 non-null  object 
dtypes: float64(1), int64(4), object(5)
memory usage: 93.6+ MB


In [13]:
dataset.head()

Unnamed: 0,DIAG_ID,PERSON_ID,VISIT_ID,RECORDED_DATE,RECORDED_TIME,DIAG_DATE,DIAG_TIME,SUBCATEGORY,EVENT_DESC,VALUE
0,0,5887,120199,3,07:43:51.0000000,19.0,00:33:00.0000000,Final,"Proteinuria, unspecified",R80.9
1,1,20661,187867,81,15:43:42.0000000,93.0,08:27:00.0000000,Final,"Acute upper respiratory infection, unspecified",J06.9
2,2,17856,204978,-7,11:38:49.0000000,-2.0,19:09:00.0000000,Admitting Diagnosis,Encounter for antenatal screening for malforma...,Z3A.16
3,3,3730,80795,130,13:34:09.0000000,117.0,13:27:00.0000000,Final,20 weeks gestation of pregnancy,J06.9
4,4,9908,167881,86,15:16:45.0000000,150.0,10:26:00.0000000,Discharge Diagnosis,20 weeks gestation of pregnancy,Z3A.20


In [14]:
dataset.tail()

Unnamed: 0,DIAG_ID,PERSON_ID,VISIT_ID,RECORDED_DATE,RECORDED_TIME,DIAG_DATE,DIAG_TIME,SUBCATEGORY,EVENT_DESC,VALUE
1227370,1227370,8169,143372,114,14:09:23.0000000,85.0,11:43:00.0000000,Final,Smoking (tobacco) complicating childbirth,O99.334
1227371,1227371,4024,29049,332,10:04:54.0000000,367.0,13:58:00.0000000,Final,26 weeks gestation of pregnancy,Z3A.26
1227372,1227372,5468,203118,108,07:15:41.0000000,108.0,10:00:00.0000000,Final,Nausea,R11.0
1227373,1227373,27730,205548,37,07:07:59.0000000,38.0,13:00:00.0000000,Final,"Supervision of high risk pregnancy, unspecifie...",Z3A.33
1227374,1227374,4845,71824,1407,15:21:03.0000000,125.0,08:56:00.0000000,Final,Paresthesia of skin,R20.2


In [17]:
dataset.rename(columns={
    'PERSON_ID': 'PersonID',
    'RECORDED_DATE': 'RecordedDate',
    'RECORDED_TIME': 'RecordedTime',
    'DIAG_DATE': 'DiagDate',
    'DIAG_TIME': 'DiagTime',
    'SUBCATEGORY': 'Subcategory',
    'EVENT_DESC': 'EventDesc',
    'VALUE': 'DiagCode'
}, inplace=True)

In [18]:
dataset.head() #Testing Renaming

Unnamed: 0,DIAG_ID,PersonID,VISIT_ID,RecordedDate,RecordedTime,DiagDate,DiagTime,Subcategory,EventDesc,DiagCode
0,0,5887,120199,3,07:43:51.0000000,19.0,00:33:00.0000000,Final,"Proteinuria, unspecified",R80.9
1,1,20661,187867,81,15:43:42.0000000,93.0,08:27:00.0000000,Final,"Acute upper respiratory infection, unspecified",J06.9
2,2,17856,204978,-7,11:38:49.0000000,-2.0,19:09:00.0000000,Admitting Diagnosis,Encounter for antenatal screening for malforma...,Z3A.16
3,3,3730,80795,130,13:34:09.0000000,117.0,13:27:00.0000000,Final,20 weeks gestation of pregnancy,J06.9
4,4,9908,167881,86,15:16:45.0000000,150.0,10:26:00.0000000,Discharge Diagnosis,20 weeks gestation of pregnancy,Z3A.20


In [19]:
# Filter the data to include only final diagnoses
final_diagnoses = dataset[dataset['Subcategory'] == 'Final'] 

In [23]:
final_diagnoses.shape

(739873, 10)

In [24]:
# Calculate and display the value counts of the diagnosis codes
diag_code_counts = final_diagnoses['DiagCode'].value_counts() 
diag_code_counts

Z37.0      11341
Z34.90      9348
O99.89      9335
E66.01      8691
O09.893     8508
           ...  
N87.1          1
O09.70         1
M54.32         1
Z04.8          1
Z56.5          1
Name: DiagCode, Length: 3055, dtype: int64

In [25]:
# Check the distribution of a specific diagnosis code (e.g., 'J06.9')
specific_diag = final_diagnoses[final_diagnoses['DiagCode'] == 'J06.9']
specific_diag.shape

(2265, 10)

In [26]:
# Further filtering or analysis as required
# For example, filtering based on specific dates or times
date_filtered = final_diagnoses[final_diagnoses['DiagDate'] > 50]
date_filtered.shape

(487376, 10)

In [28]:
# Example of merging with another hypothetical dataset
# Create a sample demographics dataset
demographics_data = {
    'PersonID': [5887, 20661, 17856, 3730, 9908],
    'Age': [34, 45, 29, 50, 38],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Female'],
    'ZIPCode': ['10001', '10002', '10003', '10004', '10005']
}
demographics_df = pd.DataFrame(demographics_data)

In [29]:
# Merge the final diagnoses data with the demographics data on PersonID
merged_df = pd.merge(final_diagnoses, demographics_df, on='PersonID', how='inner')
print(merged_df.head())

   DIAG_ID  PersonID  VISIT_ID  RecordedDate      RecordedTime  DiagDate  \
0        0      5887    120199             3  07:43:51.0000000      19.0   
1     2026      5887    200185           132  00:13:07.0000000     149.0   
2     7704      5887     46109             3  10:56:21.0000000      -1.0   
3    34726      5887    194749           199  12:43:13.0000000     175.0   
4    79689      5887     92771            23  07:26:44.0000000       8.0   

           DiagTime Subcategory  \
0  00:33:00.0000000       Final   
1  10:02:00.0000000       Final   
2  08:24:00.0000000       Final   
3  14:52:00.0000000       Final   
4  09:14:00.0000000       Final   

                                           EventDesc DiagCode  Age Gender  \
0                           Proteinuria, unspecified    R80.9   34   Male   
1                                Paresthesia of skin    R20.2   34   Male   
2  Person with feared health complaint in whom no...    Z71.1   34   Male   
3  Polyhydramnios, third

In [30]:
# Further analysis: Count of diagnoses by age group
age_group_counts = merged_df.groupby('Age')['DiagCode'].count()
print("Diagnosis counts by age group:\n", age_group_counts)

Diagnosis counts by age group:
 Age
29     7
34    41
38    25
45    18
50    56
Name: DiagCode, dtype: int64


In [31]:
# Analysis: Distribution of diagnoses by gender
gender_diag_counts = merged_df.groupby('Gender')['DiagCode'].count()
print("Diagnosis counts by gender:\n", gender_diag_counts)

Diagnosis counts by gender:
 Gender
Female    99
Male      48
Name: DiagCode, dtype: int64


In [34]:
# Example of using groupby to get the average diagnosis date by Subcategory
average_diag_date = dataset.groupby('Subcategory')['DiagDate'].mean()
print("Average diagnosis date by subcategory:\n", average_diag_date)

Average diagnosis date by subcategory:
 Subcategory
Admitting Diagnosis    205.767407
Billing Diagnosis      165.012821
Discharge Diagnosis    250.619321
Final                  220.762934
Other Diagnosis        108.383380
Post-Op Diagnosis      189.000000
Pre-Op Diagnosis       439.300000
Principal Diagnosis    379.285714
Reason For Visit       198.605250
Referring Diagnosis     76.137931
Unknown                210.392597
Working Diagnosis      130.510204
Name: DiagDate, dtype: float64


In [35]:
# Example of using groupby to get the count of diagnoses by PersonID
person_diag_counts = dataset.groupby('PersonID')['DiagCode'].count()
print("Diagnosis counts by person:\n", person_diag_counts)

Diagnosis counts by person:
 PersonID
1        2946
2           7
3          23
4          12
5          12
         ... 
32909       6
32910       7
32911       5
32912       9
32913     833
Name: DiagCode, Length: 32913, dtype: int64


In [36]:
# Display top 10 persons with most diagnoses
top_10_persons = person_diag_counts.sort_values(ascending=False).head(10)
print("Top 10 persons with most diagnoses:\n", top_10_persons)

Top 10 persons with most diagnoses:
 PersonID
1        2946
32913     833
7540      112
7993      111
8041      109
4684      106
7901      106
4072      106
8013      106
8419      105
Name: DiagCode, dtype: int64
