### <span style="font-size:22px;">Synthea Health: Data Analysis and Insights Generation</span>


### 1. Initialisation

In [None]:
!pip install -r ./requirements.txt

### 2. Data Wrangling & Cleaning

#### 1. Import the required functions for reading CSV files, Cleaning data and transformations

In [None]:
import json
import os
from pathlib import Path

import pandas as pd

from utils.utils import (
    read_data,
    clean_patient_data,
    transform_patient_data,
    clean_transform_condition_data,
    delete_from_data,
    clean_transform_procedure_data,
    clean_transform_medication_data,
    clean_transform_encounters_data,
)

In [None]:
# Locate the config file that records the input folder holding all source CSV files.
# The path is relative to the notebook's working directory so the notebook runs on any
# machine; set the SYNTHEA_CONFIG_PATH environment variable to point somewhere else.
# NOTE(review): assumes the notebook is started from the repository root (the folder
# that contains Config/) — confirm.
config_file_path = Path(os.environ.get("SYNTHEA_CONFIG_PATH", Path("Config") / "config.json"))

# Load the JSON file
with open(config_file_path, 'r', encoding='utf-8') as file:
    config = json.load(file)

# Access the value of 'input_folder'; fail here with a clear message rather than
# letting a None folder reach the CSV readers in the next cell.
input_folder = config.get('input_folder')
if not input_folder:
    raise KeyError(f"'input_folder' is missing or empty in {config_file_path}")
print("Input Folder:", input_folder)

In [None]:
# Load every source file through read_data and preview the first rows of each one.
# Maps the file name passed to read_data -> the label used in the failure message.
source_labels = {
    "patients": "Patients",
    "conditions": "Conditions",
    "encounters": "Encounters",
    "medications": "medications",
    "procedures": "procedures",
    "organizations": "organizations",
}

# Read everything first so the previews below appear in one uninterrupted run.
loaded_frames = {source: read_data(input_folder, source) for source in source_labels}

df_patients = loaded_frames["patients"]
df_conditions = loaded_frames["conditions"]
df_encounters = loaded_frames["encounters"]
df_medications = loaded_frames["medications"]
df_procedures = loaded_frames["procedures"]
df_organizations = loaded_frames["organizations"]

# Check the result of each load (show the first few rows); a None result is
# reported as a failed load.
for source, label in source_labels.items():
    frame = loaded_frames[source]
    if frame is not None:
        print(frame.head())
    else:
        print(f"Failed to load the {label} CSV file.")



In [None]:
# Loaded DataFrames keyed by the name used in the report lines below
dataframes = {
    "Patients": df_patients,
    "conditions": df_conditions ,
    "encounters": df_encounters,
    "Medications":df_medications,
    "Procedures": df_procedures,
    "organizations" :df_organizations
}

# Loop through all DataFrames and report the number of fully duplicated rows
for name, df in dataframes.items():
    # The loading cell lets a failed read leave None behind; skip it instead of
    # crashing on None.duplicated().
    if df is None:
        print(f"Skipping {name} file: it was not loaded")
        continue

    num_duplicates = df.duplicated().sum()

    if num_duplicates > 0:
        print(f"Number of duplicate rows in {name} file: {num_duplicates}")
    else:
        print(f"No duplicate values found in {name} file")


#### Cleaning and transforming the Patients file

In [None]:
# Use value_counts() to inspect the data and spot any patterns.
# A cell only renders its last expression, so each result is passed to display()
# explicitly; otherwise the ADDRESS counts would be computed and thrown away.
for column in ('ADDRESS', 'FIRST'):
    display(df_patients[column].value_counts())


Inspecting the value counts of each column did not reveal any significant data issues, invalid categorical values or inconsistent data.

In [None]:
# Count the null entries in every column of the patients frame and print the totals.
missing_values = df_patients.isna().sum()
print("Missing values in the DataFrame:", missing_values, sep="\n")

Findings from inspecting the missing values in the patients file:
1) The death code can be blank if the patient is alive.
2) The Drivers, Passport, Prefix, Suffix and Maiden columns contain nulls, but these columns are not significant at this stage.

In [None]:
# Clean the patient data: df_patients is replaced by the result of clean_patient_data,
# so re-running this cell feeds already-cleaned data back into the helper.
df_patients = clean_patient_data(df_patients)
# Preview the first rows of the cleaned frame.
df_patients.head()

Check for any duplicate column values.
Rename the key column to Patientid - this name has to be kept the same across all dataframes.
Retrieved the required columns and renamed them to make more sense.

In [None]:
print("\nData types of each column:")
df_patients.dtypes
# Observation: all the columns are of object type. They can be cast to the desired
# type, e.g. BIRTHDATE to a date and GENDER, RACE, ETHNICITY ... to string.

In [None]:
# Replace df_patients with the output of transform_patient_data, then list the
# resulting column dtypes to confirm the casts.
df_patients = transform_patient_data(df_patients)
df_patients.dtypes

Transformations have been applied step by step as below.
1) Transform the birth date to the required format to calculate the patient's age.
2) Type casting to convert object types to strings, i.e. Gender, Race and Ethnicity.
3) Define age bins and labels.

In [None]:
# Write the cleaned patients frame to the output folder (without the index column).
# The folder is created first: to_csv raises when the target directory does not exist.
Path("./output").mkdir(parents=True, exist_ok=True)
df_patients.to_csv("./output/patients_cleaned.csv",index=False)

Export the cleaned file to output folder

#### Cleaning and transforming the Conditions file

In [None]:
# Count the null entries in every column of the conditions frame and print the totals.
missing_values = df_conditions.isna().sum()
print("Missing values in the DataFrame:", missing_values, sep="\n")

Inspecting the missing values: STOP can be null because a patient can be diagnosed with a lifelong condition, for example lifestyle diseases like hypertension.

In [None]:
print("\nData types of each column:")
df_conditions.dtypes
# Observation: all the columns are of object type. They can be cast to the desired
# type, e.g. the date columns to a date type.

In [None]:
# A few DESCRIPTION values do not look like disease names (they are social or
# administrative findings), so they are removed from the conditions data.
# Each value is listed exactly once. "Has a criminal record (finding)" was named in
# the original notes as a value to remove but was missing from the list, while two
# other values appeared twice.
list_invalid_values = [
    "Received certificate of high school equivalency (finding)",
    "Full-time employment (finding)",
    "Part-time employment (finding)",
    "Received higher education (finding)",
    "Reports of violence in the environment (finding)",
    "Has a criminal record (finding)",
]
# presumably drops the rows whose DESCRIPTION is one of the listed values — verify
# against utils.delete_from_data
df_conditions = delete_from_data(df_conditions,"DESCRIPTION",list_invalid_values)

# Show the remaining distinct descriptions to confirm the values above are gone.
df_conditions["DESCRIPTION"].unique()

In [None]:
# Replace df_conditions with the output of clean_transform_condition_data and
# preview the first rows of the result.
df_conditions = clean_transform_condition_data(df_conditions)
df_conditions.head()

Applied the following transformations:
- Renamed the key columns
- Converted Start and Stop to the required datetime column types
- Renamed the remaining columns

In [None]:
# Write the cleaned conditions frame to the output folder (without the index column).
# The folder is created first: to_csv raises when the target directory does not exist.
Path("./output").mkdir(parents=True, exist_ok=True)
df_conditions.to_csv("./output/conditions_cleaned.csv",index=False)

##Export the cleaned data to output folder

In [None]:
###### Working on the Medications data

In [None]:
# List the dtype of every column in the raw medications frame.
print("\nData types of each column:")
df_medications.dtypes

In [None]:
# Count the null entries in every column of the medications frame and print the totals.
missing_values = df_medications.isna().sum()
print("Missing values in the DataFrame:", missing_values, sep="\n")

No data in the Stop column indicates that the medication can be lifelong, for example for hypertension. ReasonCode and ReasonDescription are not relevant fields at this stage of the analysis.

In [None]:
# List the distinct medication descriptions to look for invalid or inconsistent values.
df_medications["DESCRIPTION"].unique()

#Medication description data is clean; no issues were identified

In [None]:
# Replace df_medications with the output of clean_transform_medication_data and
# preview the first rows of the result.
# NOTE(review): unlike the patients, conditions, procedures and encounters frames,
# no visible cell writes this frame to ./output — confirm whether an export is missing.
df_medications = clean_transform_medication_data(df_medications)
df_medications.head()

The transformations below have been applied to the Medications data:
1) Rename the key ID to patient ID
2) Convert Start and Stop to datetime
3) Retrieve only the relevant columns and rename them

#### Clean and transform Procedures data

In [None]:
# List the dtype of every column in the raw procedures frame.
print("\nData types of each column:")
df_procedures.dtypes

In [None]:
# Count the null entries in every column of the procedures frame and print the totals.
missing_values = df_procedures.isna().sum()
print("Missing values in the DataFrame:", missing_values, sep="\n")

In [None]:
# Replace df_procedures with the output of clean_transform_procedure_data.
df_procedures = clean_transform_procedure_data(df_procedures)
# Preview only the first rows, like the other cleaning cells, instead of rendering
# the whole frame.
df_procedures.head()

The transformations below have been applied to the Procedures data:

1) Rename the key ID to patient ID
2) Convert Start and Stop to datetime
3) Retrieve only the relevant columns and rename them

In [None]:
# Write the cleaned procedures frame to the output folder (without the index column).
# The folder is created first: to_csv raises when the target directory does not exist.
Path("./output").mkdir(parents=True, exist_ok=True)
df_procedures.to_csv("./output/procedures_cleaned.csv",index=False)

Working on the Encounters data and applying similar steps:
1) Inspecting the value counts and checking for missing values
2) Type casting from object type to string
3) Converting Start and Stop to datetime
4) Writing the cleaned CSV to the output folder

In [None]:
# List the dtype of every column in the raw encounters frame.
print("\nData types of each column:")
df_encounters.dtypes

In [None]:
# Count the null entries in every column of the encounters frame and print the totals.
missing_values = df_encounters.isna().sum()
print("Missing values in the DataFrame:", missing_values, sep="\n")

In [None]:
# Count the null entries in every column of the organizations frame and print the totals.
missing_values = df_organizations.isna().sum()
print("Missing values in the DataFrame:", missing_values, sep="\n")

In [None]:
# Replace df_encounters with the output of clean_transform_encounters_data.
df_encounters = clean_transform_encounters_data(df_encounters)
# Preview only the first rows, like the other cleaning cells, instead of rendering
# the whole frame.
df_encounters.head()

In [None]:
# Write the cleaned encounters frame to the output folder (without the index column).
# The folder is created first: to_csv raises when the target directory does not exist.
Path("./output").mkdir(parents=True, exist_ok=True)
df_encounters.to_csv("./output/encounters_cleaned.csv",index=False)

#Read the hospital (organizations) data, which is expected to be useful for later analysis

In [None]:
# Keep only the identifying and address columns of the organizations data and give
# them explicit dtypes.
string_cols = ['Id','NAME','ADDRESS','CITY','STATE']

# NOTE(review): ZIP is kept as float64 as in the original cell; a float cannot keep
# leading zeros — confirm that this is acceptable for the analysis.
organization_dtypes = {**{col: 'string' for col in string_cols}, 'ZIP': 'float64'}

# astype returns a new frame, so df_organizationdata is independent of
# df_organizations and no column is assigned on a slice (which triggered
# SettingWithCopyWarning in the column-by-column version).
df_organizationdata = df_organizations[[*string_cols, 'ZIP']].astype(organization_dtypes)



In [None]:
# Transformations: attach every recorded condition to its patient. The left join
# keeps patients that have no condition rows.
# NOTE(review): the original cell merged df_patientsdata and df_conditionsdata, which
# no visible cell defines; the cleaned df_patients / df_conditions frames are assumed
# to be the intended inputs and to carry the columns selected below — confirm.
df_daignosisdatamerge = pd.merge(df_patients, df_conditions, on='PatientId', how='left')

# Select specific columns. copy() makes the selection an independent frame so later
# cells can add columns to it without SettingWithCopyWarning.
df_diagnosisdata = df_daignosisdatamerge[['BIRTHDATE','START', 'STOP', 'PatientId', 'ENCOUNTER', 'DiagnosisCode',
       'DESCRIPTION']].copy()

# Later cells refer to this frame under the misspelled name df_daignosisdata, which
# was never assigned. Bind both names to the same object so those cells run and the
# columns they add are visible through either name.
df_daignosisdata = df_diagnosisdata


In [None]:
# Show the column labels of the diagnosis frame.
df_diagnosisdata.columns


In [None]:
# Age at diagnosis in whole years: the days elapsed between BIRTHDATE and the
# condition START, floor-divided by 365 (leap days are not accounted for).
days_to_diagnosis = (df_daignosisdata['START'] - df_daignosisdata['BIRTHDATE']).dt.days
df_daignosisdata['AGE_AT_DIAGNOSIS'] = days_to_diagnosis // 365

In [None]:
pip install matplotlib

In [None]:

import matplotlib.pyplot as plt

# Two-column frame kept under its original name: the next cell counts rows per
# patient from it.
df_daignosisdatahist = df_daignosisdata[['AGE_AT_DIAGNOSIS' ,'PatientId']]

# Histogram of the age column only. The original passed the whole frame, including
# the PatientId identifiers, to plt.hist and used a Series as the x-axis label.
# Rows without an age (no START date) are dropped before plotting.
fig, ax = plt.subplots()
ax.hist(df_daignosisdatahist['AGE_AT_DIAGNOSIS'].dropna(), bins=10, edgecolor='black')

# Add labels and title
ax.set_xlabel('Age at diagnosis (years)')
ax.set_ylabel('Number of diagnoses')
ax.set_title('Distribution of age at diagnosis')

# Show the plot
plt.show()


In [None]:
# Count how many rows of the diagnosis frame belong to each PatientId.
df_daignosisdatahist['PatientId'].value_counts()

In [None]:
# Define age groups: five-year bands from 0 up to 90, plus one open-ended band for 90+.
band_edges = list(range(0, 95, 5))

# Bin edges and one label per band ('0-5', '5-10', ..., '85-90', '90+')
bins = band_edges + [float('inf')]
labels = [f"{low}-{high}" for low, high in zip(band_edges[:-1], band_edges[1:])]
labels.append('90+')

# Categorize the ages. right=False makes every band include its lower edge and
# exclude its upper edge, so an age of exactly 5 is labelled '5-10'.
df_daignosisdata['Age Group'] = pd.cut(
    df_daignosisdata['AGE_AT_DIAGNOSIS'],
    bins=bins,
    labels=labels,
    right=False,
)




In [None]:
# Preview the first rows of the diagnosis frame, including the columns added above.
df_daignosisdata.head()

In [None]:
# Length of stay: preview the encounters data that the stay calculation is built from.
# NOTE(review): the original referenced df_encounterdata, which no visible cell
# defines; the cleaned df_encounters frame is assumed to be the intended one — confirm.
df_encounters.head()

In [None]:
# Transformations: attach every encounter to its patient. The left join keeps
# patients that have no encounter rows.
# NOTE(review): the original cell merged df_patientsdata and df_encounterdata, which
# no visible cell defines; the cleaned df_patients / df_encounters frames are assumed
# to be the intended inputs and to carry the columns selected below — confirm.
df_encounterdatamerge = pd.merge(df_patients, df_encounters, on='PatientId', how='left')

# Select specific columns and rename Id to EncounterID. rename() returns a new frame,
# which avoids the SettingWithCopyWarning that an in-place rename on a column
# selection triggers.
df_hospitalvisitdata = (
    df_encounterdatamerge[['BIRTHDATE','Id', 'START', 'STOP', 'PatientId', 'ORGANIZATION', 'ENCOUNTERCLASS',
       'CODE', 'DESCRIPTION']]
    .rename(columns={'Id': 'EncounterID'})
)


In [None]:
# List the dtype of every column in the hospital-visit frame.
df_hospitalvisitdata.dtypes

In [None]:
# Drop the timezone from the visit start (presumably so it can be subtracted from
# BIRTHDATE — confirm BIRTHDATE is timezone-naive) and store it back.
visit_start = df_hospitalvisitdata['START'].dt.tz_localize(None)
df_hospitalvisitdata['START'] = visit_start

# Age at the visit in whole years: elapsed days floor-divided by 365
# (leap days are not accounted for).
days_since_birth = (visit_start - df_hospitalvisitdata['BIRTHDATE']).dt.days
df_hospitalvisitdata['AGE_AT_HospitalVisit'] = days_since_birth // 365

In [None]:
# Drop the timezone from the visit end and store it back.
visit_stop = df_hospitalvisitdata['STOP'].dt.tz_localize(None)
df_hospitalvisitdata['STOP'] = visit_stop

# Length of stay in whole days between START and STOP; .dt.days keeps only the
# day component of the difference.
stay_duration = visit_stop - df_hospitalvisitdata['START']
df_hospitalvisitdata['length of Stay'] = stay_duration.dt.days


In [None]:
# Preview the encounters data.
# NOTE(review): the original referenced df_encounterdata, which no visible cell
# defines; the cleaned df_encounters frame is assumed to be the intended one — confirm.
df_encounters.head()

In [None]:
/** NUmber of visits per patient
df_encounter