<a href="https://colab.research.google.com/github/vijayrgopu/mcg_pytest/blob/main/mcg_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using Google Colab to Run This

You can run the notebook from start to end as many times as you like, and it works every time.

Steps to Run are given below:


1.   Click on "Runtime" (next to Insert on the top left corner)
2.   Click on "Run All" (you can also use Ctrl + F9)





# Remove any previously run files and start with fresh download

In [None]:
# Remove leftovers from a previous run. `-f` keeps rm quiet when no matching
# files exist yet (e.g. the very first run on a fresh runtime).
!rm -f *.csv
!rm -f *.json*

# Download data files from dropbox

In [None]:
# Fetch the two client exports from Dropbox into the working directory.
# NOTE(review): the URLs are not quoted, so the shell presumably treats `&` as
# a control operator and `dl=0` never reaches wget -- this looks like why the
# saved file names stop at `?rlkey=...`. Quoting the URLs would change the
# names that the later `mv` step expects -- confirm before changing.
!wget https://www.dropbox.com/scl/fi/5vz0kvjh0czm2ofuqjbzk/client_1_patients.json?rlkey=p8o0i9t2ee7x1iifsorj1aqms&dl=0
!wget https://www.dropbox.com/scl/fi/jd39dqp9vh4slzb4i8ab3/client_2_episodes.json?rlkey=zcp1mfnjdx9p59lvny9825312&dl=0

# Check If the files have been Downloaded

In [None]:
!ls -ltr

# Rename the files to correct file names

In [None]:
# Drop the `?rlkey=...` suffix from the downloaded file names.
# The client 2 download (`client_2_episodes.json?...`) is stored as
# `client_2_patients.json`, which is the name parse_client_2 opens.
!mv 'client_1_patients.json?rlkey=p8o0i9t2ee7x1iifsorj1aqms' client_1_patients.json
!mv 'client_2_episodes.json?rlkey=zcp1mfnjdx9p59lvny9825312' client_2_patients.json

# Verify Filename changes

In [None]:
!ls -ltr

# Create a new function for reading client 2 files

Based on my understanding, multiple clients can send data in their own formats, and we convert it into our own specific format, either to be loaded into a database for feature engineering or for further downstream processing.

This is modularized by putting client-specific code into separate functions that can be called by a wrapper function, so that combining a new client's data into our required format only requires adding a function for that client.

In [None]:
import pandas as pd
import json

#Read Client_1 File Type
def parse_client_1(path='client_1_patients.json'):
    """Flatten a client 1 export into patient, episode and note tables.

    The file must contain a JSON list of patients. Each patient carries an
    ``episodes`` list, and each episode carries a ``notes`` list.

    Args:
        path: Location of the client 1 JSON file. Defaults to
            ``client_1_patients.json`` in the working directory.

    Returns:
        A ``(patients_df, episodes_df, notes_df)`` tuple of DataFrames. The
        column names are fixed up front, so every table has its full set of
        columns even when it has no rows.

    Raises:
        KeyError: If a patient, episode or note lacks a required field.
    """
    patient_fields = ['patient_id', 'name', 'dob', 'ssn', 'address']
    episode_fields = [
        'episode_id',
        'admit_date_time',
        'discharge_date_time',
        'admitting_diagnosis',
        'discharge_diagnosis',
    ]
    note_fields = ['note_id', 'episode_id', 'note_date_time', 'text']

    # Be explicit about the encoding instead of relying on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    patients = []
    episodes = []
    notes = []

    for patient in data:
        patients.append({field: patient[field] for field in patient_fields})
        for episode in patient['episodes']:
            # The episode rows take their patient_id from the enclosing patient.
            episode_info = {'patient_id': patient['patient_id']}
            episode_info.update(
                (field, episode[field]) for field in episode_fields
            )
            episodes.append(episode_info)
            for note in episode['notes']:
                notes.append({field: note[field] for field in note_fields})

    # Passing `columns` keeps the schema stable when a list is empty; without
    # it an empty list yields a DataFrame (and later a CSV) with no header.
    patients_df_1 = pd.DataFrame(patients, columns=patient_fields)
    episodes_df_1 = pd.DataFrame(
        episodes, columns=['patient_id'] + episode_fields
    )
    notes_df_1 = pd.DataFrame(notes, columns=note_fields)

    return patients_df_1, episodes_df_1, notes_df_1

#Read Client_2 File Type
def parse_client_2(path='client_2_patients.json'):
    """Flatten a client 2 export into patient, episode and note tables.

    The file must contain a JSON list of episodes. Each episode embeds its
    ``patient`` record and a ``notes`` list. Client 2 field names (``mrn``,
    ``admit_time``, ``note_time``, ``clinical_text``) are mapped onto the
    common column names used for client 1.

    Args:
        path: Location of the client 2 JSON file. Defaults to
            ``client_2_patients.json`` in the working directory.

    Returns:
        A ``(patients_df, episodes_df, notes_df)`` tuple of DataFrames. The
        column names are fixed up front, so every table has its full set of
        columns even when it has no rows. The patients table holds one row
        per ``patient_id`` (the first occurrence in the file wins).

    Raises:
        KeyError: If an episode, patient or note lacks a required field.
    """
    patient_columns = ['patient_id', 'name', 'dob', 'ssn', 'address']
    episode_columns = [
        'patient_id',
        'episode_id',
        'admit_date_time',
        'discharge_date_time',
        'admitting_diagnosis',
        'discharge_diagnosis',
    ]
    note_columns = ['note_id', 'episode_id', 'note_date_time', 'text']

    # Be explicit about the encoding instead of relying on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    patients = []
    episodes = []
    notes = []

    for episode in data:
        episodes.append({
            'patient_id': episode['mrn'],
            'episode_id': episode['episode_id'],
            'admit_date_time': episode['admit_time'],
            'discharge_date_time': episode['discharge_date_time'],
            'admitting_diagnosis': episode['admitting_diagnosis'],
            'discharge_diagnosis': episode['discharge_diagnosis'],
        })

        patient = episode['patient']
        patients.append({
            'patient_id': patient['mrn'],
            'name': patient['name'],
            'dob': patient['dob'],
            'ssn': patient['ssn'],
            'address': patient['address'],
        })

        for note in episode['notes']:
            notes.append({
                'note_id': note['note_id'],
                'episode_id': note['episode_id'],
                'note_date_time': note['note_time'],
                'text': note['clinical_text'],
            })

    # The patient record is repeated inside every episode, so a patient with
    # several episodes would otherwise appear several times in the table.
    patients_df_2 = (
        pd.DataFrame(patients, columns=patient_columns)
        .drop_duplicates(subset='patient_id')
        .reset_index(drop=True)
    )
    # Passing `columns` keeps the schema stable when a list is empty.
    episodes_df_2 = pd.DataFrame(episodes, columns=episode_columns)
    notes_df_2 = pd.DataFrame(notes, columns=note_columns)

    return patients_df_2, episodes_df_2, notes_df_2

# Parse each client's export into its patient / episode / note tables.
patients_df_1, episodes_df_1, notes_df_1 = parse_client_1()
patients_df_2, episodes_df_2, notes_df_2 = parse_client_2()

# Stack the per-client tables row-wise; ignore_index renumbers the rows.
merged_patients = pd.concat((patients_df_1, patients_df_2), ignore_index=True)
merged_episodes = pd.concat((episodes_df_1, episodes_df_2), ignore_index=True)
merged_notes = pd.concat((notes_df_1, notes_df_2), ignore_index=True)

# Write each combined table to its own CSV file, without the index column.
for csv_name, combined in (
    ('patients_combined.csv', merged_patients),
    ('episodes_combined.csv', merged_episodes),
    ('notes_combined.csv', merged_notes),
):
    combined.to_csv(csv_name, index=False)

# Verify three new combined files have been created

In [None]:
!ls -ltr

In [None]:
!head -5 patients_combined.csv

In [None]:
!head -5 episodes_combined.csv

In [None]:
!head -5 notes_combined.csv

Test the generated files

In [None]:
import pytest
import pandas as pd

# Columns each combined CSV is expected to contain, keyed by table name.
data_model = dict(
    note=['note_id', 'episode_id', 'note_date_time', 'text'],
    episode=[
        'episode_id',
        'patient_id',
        'admit_date_time',
        'discharge_date_time',
        'admitting_diagnosis',
        'discharge_diagnosis',
    ],
    patient=['patient_id', 'name', 'dob', 'ssn', 'address'],
)

def test_patient_csv():
    """Check that patients_combined.csv has every column of the patient model."""
    columns = pd.read_csv('patients_combined.csv').columns
    missing = [name for name in data_model['patient'] if name not in columns]
    assert not missing

def test_note_csv():
    """Check that notes_combined.csv has every column of the note model."""
    # Read the notes file: the episodes file does not hold the note columns.
    notes_df = pd.read_csv('notes_combined.csv')
    for field in data_model['note']:
        assert field in notes_df.columns

def test_episode_csv():
    """Check that episodes_combined.csv has every column of the episode model."""
    # Read the episodes file: the notes file does not hold the episode columns.
    episodes_df = pd.read_csv('episodes_combined.csv')
    for field in data_model['episode']:
        assert field in episodes_df.columns