# Importing necessary modules

In [1]:
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

## Using the variables to get data in dataframes
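>`os.path.abspath` converts the relative paths into absolute ones (resolved against the notebook's current working directory), so the folders can be found in spite of where the parent project folder is located.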

In [2]:
# Absolute locations of the raw and interim data folders, resolved once so that
# every later cell can use them without repeating the relative lookup.
raw_data_path, interim_data_path = map(
    os.path.abspath, ('../data/00_raw', '../data/01_interim')
)

__This additional step clears the leftover `.zip` files out of the raw data folder after the data has been extracted and stored within a SQL database.__ However, `src/data_acquisition/01_data_acquisition.py` needs to be run first, because its file download verification will fail once this cell has been run.

In [3]:
# Delete every regular file in the raw data folder whose name ends in ".zip"
# (case-insensitive), reporting each deletion as it happens.
for file in os.listdir(raw_data_path):
    if not file.lower().endswith('.zip'):
        continue
    archive_path = os.path.join(raw_data_path, file)
    if not os.path.isfile(archive_path):
        continue  # e.g. a directory whose name happens to end in ".zip"
    os.remove(archive_path)
    print(f"Deleted:{file}")

## Loading all the data into dataframes for easier use
- This approach keeps the implementation robust and modular.
- Every file in the raw data folder is read according to its extension; a file with an unsupported extension is not loaded and a message is printed instead.
- The common formats are accounted for, including `".csv", ".xlsx", ".parquet", ".pkl", ".xpt"`.

In [6]:
# Map each supported (lower-case) file extension to the pandas reader that parses it.
READERS_BY_SUFFIX = {
    '.csv': pd.read_csv,
    '.xpt': pd.read_sas,
    '.xlsx': pd.read_excel,
    '.parquet': pd.read_parquet,
    # SECURITY: unpickling can execute arbitrary code; only keep .pkl files from trusted sources here.
    '.pkl': pd.read_pickle,
}

# Load every supported file of the raw data folder into `dfs`, keyed by the
# file name without its suffix (e.g. "diabetic_data.csv" -> dfs["diabetic_data"]).
dfs = {}
# Sort the listing so the load order (and which file wins a key clash) is reproducible.
for fname in sorted(os.listdir(raw_data_path)):
    read_path = os.path.join(raw_data_path, fname)
    if not os.path.isfile(read_path):
        print(f"Skipping '{fname}': not a regular file.")
        continue
    lowered = fname.lower()
    # Same case-insensitive "ends with the extension" match as before, done via the table.
    reader = next((fn for ext, fn in READERS_BY_SUFFIX.items() if lowered.endswith(ext)), None)
    if reader is None:
        print(f"Skipping '{fname}': the file type is not yet handled in our service, please contact the developer.")
        continue
    key = Path(fname).stem  # filename without the suffix
    if key in dfs:
        # Two files that differ only by extension share a key; make the overwrite visible.
        print(f"Warning: '{fname}' overwrites the frame already loaded under key '{key}'.")
    dfs[key] = reader(read_path)

In [7]:
# Show the first five rows of the `diabetic_data` frame as a quick check of the load.
dfs['diabetic_data'].head(n=5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


# Run this last
- After completing all the processing, run this cell to store all the results inside the interim folder.
- Make sure to move all the operations performed here into a separate file, so that they stay relevant and can be executed without having to run each cell in this notebook.

In [None]:
# Store every loaded frame in its primary processed form inside the interim folder,
# one CSV per frame named "<frame>_primary_processed_data.csv".
# The folder is created first because DataFrame.to_csv does not create missing directories.
os.makedirs(interim_data_path, exist_ok=True)
for frame_name, frame in dfs.items():
    output_path = os.path.join(interim_data_path, f'{frame_name}_primary_processed_data.csv')
    frame.to_csv(output_path, index=False)  # index=False keeps the row index out of the file