# Trevor Maxwell
## Follow-Up Treatment Prediction - Data Preparation
### D20230722

In [1]:
# import necessary libraries

import pandas as pd
import numpy as np

In [2]:
# load the raw patient records from the CSV in the working directory

patient_df = pd.read_csv(filepath_or_buffer = 'data-ori.csv')

# show the first five rows as a quick sanity check
patient_df.head(5)

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX,SOURCE
0,35.1,11.8,4.65,6.3,310,25.4,33.6,75.5,1,F,out
1,43.5,14.8,5.39,12.7,334,27.5,34.0,80.7,1,F,out
2,33.5,11.3,4.74,13.2,305,23.8,33.7,70.7,1,F,out
3,39.1,13.7,4.98,10.5,366,27.5,35.0,78.5,1,F,out
4,30.9,9.9,4.23,22.1,333,23.4,32.0,73.0,1,M,out


### Lowercase column headers
 - lowercasing column header names helps prevent programming mistakes caused by inconsistent capitalization

In [3]:
# view column names prior to cleaning
# (printed so the original upper-case headers can be compared with the result of the next step)

print(patient_df.columns)

Index(['HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE',
       'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE', 'SEX', 'SOURCE'],
      dtype='object')


In [4]:
# convert every column header to lowercase using the vectorised string accessor
patient_df.columns = patient_df.columns.str.lower()

In [5]:
# view column names after lowercasing header
# (confirms that every header is now lower case)

print(patient_df.columns)

Index(['haematocrit', 'haemoglobins', 'erythrocyte', 'leucocyte',
       'thrombocyte', 'mch', 'mchc', 'mcv', 'age', 'sex', 'source'],
      dtype='object')


### Check for NaN values in dataframe

In [6]:
# count missing values per column (isna marks each missing cell, the sum runs down the rows)
patient_df.isna().sum(axis = 0)

haematocrit     0
haemoglobins    0
erythrocyte     0
leucocyte       0
thrombocyte     0
mch             0
mchc            0
mcv             0
age             0
sex             0
source          0
dtype: int64

No NaN values in any columns in the dataframe.

### Create dummy variable for sex
 - sex is a string variable (M or F) and needs to be numeric for the classification models

In [8]:
# One-hot encode sex. The dtype is pinned to uint8 because the default dummy
# dtype differs between pandas versions (uint8 before 2.0, bool from 2.0 on),
# which would otherwise change the dtypes reported for the cleaned dataframe.
sex_dummies = pd.get_dummies(patient_df['sex'], prefix = 'sex', dtype = 'uint8')

# get_dummies builds its column names from the raw values ('sex_F', 'sex_M');
# lowercase them so they follow the same convention as the other headers
sex_dummies.columns = sex_dummies.columns.str.lower()

# Replace the string sex column with a single indicator (1 = M, 0 = F).
# 'sex_f' is left out because it is fully determined by 'sex_m'.
# A new dataframe is built instead of adding columns to patient_df, so
# re-running earlier cells still shows patient_df as it was loaded.
patient_df1 = patient_df.drop(columns = 'sex').join(sex_dummies['sex_m'])

# preview dataframe after creation of dummies and dropping sex column
patient_df1.head()

Unnamed: 0,haematocrit,haemoglobins,erythrocyte,leucocyte,thrombocyte,mch,mchc,mcv,age,source,sex_m
0,35.1,11.8,4.65,6.3,310,25.4,33.6,75.5,1,out,0
1,43.5,14.8,5.39,12.7,334,27.5,34.0,80.7,1,out,0
2,33.5,11.3,4.74,13.2,305,23.8,33.7,70.7,1,out,0
3,39.1,13.7,4.98,10.5,366,27.5,35.0,78.5,1,out,0
4,30.9,9.9,4.23,22.1,333,23.4,32.0,73.0,1,out,1


### Ensure datatypes for columns are correct
 - numeric columns should have a numeric dtype and the string column (the target, 'source') should be object

In [9]:
# check datatypes
# (prints one dtype per column of patient_df1, the dataframe produced by the dummy-variable step)

print(patient_df1.dtypes)

haematocrit     float64
haemoglobins    float64
erythrocyte     float64
leucocyte       float64
thrombocyte       int64
mch             float64
mchc            float64
mcv             float64
age               int64
source           object
sex_m             uint8
dtype: object


All datatypes are correct and none need to be changed.

### Remove patients under the age of 18
 - patients under the age of 18 are considered pediatric, which calls for a different practice of medicine than adult care. The analysis therefore focuses on adults only.

In [10]:
# record the (rows, columns) size before the age filter is applied
print(patient_df1.shape)

# list the distinct ages present before the age filter is applied
print(patient_df1['age'].unique())

(4412, 11)
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 98 99]


In [11]:
# keep only adult patients, i.e. rows where age is 18 or greater

patient_df2 = patient_df1.loc[patient_df1['age'].ge(18)]

In [12]:
# record the (rows, columns) size once the age filter has been applied
print(patient_df2.shape)

# list the distinct ages that remain; the minimum should now be 18
print(patient_df2['age'].unique())

(3971, 11)
[18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
 90 91 92 93 98 99]


In [13]:
# count how many rows the age filter removed (row count before minus row count after)

diff = len(patient_df1) - len(patient_df2)

print(f'Number of observations removed due to age: {diff}')

Number of observations removed due to age: 441


### Check for duplicate rows
 - duplicates can be difficult to find and need to be removed if found

In [14]:
# Show every row that exactly repeats an earlier row; an empty result means
# there are no duplicates.
# duplicated() has to be called to produce the boolean mask. Indexing with the
# bare method only worked because pandas treats it as a callable indexer and
# invokes it with the dataframe itself as the `subset` argument.

patient_df2[patient_df2.duplicated()]

Unnamed: 0,haematocrit,haemoglobins,erythrocyte,leucocyte,thrombocyte,mch,mchc,mcv,age,source,sex_m


No duplicates were found.

### Feature Removal Discussion
 - No features will be removed during this analysis. There are only 10 features (not including the target, source) and all features, aside from age and sex, are results of laboratory tests from the patient. Removing any feature would result in a loss of data.

In [15]:
# write the cleaned dataframe to a new CSV, leaving out the row index

patient_df2.to_csv(path_or_buf = 'data-ori_cleaned.csv', index = False)