In [None]:
#Step 01: Import the required libraries
%config IPCompleter.greedy=True
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns #Python data visualization library built on matplotlib

In [None]:
# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None
# The row limit is left at its default; uncomment to lift it as well:
#pd.options.display.max_rows = None

In [None]:
#Step 02: Load the dataset
# Read the CSV from the notebook's working directory into the raw frame.
data_frame = pd.read_csv(filepath_or_buffer='monkeypox_dataset.csv')

# Show the column labels available in the raw data
data_frame.columns

In [None]:
#Step 03: Retain the required variables
# Column labels kept for the analysis; every label must exist in data_frame,
# otherwise the selection below raises KeyError.
retained_variables = [
    'Test ID', 'Systemic Illness', 'Encoded Systemic Illness',
    'Rectal Pain', 'Sore Throat', 'Penile Oedema', 'Oral Lesions',
    'Solitary Lesion', 'Swollen Tonsils', 'HIV Infection',
    'Red blood cells count', 'White blood cells count',
    'Age', 'Health Insurance',
    'Sexually Transmitted Infection', 'MPOX PCR Result',
]

# .copy() gives an independent frame. Without it this is a slice of data_frame,
# and the later in-place fix (.loc[mask, 'Systemic Illness'] = 'Fever') raises
# SettingWithCopyWarning and may not reliably write to the intended object.
data_frame_with_retained_variables = data_frame[retained_variables].copy()

# Preview only the first rows instead of rendering the whole frame.
display(data_frame_with_retained_variables.head())

In [None]:
#Step 04: Basic Statistical Description
# Summary statistics of the retained variables, stored in `description` and rendered below.
# Alternative kept for reference (summarises every column, not only the default selection):
# description = data_frame_with_retained_variables.describe(include='all')
description = data_frame_with_retained_variables.describe()
display(description)

In [None]:
#Step 05: Measurement Scale Type
# Collect the pandas dtype of every retained column and render it.
# NOTE(review): dtypes are storage types (int64/float64/object), not statistical
# measurement scales (nominal/ordinal/interval/ratio) - confirm this is the intended output.
measurement_scale = data_frame_with_retained_variables.dtypes
display(measurement_scale)

In [None]:
#Step 06: Plot the distribution of class variables
# Bar per PCR outcome; labels and title are set directly on the Axes seaborn returns.
sns.countplot(data=data_frame_with_retained_variables, x='MPOX PCR Result').set(
    xlabel='MPOX PCR Result',
    ylabel='Count',
    title='Distribution of PCR Test Result',
)
plt.show()

In [None]:
# One histogram per plottable column, 60 bins each, on a 20x20 inch figure grid.
data_frame_with_retained_variables.hist(figsize=(20, 20), bins=60)

# Task 03: Identifying and Fixing the Variable Issues

In [31]:
# Number of missing entries per retained column.
data_frame_with_retained_variables.isnull().sum()

Test ID                              0
Systemic Illness                  6216
Encoded Systemic Illness             2
Rectal Pain                          3
Sore Throat                          0
Penile Oedema                        6
Oral Lesions                         4
Solitary Lesion                      0
Swollen Tonsils                      7
HIV Infection                        5
Red blood cells count                0
White blood cells count              0
Age                                 36
Health Insurance                     0
Sexually Transmitted Infection       4
MPOX PCR Result                      0
dtype: int64

In [27]:
#Systemic Illness | Mistaken value
# Rows where the illness label is spelled in lowercase ('fever').
data_frame_with_retained_variables.loc[
    lambda frame: frame['Systemic Illness'].eq('fever')
]

Unnamed: 0,Test ID,Systemic Illness,Encoded Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Red blood cells count,White blood cells count,Age,Health Insurance,Sexually Transmitted Infection,MPOX PCR Result
33,AA0054,fever,1.0,0.0,0,0.0,1,0,0.0,0.0,4829403,9994,36,1,1.0,Negative


In [46]:
#Fixing the Systemic Illness
# Flag the rows carrying the lowercase spelling, then overwrite them with 'Fever'.
mask = data_frame_with_retained_variables['Systemic Illness'].eq('fever')
data_frame_with_retained_variables.loc[mask, 'Systemic Illness'] = 'Fever'

# Re-run the lookup: an empty result means no lowercase 'fever' rows remain.
data_frame_with_retained_variables.loc[
    lambda frame: frame['Systemic Illness'].eq('fever')
]


Unnamed: 0,Test ID,Systemic Illness,Encoded Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Red blood cells count,White blood cells count,Age,Health Insurance,Sexually Transmitted Infection,MPOX PCR Result


In [45]:
#Last check to verify whether all the data issues have been fixed
# Column-wise count of missing entries.
data_frame_with_retained_variables.isna().sum(axis=0)

Test ID                              0
Systemic Illness                  6216
Encoded Systemic Illness             2
Rectal Pain                          3
Sore Throat                          0
Penile Oedema                        6
Oral Lesions                         4
Solitary Lesion                      0
Swollen Tonsils                      7
HIV Infection                        5
Red blood cells count                0
White blood cells count              0
Age                                 36
Health Insurance                     0
Sexually Transmitted Infection       4
MPOX PCR Result                      0
dtype: int64

In [None]:
#Look for Null Values
# Missing entries per column (isna() would give the same counts as isnull()).
data_frame_with_retained_variables.isnull().sum()

In [None]:
#Check the rows where the 'Age' column is null or not available
data_frame_with_retained_variables.loc[
    lambda frame: frame['Age'].isnull()
]

In [None]:
# Rows whose red blood cell count is above 5,800,000.
# NOTE(review): the threshold is a hard-coded magic number - confirm its source/meaning.
red_blood_cells = data_frame_with_retained_variables.loc[
    data_frame_with_retained_variables['Red blood cells count'].gt(5800000)
]

In [None]:
# Rows whose white blood cell count is greater than 1.
# NOTE(review): a threshold of 1 presumably keeps almost every row - confirm it is intended.
white_blood_cells = data_frame_with_retained_variables.loc[
    data_frame_with_retained_variables['White blood cells count'].gt(1)
]

In [None]:
# Scatter of the filtered red blood cell counts against their Test ID.
(
    red_blood_cells
    .loc[:, ['Test ID', 'Red blood cells count']]
    .plot.scatter(x='Test ID', y='Red blood cells count')
)

In [None]:
# Scatter of the filtered white blood cell counts against their Test ID.
(
    white_blood_cells
    .loc[:, ['Test ID', 'White blood cells count']]
    .plot.scatter(x='Test ID', y='White blood cells count')
)

In [34]:
# Keep only the rows that actually have a 'Systemic Illness' label.
systemic_illness = data_frame_with_retained_variables.loc[
    data_frame_with_retained_variables['Systemic Illness'].notna()
]

# Frequency of each distinct 'Systemic Illness' label
counted_systemic_illness = systemic_illness['Systemic Illness'].value_counts()


In [None]:
# Bar chart of the label frequencies computed above, on a 10x6 inch figure.
counted_systemic_illness.plot.bar(figsize=(10, 6))

In [None]:
# Frequency of each code in the encoded illness column.
data_frame_with_retained_variables.loc[:, 'Encoded Systemic Illness'].value_counts()

In [None]:
# Number of rows that repeat an earlier row exactly (the first occurrence is not counted).
data_frame_with_retained_variables.duplicated(keep='first').sum()

In [None]:
# Count how often each distinct full row (all retained columns combined) occurs.
# NOTE(review): DataFrame.value_counts() presumably skips rows containing NaN by default,
# and 'Systemic Illness' is missing in many rows - confirm this check covers what is intended.
data_frame_with_retained_variables.value_counts()

In [None]:
# Select the symptom/condition indicator columns to inspect together.
# NOTE(review): 'Oral Lesions' and 'Health Insurance' are not included here although
# they also look like 0/1 columns in the sample output - confirm that is deliberate.
binary_data_frame = data_frame_with_retained_variables[[
    'Rectal Pain', 'Sore Throat', 'Penile Oedema',
    'Solitary Lesion', 'Swollen Tonsils', 'HIV Infection',
    'Sexually Transmitted Infection',
]]

# Tabulate, column by column, how often each value occurs. Any value other than the
# expected codes shows up as an extra row in the resulting table.
# (The former bare `binary_data_frame` line was removed: a mid-cell expression is
# never displayed, so it had no effect.)
binary_data_frame.apply(pd.Series.value_counts)


## Data Cleaning

In [None]:
# Render the frame as loaded from the CSV (all original columns, before variable selection).
data_frame