## Importing Dependencies:

In [1]:
# Import the standard-library os module, NumPy, and TensorFlow:

import os
import numpy as np
import tensorflow as tf

In [2]:
# Import Pandas:

import pandas as pd

In [3]:
# Import Keras dependencies:

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist

In [4]:
# Possibly useful Machine Learning libraries:

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import math

## Data Import

In [5]:
# Load the training and testing CSVs, which are hosted in an AWS S3 bucket:

train_csv_url = 'https://brandon-12-07-2022.s3.ap-southeast-2.amazonaws.com/Training.csv'
test_csv_url = 'https://brandon-12-07-2022.s3.ap-southeast-2.amazonaws.com/Testing.csv'

train_df = pd.read_csv(filepath_or_buffer=train_csv_url)
test_df = pd.read_csv(filepath_or_buffer=test_csv_url)

In [6]:
# Preview the first 2 records of the training dataset to check that the load worked.
# The output shows a trailing 'Unnamed: 133' column with empty values.

train_df.head(2)

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis,Unnamed: 133
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,


In [7]:
# Preview the first 2 records of the testing dataset to check that the load worked.
# Its last column is 'prognosis'.

test_df.head(2)

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Allergy


## Data ETL

In [8]:
# Summarise the raw training frame: row count, column count, dtypes and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Columns: 134 entries, itching to Unnamed: 133
dtypes: float64(1), int64(132), object(1)
memory usage: 5.0+ MB


In [9]:
# Summarise the raw testing frame: row count, column count, dtypes and memory usage.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Columns: 133 entries, itching to prognosis
dtypes: int64(132), object(1)
memory usage: 43.8+ KB


In [10]:
# Remove the unwanted 'Unnamed: 133' column from the raw training data.
# train_df.info() reports 134 columns including a single float64 one, whereas the testing
# frame has 133 columns and no float64 column.
#
# drop(..., errors="ignore") is used instead of `del train_df[...]` so the cell can be
# re-run safely: `del` raises KeyError once the column is already gone.

train_df = train_df.drop(columns=["Unnamed: 133"], errors="ignore")

In [11]:
# Keep only fully populated rows: a row with at least one NaN value is discarded.

cleaned_train_df, cleaned_test_df = (
    raw_df.dropna(axis='index', how='any') for raw_df in (train_df, test_df)
)

In [12]:
# Re-check the testing frame's row count, column count and dtypes after the NaN filter.
cleaned_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42 entries, 0 to 41
Columns: 133 entries, itching to prognosis
dtypes: int64(132), object(1)
memory usage: 44.0+ KB


In [13]:
# Re-check the training frame's row count, column count and dtypes after the NaN filter.
cleaned_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4920 entries, 0 to 4919
Columns: 133 entries, itching to prognosis
dtypes: int64(132), object(1)
memory usage: 5.0+ MB


In [14]:
# Inspect the training rows that carry the raw vertigo label.
# The label contains a double space between 'Paroymsal' and 'Positional'.
is_raw_vertigo_row = cleaned_train_df['prognosis'].eq('(vertigo) Paroymsal  Positional Vertigo')
Vsearch = cleaned_train_df[is_raw_vertigo_row]
Vsearch.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
360,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,(vertigo) Paroymsal Positional Vertigo
361,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,(vertigo) Paroymsal Positional Vertigo
362,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,(vertigo) Paroymsal Positional Vertigo
363,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,(vertigo) Paroymsal Positional Vertigo
364,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,(vertigo) Paroymsal Positional Vertigo


In [15]:
# List the distinct raw prognosis labels in the training data to spot untidy ones
# (the output shows trailing spaces, a lower-case 'hepatitis A' and the typo 'diseae').
cleaned_train_df['prognosis'].unique()

array(['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis',
       'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ',
       'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine',
       'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice',
       'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia',
       'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins',
       'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia',
       'Osteoarthristis', 'Arthritis',
       '(vertigo) Paroymsal  Positional Vertigo', 'Acne',
       'Urinary tract infection', 'Psoriasis', 'Impetigo'], dtype=object)

In [16]:
# Rename untidy prognoses within the TWO dataframes.
#
# Each key is a raw label exactly as it appears in the data (including trailing spaces and the
# double space in the vertigo label); each value is the tidy label that replaces it.
# No tidy label is also a raw label, so applying all renames at once gives the same result as
# applying them one after another.
#
# The renamed column is written back with assign() rather than calling
# replace(inplace=True) on df['prognosis']: the latter mutates an intermediate Series, which
# pandas warns about and which no longer updates the DataFrame under Copy-on-Write.

PROGNOSIS_RENAMES = {
    '(vertigo) Paroymsal  Positional Vertigo': 'Vertigo',
    'Dimorphic hemmorhoids(piles)': 'Dimorphic hemmorhoids',
    'hepatitis A': 'Hepatitis A',
    'Peptic ulcer diseae': 'Peptic ulcer disease',
    'Diabetes ': 'Diabetes',
    'Hypertension ': 'Hypertension',
}

cleaned_test_df = cleaned_test_df.assign(
    prognosis=cleaned_test_df['prognosis'].replace(PROGNOSIS_RENAMES)
)
cleaned_train_df = cleaned_train_df.assign(
    prognosis=cleaned_train_df['prognosis'].replace(PROGNOSIS_RENAMES)
)

In [17]:
# Confirm the renames took effect: list the distinct prognosis labels now in the training data.
cleaned_train_df['prognosis'].unique()

array(['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis',
       'Drug Reaction', 'Peptic ulcer disease', 'AIDS', 'Diabetes',
       'Gastroenteritis', 'Bronchial Asthma', 'Hypertension', 'Migraine',
       'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice',
       'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'Hepatitis A',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia',
       'Dimorphic hemmorhoids', 'Heart attack', 'Varicose veins',
       'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia',
       'Osteoarthristis', 'Arthritis', 'Vertigo', 'Acne',
       'Urinary tract infection', 'Psoriasis', 'Impetigo'], dtype=object)

In [18]:
# Confirm the renames took effect: list the distinct prognosis labels now in the testing data.
cleaned_test_df['prognosis'].unique()

array(['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis',
       'Drug Reaction', 'Peptic ulcer disease', 'AIDS', 'Diabetes',
       'Gastroenteritis', 'Bronchial Asthma', 'Hypertension', 'Migraine',
       'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice',
       'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'Hepatitis A',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia',
       'Dimorphic hemmorhoids', 'Heart attack', 'Varicose veins',
       'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia',
       'Osteoarthristis', 'Arthritis', 'Vertigo', 'Acne',
       'Urinary tract infection', 'Psoriasis', 'Impetigo'], dtype=object)

In [19]:
# Collect every distinct prognosis once, keeping the row where it first appears
# (so the original row index of that first occurrence is retained):

prognoses_train_df = cleaned_train_df["prognosis"].to_frame()
is_repeat = prognoses_train_df.duplicated(keep='first')
prognoses_unique_df = prognoses_train_df[~is_repeat]
prognoses_unique_df.head()

Unnamed: 0,prognosis
0,Fungal infection
10,Allergy
20,GERD
30,Chronic cholestasis
40,Drug Reaction


In [20]:
# Sort into alphabetical order (case-sensitive: 'AIDS' sorts before 'Acne')
# and renumber the rows from 0:

prognoses_unique_sorted_df = (
    prognoses_unique_df
    .sort_values(by='prognosis')
    .reset_index(drop=True)
)
prognoses_unique_sorted_df.head()

Unnamed: 0,prognosis
0,AIDS
1,Acne
2,Alcoholic hepatitis
3,Allergy
4,Arthritis


In [21]:
# Match all prognoses with an ID (unique integer), counting up from 1.
#
# The number of IDs is taken from the sorted table of unique prognoses rather than a
# hard-coded upper bound, so the list always has exactly one ID per prognosis.
# With the 41 prognoses in this dataset the result is 1..41, as before.

prognosis_id = list(range(1, len(prognoses_unique_sorted_df) + 1))

print(prognosis_id)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41]


In [22]:
# Wrap the ID list in a single-column frame so it can be joined to the prognosis names by row position.
prognosis_id_df = pd.DataFrame(data=prognosis_id, columns=["Prognosis_ID"])


In [23]:
# Build the ID -> prognosis key table by joining the two frames on their row index.
# The suffixes are only applied to column names present in both frames.
#
# NOTE(review): the hard-coded label list used further down to encode the 'prognosis'
# column places 'Bronchial Asthma' third, so the IDs it assigns to 'Bronchial Asthma',
# 'Alcoholic hepatitis', 'Allergy' and 'Arthritis' (3-6) differ from this alphabetically
# sorted table. Reconcile the two before using this table to decode encoded labels.
prognosis_key_df = prognosis_id_df.join(
    other=prognoses_unique_sorted_df,
    how='left',
    lsuffix='Prognosis_ID',
    rsuffix='prognosis',
)
prognosis_key_df.head()

Unnamed: 0,Prognosis_ID,prognosis
0,1,AIDS
1,2,Acne
2,3,Alcoholic hepatitis
3,4,Allergy
4,5,Arthritis


In [24]:
# Encode the prognosis labels of the TESTING data as integer IDs.
# No new column is added: the 'prognosis' column itself is overwritten with the ID.
#
# A label's ID is its 1-based position in the list below. The list must stay identical to
# the one used to encode the training data, otherwise the two splits get different IDs.
# NOTE(review): 'Bronchial Asthma' sits third in this list, so IDs 3-6 are not in the
# alphabetical order used by prognosis_key_df. The order is kept as-is here so that the
# encoded values do not change.
#
# The encoded column is written back with assign() rather than calling
# replace(inplace=True) on cleaned_test_df['prognosis']: the latter mutates an intermediate
# Series, which pandas warns about and which no longer updates the DataFrame under
# Copy-on-Write.

test_prognosis_labels = [
    'AIDS', 'Acne', 'Bronchial Asthma',
    'Alcoholic hepatitis', 'Allergy', 'Arthritis', 'Cervical spondylosis', 'Chicken pox',
    'Chronic cholestasis', 'Common Cold', 'Dengue', 'Diabetes', 'Dimorphic hemmorhoids',
    'Drug Reaction', 'Fungal infection', 'GERD', 'Gastroenteritis', 'Heart attack', 'Hepatitis A',
    'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Hypertension', 'Hyperthyroidism',
    'Hypoglycemia', 'Hypothyroidism', 'Impetigo', 'Jaundice', 'Malaria', 'Migraine', 'Osteoarthristis',
    'Paralysis (brain hemorrhage)', 'Peptic ulcer disease', 'Pneumonia', 'Psoriasis', 'Tuberculosis',
    'Typhoid', 'Urinary tract infection', 'Varicose veins', 'Vertigo',
]
test_prognosis_to_id = {
    label: code for code, label in enumerate(test_prognosis_labels, start=1)
}

cleaned_test_df = cleaned_test_df.assign(
    prognosis=cleaned_test_df['prognosis']
    # Values not in the mapping (e.g. IDs left by an earlier run of this cell) pass through unchanged.
    .map(lambda label: test_prognosis_to_id.get(label, label))
    # Raises if any label is still a non-numeric string, instead of silently keeping it.
    .astype('int64')
)

cleaned_test_df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
2,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,16
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
4,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,14


In [25]:
# Sanity check: look for training rows that still carry the raw vertigo label
# (the displayed result is empty once the rename has been applied).
raw_vertigo_label = '(vertigo) Paroymsal  Positional Vertigo'
search = cleaned_train_df[cleaned_train_df['prognosis'] == raw_vertigo_label]
search.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis


In [26]:
# Encode the prognosis labels of the TRAINING data as integer IDs.
# No new column is added: the 'prognosis' column itself is overwritten with the ID.
#
# A label's ID is its 1-based position in the list below. The list must stay identical to
# the one used to encode the testing data, otherwise the two splits get different IDs.
# NOTE(review): 'Bronchial Asthma' sits third in this list, so IDs 3-6 are not in the
# alphabetical order used by prognosis_key_df. The order is kept as-is here so that the
# encoded values do not change.
#
# The encoded column is written back with assign() rather than calling
# replace(inplace=True) on cleaned_train_df['prognosis']: the latter mutates an intermediate
# Series, which pandas warns about and which no longer updates the DataFrame under
# Copy-on-Write.

train_prognosis_labels = [
    'AIDS', 'Acne', 'Bronchial Asthma',
    'Alcoholic hepatitis', 'Allergy', 'Arthritis', 'Cervical spondylosis', 'Chicken pox',
    'Chronic cholestasis', 'Common Cold', 'Dengue', 'Diabetes', 'Dimorphic hemmorhoids',
    'Drug Reaction', 'Fungal infection', 'GERD', 'Gastroenteritis', 'Heart attack', 'Hepatitis A',
    'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Hypertension', 'Hyperthyroidism',
    'Hypoglycemia', 'Hypothyroidism', 'Impetigo', 'Jaundice', 'Malaria', 'Migraine', 'Osteoarthristis',
    'Paralysis (brain hemorrhage)', 'Peptic ulcer disease', 'Pneumonia', 'Psoriasis', 'Tuberculosis',
    'Typhoid', 'Urinary tract infection', 'Varicose veins', 'Vertigo',
]
train_prognosis_to_id = {
    label: code for code, label in enumerate(train_prognosis_labels, start=1)
}

cleaned_train_df = cleaned_train_df.assign(
    prognosis=cleaned_train_df['prognosis']
    # Values not in the mapping (e.g. IDs left by an earlier run of this cell) pass through unchanged.
    .map(lambda label: train_prognosis_to_id.get(label, label))
    # Raises if any label is still a non-numeric string, instead of silently keeping it.
    .astype('int64')
)

cleaned_train_df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15


In [27]:
# Spot-check the encoding: pull the testing rows whose prognosis ID is 25
# (the 25th entry of the hard-coded label list, 'Hyperthyroidism').
has_id_25 = cleaned_test_df['prognosis'].eq(25)
search = cleaned_test_df[has_id_25]
search.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,25


## Machine Learning

In [28]:
# For each row in the training and testing data:

# (a) Assign the "symptoms" data to X values (independent variables)
# (b) Assign the "prognosis" to a Y value (dependent variable)
#
# The target is selected by column name instead of mixing "all but the last column" with
# the fixed position 132, so X and y cannot drift apart if the column layout changes.
# With the current layout ('prognosis' is the last of 133 columns) the arrays are unchanged.

TARGET_COLUMN = 'prognosis'

X_train = cleaned_train_df.drop(columns=TARGET_COLUMN).values
y_train = cleaned_train_df[TARGET_COLUMN].values
X_test = cleaned_test_df.drop(columns=TARGET_COLUMN).values
y_test = cleaned_test_df[TARGET_COLUMN].values

In [29]:
# Print the shape of each array, in the order X_train, y_train, X_test, y_test,
# to confirm that features and targets have matching row counts.
for split_array in (X_train, y_train, X_test, y_test):
    print(split_array.shape)

(4920, 132)
(4920,)
(42, 132)
(42,)
