# Model Building

In [None]:
# Import the necessary packages
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy

In [None]:
# Silence every warning category so library notices (deprecations, upcoming
# API changes, and so forth) do not clutter the cell output.
import warnings
warnings.simplefilter('ignore')

In [None]:
# Lift the cap on how many columns pandas shows when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Load the raw records from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='datasets.csv')

In [None]:
# Columns that are removed from the dataset before modelling.
unwanted_columns = (
    'Patient Id',
    'Family Name',
    "Father's name",
    "Father's age",
    "Institute Name",
    'Location of Institute',
    'Status',
    'Birth asphyxia',
    'H/O radiation exposure (x-ray)',
    'History of anomalies in previous pregnancies',
    'No. of previous abortion',
    'Birth defects',
    'Test 1',
    'Test 2',
    'Test 3',
    'Test 4',
    'Test 5',
    'Symptom 1',
    'Symptom 2',
    'Symptom 3',
    'Symptom 4',
    'Symptom 5',
    'Patient First Name',
    "Mother's age",
    "Autopsy shows birth defect (if applicable)",
    "Place of birth",
    "Disorder Subclass",
)

# Delete them one at a time, in place, in the order listed above.
for unwanted in unwanted_columns:
    del data[unwanted]

In [None]:
# Preview the first five rows of the trimmed dataset.
data.head(n=5)

In [None]:
# Keep only the rows that have no missing value in any column.
df = data.dropna(axis=0, how='any')

In [None]:
# Count the missing values remaining in each column.
df.isna().sum()

In [None]:
# Replace the values of each listed column with integer class codes
# produced by a LabelEncoder.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

col = [
    "Patient Age",
    "Genes in mother's side",
    'White Blood cell count (thousand per microliter)',
    'Paternal gene',
    'Blood cell count (mcL)',
    'Respiratory Rate (breaths/min)',
    'Heart Rate (rates/min',
    'Parental consent',
    'Follow-up',
    'Gender',
    'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness',
    'H/O substance abuse',
    'Assisted conception IVF/ART',
    'Blood test result',
    'Genetic Disorder',
    'Inherited from father',
    'Maternal gene',
]

# The single encoder is re-fitted on every column, so each column gets its
# own code mapping and `le` ends up fitted on the last column only.
for column_name in col:
    codes = le.fit_transform(df[column_name])
    df[column_name] = codes.astype(int)

In [None]:
# Preview the first five rows after label encoding.
df.head(n=5)

In [None]:
# List the distinct encoded values of the target column.
df.loc[:, 'Genetic Disorder'].unique()

In [None]:
# Separate the feature matrix (x1) from the target column (y1).
x1 = df.drop(columns='Genetic Disorder')
y1 = df['Genetic Disorder']

In [None]:
import imblearn
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

ros =RandomOverSampler(random_state=1)
x,y=ros.fit_resample(x1,y1)
print("OUR DATASET COUNT         : ", Counter(y1))
print("OVER SAMPLING DATA COUNT  : ", Counter(y))

In [None]:
from sklearn.model_selection import train_test_split

x_train,x_test, y_train, y_test = train_test_split(x,y, test_size=0.20, shuffle=False) 

In [None]:
# Fit a bagging ensemble on the training split and predict the held-out rows.
from sklearn.ensemble import BaggingClassifier

# Bagging draws random bootstrap samples; a fixed seed makes the fitted model
# and every score derived from it reproducible from run to run.
clf = BaggingClassifier(random_state=1)
clf.fit(x_train, y_train)
predicted = clf.predict(x_test)


In [None]:
# Tabulate true versus predicted classes for the test split and print the table.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=predicted)
print('THE CONFUSION MATRIX SCORE OF BAGGING CLASSIFIER:\n\n\n', cm)

In [None]:
# Cross-validated accuracy of the classifier on the data as it was before
# oversampling (x1, y1); the number of folds is left at the library default.
from sklearn.model_selection import cross_val_score

accuracy = cross_val_score(estimator=clf, X=x1, y=y1, scoring='accuracy')
# Scores are fractions per fold; scale to percentages for display.
print('THE CROSS VALIDATION TEST RESULT OF ACCURACY :\n\n\n', accuracy * 100)

In [None]:
# Fraction of test rows predicted correctly, printed as a percentage.
from sklearn.metrics import accuracy_score

a = accuracy_score(y_true=y_test, y_pred=predicted)
print("THE ACCURACY SCORE OF BAGGING CLASSIFIER IS :", a * 100)

In [None]:
# Hamming loss of the test predictions, printed as a percentage.
from sklearn.metrics import hamming_loss

hl = hamming_loss(y_true=y_test, y_pred=predicted)
print("THE HAMMING LOSS OF BAGGING CLASSIFIER IS :", hl * 100)

In [None]:
# Text summary of the per-class metrics for the test predictions.
from sklearn.metrics import classification_report

report = classification_report(y_true=y_test, y_pred=predicted)
print("Classification Report\n\n", report)

In [None]:
import numpy as np
def plot_confusion_matrix(cm, title='THE CONFUSION MATRIX SCORE OF BAGGING CLASSIFIER\n\n', cmap=plt.cm.Blues, target_names=None):
    """Draw a confusion matrix as a colour-mapped image on the current figure.

    Args:
        cm: Square confusion matrix (rows are true classes, columns are
            predicted classes), e.g. the output of ``confusion_matrix``.
        title: Text placed above the plot.
        cmap: Matplotlib colormap used to shade the cells.
        target_names: Optional tick labels, one per class. When omitted, the
            class indices ``0 .. n-1`` are used so that every row and column
            of the matrix gets a tick.
    """
    # A single hard-coded blank label produced one empty tick regardless of
    # how many classes the matrix has; derive the labels from the matrix size.
    if target_names is None:
        target_names = [str(index) for index in range(len(cm))]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=45)
    plt.yticks(tick_marks, target_names)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Adjust the layout last so the axis labels are taken into account.
    plt.tight_layout()

# Recompute the confusion matrix for the test split and echo the raw counts.
cm = confusion_matrix(y_true=y_test, y_pred=predicted)
print('THE CONFUSION MATRIX SCORE OF BAGGING CLASSIFIER:\n\n', cm, sep='\n')

# Heatmap in which every cell is its share of all test samples, annotated
# as a percentage.
cm_share = cm / cm.sum()
sns.heatmap(
    cm_share,
    annot=True,
    cmap='cividis',
    annot_kws={"size": 16},
    fmt='.2%',
)
plt.show()

In [None]:
def graph():
    """Draw a one-bar chart of the accuracy held in the module-level ``a``."""
    import matplotlib.pyplot as plt

    label = "BAGGING CLASSIFIER"
    scores = [a]

    plt.figure(figsize=(5, 5))
    bars = plt.bar(label, scores, color="pink")
    plt.title("THE ACCURACY SCORE OF BAGGING CLASSIFIER IS\n\n\n")
    # The legend entry shows the numeric score next to the bar's colour patch.
    plt.legend(bars, scores, fontsize=9)


graph()