# A-Data Collection and Preparation

## Importing the Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Reading the data set

In [None]:
# Load the anemia dataset from the CSV file in the working directory
# and preview its first five rows.
df = pd.read_csv("anemia.csv")
df.head(n=5)

## Data Preparation

#### 1- Handling missing values

In [None]:
# Print a summary of the DataFrame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

In [None]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

#### 2- Handling Imbalanced values

In [None]:
# Class distribution of the target before balancing.
# 'Result' encoding: 0 = not anemic, 1 = anemic.
results = df["Result"].value_counts()
bar_colors = ["blue", "green"]
results.plot.bar(color=bar_colors)
plt.xlabel("Results")
plt.ylabel("Frequency")
plt.title("Count of Result")
plt.show()

In [None]:
# The 'Result' classes are imbalanced (0 = not anemic, 1 = anemic), so the
# larger class is randomly undersampled down to the size of the smaller one.
from sklearn.utils import resample

# value_counts() sorts by frequency in descending order: the first label is
# the majority class and the last label is the minority class. Deriving the
# labels this way avoids hard-coding which class happens to be larger.
class_counts = df['Result'].value_counts()
majorclass = df[df['Result'] == class_counts.index[0]]
minorclass = df[df['Result'] == class_counts.index[-1]]

# Sample without replacement so no majority row is duplicated; the fixed
# random_state keeps the selection reproducible.
major_downsample = resample(
    majorclass,
    replace=False,
    n_samples=len(minorclass),
    random_state=42,
)

# Rebuild the dataset from the reduced majority class plus the full minority
# class, then show the new class counts.
df = pd.concat([major_downsample, minorclass])
df['Result'].value_counts()

In [None]:
# Plot the class counts of 'Result' after undersampling.
result_balanced = df["Result"].value_counts()
balanced_ax = result_balanced.plot.bar(color=["blue", "green"])
balanced_ax.set_xlabel("Result")
balanced_ax.set_ylabel("Frequency")
balanced_ax.set_title("Count of Result(Balanced)")
plt.show()

In [None]:
# Plot the class counts of 'Result' after undersampling.
# NOTE(review): this cell repeats the plot of the preceding cell.
result_balanced = df["Result"].value_counts()
bar_colors = ["blue", "green"]
result_balanced.plot.bar(color=bar_colors)
plt.xlabel("Result")
plt.ylabel("Frequency")
plt.title("Count of Result(Balanced)")
plt.show()

# B- Exploratory Data Analysis

#### Descriptive statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns. The method must be called: a bare `df.describe` only displays the
# bound method object instead of the statistics table.
df.describe()

#### Visual Analysis

In [None]:
# Visual analysis: line plots of four blood measurements on a 2x2 grid.

fig, axis = plt.subplots(nrows=2, ncols=2)
top_row, bottom_row = axis
top_row[0].plot(df["Hemoglobin"])   # upper left
top_row[1].plot(df["MCH"])          # upper right
bottom_row[0].plot(df["MCHC"])      # lower left
bottom_row[1].plot(df["MCV"])       # lower right

In [None]:
# Identifying outliers.
# Horizontal box plot of the hemoglobin values.
hemoglobin = df["Hemoglobin"]
sns.boxplot(hemoglobin, orient="h")
plt.show()

# Scatter plot of hemoglobin against MCV.
plt.scatter(hemoglobin, df["MCV"])
plt.show()

# Scatter plot of MCH against MCHC.
plt.scatter(df["MCH"], df["MCHC"])

#### Univariate Analysis

In [None]:
# Univariate analysis: bar chart of the number of rows per Gender value.

output = df["Gender"].value_counts()
gender_colors = ["orange", "green"]
output.plot.bar(color=gender_colors)
plt.xlabel("Gender")
plt.ylabel("Frequency")
plt.title("Gender count")
plt.show()

In [None]:
# Univariate analysis: distribution plot of hemoglobin with a
# kernel density estimate drawn on top.

hemoglobin_values = df["Hemoglobin"]
sns.displot(hemoglobin_values, kde=True)

#### Bivariate Analysis

In [None]:
# Bivariate analysis: mean hemoglobin per gender, split by anemia result.
plt.figure(figsize=(6, 6))
ax = sns.barplot(y=df['Hemoglobin'], x=df['Gender'], hue=df['Result'], ci=None)

# Name the two gender categories on the x-axis ticks. Passing the list to
# ax.set(xlabel=...) would only print the list's text as the axis title.
# NOTE(review): assumes Gender is encoded 0 = male, 1 = female -- confirm
# against the dataset description.
ax.set_xticks([0, 1])
ax.set_xticklabels(['male', 'female'])
ax.set_xlabel('Gender')

# Write the bar height above every bar of each hue group.
for container in ax.containers:
    ax.bar_label(container)

plt.title("Mean Hemoglobin by Gender and Results")
plt.show()

#### Multivariate Analysis

In [None]:
# Pairwise relationship plots between the columns of the dataset.
sns.pairplot(df)

In [None]:
# Heatmap of the pairwise column correlations, annotated with the coefficients.
correlations = df.corr()
sns.heatmap(correlations, annot=True, cmap="RdYlGn", linewidth=0.2)

# Enlarge the current figure so the annotations stay readable.
fig = plt.gcf()
fig.set_size_inches(10, 8)
plt.show()

### Splitting Data Into Train And Test

In [None]:
# x - independent variables: every column except the target 'Result'

x = df.drop(columns="Result")
x

In [None]:
# y - dependent variable: the 'Result' target column

y = df.loc[:, "Result"]
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=20
)

# Show the shape of each of the four resulting pieces.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

# C- Model Building

## Training the model in multiple algorithms

#### 1-Logistic Regression Model 

In [None]:
# Fit a logistic regression classifier and evaluate it on the test split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

logistic_regression = LogisticRegression().fit(x_train, y_train)
y_pred = logistic_regression.predict(x_test)

# Overall accuracy plus the per-class classification report.
acc_lr = accuracy_score(y_test, y_pred)
c_lr = classification_report(y_test, y_pred)

print('Accuracy Score: ', acc_lr)
print(c_lr)

#### 2-Random forest model

In [None]:
# Fit a random forest classifier and evaluate it on the test split.
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier().fit(x_train, y_train)
y_pred = random_forest.predict(x_test)

# Overall accuracy plus the per-class classification report.
acc_rf = accuracy_score(y_test, y_pred)
c_rf = classification_report(y_test, y_pred)

print('Accuracy Score: ', acc_rf)
print(c_rf)

#### 3-Decision Tree Model

In [None]:
# Fit a decision tree classifier and evaluate it on the test split.
from sklearn.tree import DecisionTreeClassifier

decision_tree_model = DecisionTreeClassifier().fit(x_train, y_train)
y_pred = decision_tree_model.predict(x_test)

# Overall accuracy plus the per-class classification report.
acc_dt = accuracy_score(y_test, y_pred)
c_dt = classification_report(y_test, y_pred)

print('Accuracy Score: ', acc_dt)
print(c_dt)

#### 4-Gaussian Naive Bayes

In [None]:
# Fit a Gaussian naive Bayes classifier and evaluate it on the test split.
from sklearn.naive_bayes import GaussianNB

NB = GaussianNB().fit(x_train, y_train)
y_pred = NB.predict(x_test)

# Overall accuracy plus the per-class classification report.
acc_nb = accuracy_score(y_test, y_pred)
c_nb = classification_report(y_test, y_pred)

print('Accuracy Score: ', acc_nb)
print(c_nb)

#### 5-Support Vector Machine

In [None]:
# Fit a support vector classifier and evaluate it on the test split.
from sklearn.svm import SVC

support_vector = SVC().fit(x_train, y_train)
y_pred = support_vector.predict(x_test)

# Overall accuracy plus the per-class classification report.
acc_svc = accuracy_score(y_test, y_pred)
c_svc = classification_report(y_test, y_pred)

print('Accuracy Score: ', acc_svc)
print(c_svc)

#### 6-Gradient Boosting Classifier 

In [None]:
# Fit a gradient boosting classifier and evaluate it on the test split.
from sklearn.ensemble import GradientBoostingClassifier

GBC = GradientBoostingClassifier().fit(x_train, y_train)
y_pred = GBC.predict(x_test)

# Overall accuracy plus the per-class classification report.
acc_gbc = accuracy_score(y_test, y_pred)
c_gbc = classification_report(y_test, y_pred)

print('Accuracy Score: ', acc_gbc)
print(c_gbc)

## Testing the model


In [None]:
# Predict the result for one hand-written sample with the gradient boosting
# model. The values are wrapped in a DataFrame carrying the training column
# names because the model was fitted on a DataFrame; passing a bare list
# makes scikit-learn warn that the input has no valid feature names.
# NOTE(review): the values are positional and must follow the column order
# of x_train -- confirm the order against the dataset.
sample = pd.DataFrame([[0, 11.6, 22.3, 30.9, 74.5]], columns=x_train.columns)
prediction = GBC.predict(sample)

In [None]:
# Predicted class label for the single sample.
prediction[0]

In [None]:
if prediction[0] == 1:
    print("You don't have any Anemia Disease")
elif prediction[0] == 1:
    print("You have Anemia Disease")

# Performance Testing and Hyperparameter Tuning

#### Testing model with multiple evaluation metrics

In [None]:
# Comparison table of the test-set accuracy of every trained model.
# The labels and the scores are listed in the same order.
model = pd.DataFrame({
    'Model': [
        'Logistic Regression',  # acc_lr comes from LogisticRegression, not a linear regression
        'Decision Tree Classifier',
        'Random Forest Classifier',
        'Gaussian Naive Bayes',
        'Support Vector Classifier',
        'Gradient Boost Classifier'
    ],
    'Score': [acc_lr, acc_dt, acc_rf, acc_nb, acc_svc, acc_gbc],
})

In [None]:
# Display the model comparison table.
model

# Model Deployment

#### Save the best Model

In [None]:
## The Gradient Boosting Classifier is saved as the final model, chosen for its higher accuracy and more precise results

import pickle
import warnings

# Serialize the fitted model to model.pkl. The context manager closes the
# file handle even if pickling fails, so the data is flushed to disk and
# the handle is not leaked.
with open("model.pkl", "wb") as model_file:
    pickle.dump(GBC, model_file)

# The model is now saved in the working directory.
# Next step: integrate the saved model with a web framework through which users can interact and check whether they are anemic.