# Import Dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme so the figures drawn below share one style.
sns.set()

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

%matplotlib inline

# Explore and Clean Dataset

## Leukemia Classes Data

In [2]:
# Load the per-patient class labels; the preview in the next cell shows
# a `patient` id column and a `cancer` label column.
cancer_type_df = pd.read_csv('cancer_genetics.csv')

In [3]:
# Preview the first five label rows.
cancer_type_df.head(n=5)

Unnamed: 0,patient,cancer
0,1,ALL
1,2,ALL
2,3,ALL
3,4,ALL
4,5,ALL


In [4]:
# Class balance of the raw string labels.
cancer_labels = cancer_type_df['cancer']
cancer_labels.value_counts()

ALL    47
AML    25
Name: cancer, dtype: int64

#### The cancer types are labelled ALL and AML, so we encode them as numeric values (ALL → 0, AML → 1) for use as model inputs.

In [5]:
# Encode the leukemia class labels numerically: ALL -> 0, AML -> 1.
# The replacement is limited to the `cancer` column so that an identical
# string appearing in any other column can never be rewritten by accident.
# Re-running the cell is harmless: already-encoded values are left as they are.
cancer_type_df = cancer_type_df.assign(
    cancer=cancer_type_df['cancer'].replace({'ALL': 0, 'AML': 1})
)

In [6]:
# Confirm the encoding: the counts should now be keyed by 0 and 1.
encoded_labels = cancer_type_df['cancer']
encoded_labels.value_counts()

0    47
1    25
Name: cancer, dtype: int64

## Genetics Training Data

In [7]:
# Load the training gene-expression table. The preview in the next cell shows
# gene description / accession columns followed by numbered columns, each
# paired with a `call` column.
df_train = pd.read_csv('cancer_genetics_train.csv')

In [8]:
# Preview the last five rows of the training table.
df_train.tail(n=5)

Unnamed: 0,Gene Description,Gene Accession Number,1,call,2,call.1,3,call.2,4,call.3,...,29,call.33,30,call.34,31,call.35,32,call.36,33,call.37
7124,PTGER3 Prostaglandin E receptor 3 (subtype EP3...,X83863_at,793,A,782,A,1138,A,627,A,...,279,A,737,A,588,A,1170,A,2315,A
7125,HMG2 High-mobility group (nonhistone chromosom...,Z17240_at,329,A,295,A,777,P,170,A,...,51,A,227,A,361,A,284,A,250,A
7126,RB1 Retinoblastoma 1 (including osteosarcoma),L49218_f_at,36,A,11,A,41,A,-50,A,...,6,A,-9,A,-26,A,39,A,-12,A
7127,GB DEF = Glycophorin Sta (type A) exons 3 and ...,M71243_f_at,191,A,76,A,228,A,126,A,...,2484,P,371,A,133,A,298,A,790,P
7128,GB DEF = mRNA (clone 1A7),Z78285_f_at,-37,A,-14,A,-41,A,-91,A,...,-2,A,-31,A,-32,A,-3,A,-10,A


In [9]:
# List the column labels of the training table.
# NOTE(review): the stored output for this cell is a NameError about
# `genetics_df`, a name this cell does not use — the output is stale; re-run.
df_train.columns

NameError: name 'genetics_df' is not defined

In [5]:
# Summarise the column dtypes and non-null counts of the training table.
# NOTE(review): the stored output describes a 72-row frame with only
# `patient` and `cancer` columns, which does not match the df_train preview
# above — the output looks stale and the execution count is out of order; re-run.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   patient  72 non-null     int64 
 1   cancer   72 non-null     object
dtypes: int64(1), object(1)
memory usage: 1.2+ KB


In [None]:
# Boolean mask marking every missing cell in the training table.
df_train.isna()

In [None]:
# Number of missing values per column.
missing_mask = df_train.isna()
missing_mask.sum()

## Replace 0s with NaN

In [None]:
# Columns in which a recorded 0 is treated as a missing measurement.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# NOTE(review): `diabetes_df` is not defined anywhere in the visible notebook —
# confirm it is loaded before this cell, otherwise a fresh-kernel run fails here.
# Work on a deep copy so the original frame is left untouched.
diabetes_copy = diabetes_df.copy(deep=True)

# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
diabetes_copy[zero_as_missing_cols] = diabetes_copy[zero_as_missing_cols].replace(0, np.nan)

In [None]:
# Missing values per column after turning zeros into NaN.
diabetes_copy.isna().sum()

## Replace NaNs with Medians

In [None]:
# Impute each listed column's missing values with that column's own median.
# The result is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on a column selection: that form is chained
# assignment, which emits a FutureWarning in recent pandas and does not update
# the parent frame when Copy-on-Write is enabled.
median_fill_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
column_medians = diabetes_copy[median_fill_cols].median()
diabetes_copy[median_fill_cols] = diabetes_copy[median_fill_cols].fillna(column_medians)

## Visualize as Boxplot

In [None]:
# Draw one boxplot per column, each in its own numbered figure, to eyeball
# outliers. The unused `count` variable from the original cell is removed.
for fig_num, column in enumerate(diabetes_copy.columns):
    plt.figure(fig_num)
    sns.boxplot(x=diabetes_copy[column])

# Remove Outliers Using Z Scores

In [None]:
import scipy.stats as stats

# Absolute z-score of every cell (zscore standardises along axis 0, i.e. per column).
z = np.abs(stats.zscore(diabetes_copy))

# Keep only the rows in which every column lies within 3 standard deviations.
within_three_sigma = (z < 3).all(axis=1)
data_clean = diabetes_copy[within_three_sigma]
data_clean.shape

In [None]:
# Preview the outlier-filtered frame. Only the first rows are shown rather than
# the whole frame; its shape is already reported by the previous cell.
data_clean.head()

# Data Correlation

In [None]:
# Pairwise correlations of the cleaned columns, annotated on a heatmap.
corr_matrix = data_clean.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2g')

In [None]:
# Count the 1 and 0 values in the Outcome column and show them as a bar chart.
# The counts are computed once and reused. The `color_wheel` / `colors` values
# built by the original cell were never passed to any plot and are removed.
outcome_counts = data_clean.Outcome.value_counts()
print(outcome_counts)
p = outcome_counts.plot(kind="bar")

In [None]:
# Standardise every feature column (everything except the Outcome target).
# Column labels are taken from the feature frame itself rather than a
# hand-typed list, so they cannot drift out of sync with the data.
# NOTE(review): a later cell reassigns `X` from `diabetes_df`, so this scaled
# frame is not what the models below are trained on — confirm the intent.
sc_X = StandardScaler()
feature_frame = diabetes_copy.drop(["Outcome"], axis=1)
X = pd.DataFrame(sc_X.fit_transform(feature_frame), columns=feature_frame.columns)
X.head()

# Build Machine Learning Model

In [None]:
# Split the frame into the target (Outcome) and the feature matrix (all other columns).
# NOTE(review): this reads the raw `diabetes_df`, replacing the scaled `X` built
# earlier and bypassing `diabetes_copy` / `data_clean`; the imputation, outlier
# removal and scaling above therefore do not reach the models — confirm the intent.
y = diabetes_df['Outcome']
X = diabetes_df.drop(columns='Outcome')

In [None]:
#test size 20% and train size 80%
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=7)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees. random_state is pinned (to the same seed the
# train/test split uses) so the reported scores and feature importances are
# reproducible across kernel restarts; without it every run trains a different forest.
rfc = RandomForestClassifier(n_estimators=200, random_state=7)
rfc.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Accuracy on the training rows themselves (a measure of fit, not of generalisation).
rfc_train = rfc.predict(X_train)
train_accuracy = metrics.accuracy_score(y_train, rfc_train)
print("Accuracy_Score =", f"{train_accuracy}")

In [None]:
from sklearn import metrics

# Accuracy of the random forest on the held-out test rows.
predictions = rfc.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy_Score =", f"{test_accuracy}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix followed by per-class precision / recall / F1 for the random forest.
rfc_confusion = confusion_matrix(y_test, predictions)
rfc_report = classification_report(y_test, predictions)
print(rfc_confusion)
print(rfc_report)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default hyper-parameters. random_state is pinned
# (to the same seed the train/test split uses) so that tie-breaking between
# equally good splits, and therefore the reported scores, are reproducible.
dtree = DecisionTreeClassifier(random_state=7)
dtree.fit(X_train, y_train)

In [None]:
# Decision-tree predictions on the test rows and their per-class metrics.
y_pred = dtree.predict(X_test)
dtree_report = classification_report(y_test, y_pred)
print("Classification report - \n", dtree_report)

In [None]:
# Confusion matrix of the decision tree drawn as an annotated heatmap, with
# the test accuracy in the title.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,5))
# fmt='d' prints the cell counts as plain integers; seaborn's default '.2g'
# would render any count of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(data=cm, linewidths=.5, annot=True, fmt='d', square=True, cmap='Blues')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(dtree.score(X_test, y_test))
plt.title(all_sample_title, size = 15)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Second column of predict_proba, i.e. the probability assigned to the last
# class in dtree.classes_.
y_pred_proba = dtree.predict_proba(X_test)[:, 1]

# Put the true labels next to the predicted probabilities, indexed like y_test.
df_actual_predicted = pd.DataFrame(
    {'y_actual': np.array(y_test), 'y_pred_proba': y_pred_proba}
)
df_actual_predicted.index = y_test.index

actual_labels = df_actual_predicted['y_actual']
predicted_scores = df_actual_predicted['y_pred_proba']
fpr, tpr, tr = roc_curve(actual_labels, predicted_scores)
auc = roc_auc_score(actual_labels, predicted_scores)

# ROC curve with the chance diagonal drawn as a dashed reference line.
plt.plot(fpr, tpr, label='AUC = %0.4f' % auc)
plt.plot(fpr, fpr, linestyle='--', color='k')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve', size=15)
plt.legend()

# XGBoost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees; gamma is the only hyper-parameter set explicitly.
xgb_params = {'gamma': 0}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Accuracy of the XGBoost model on the held-out test rows.
xgb_pred = xgb_model.predict(X_test)
xgb_accuracy = metrics.accuracy_score(y_test, xgb_pred)
print("Accuracy Score =", f"{xgb_accuracy}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix followed by per-class metrics for the XGBoost predictions.
xgb_confusion = confusion_matrix(y_test, xgb_pred)
xgb_report = classification_report(y_test, xgb_pred)
print(xgb_confusion)
print(xgb_report)

# Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings, fitted on the training split.
svc_model = SVC()
svc_model.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels of the support vector classifier for the test rows.
svc_pred = svc_model.predict(X_test)

In [None]:
from sklearn import metrics

# Accuracy of the support vector classifier on the held-out test rows.
svc_accuracy = metrics.accuracy_score(y_test, svc_pred)
print("Accuracy Score =", f"{svc_accuracy}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix followed by per-class metrics for the SVC predictions.
svc_confusion = confusion_matrix(y_test, svc_pred)
svc_report = classification_report(y_test, svc_pred)
print(svc_confusion)
print(svc_report)

# Feature Importance

In [None]:
# Importance score the fitted random forest assigns to each input feature,
# in the column order of the training data.
rfc.feature_importances_

In [None]:
# Label each importance score with its feature name and draw a horizontal bar chart.
feature_importance = pd.Series(rfc.feature_importances_, index=X.columns)
feature_importance.plot(kind='barh')

# Saving Model

In [None]:
import pickle

# Serialise the fitted random forest to an in-memory bytes object with
# pickle.dumps(); nothing is written to disk in this cell.
saved_model = pickle.dumps(rfc)

# Rebuild a model object from those bytes. Only unpickle data you produced
# yourself: loading an untrusted pickle can execute arbitrary code.
rfc_from_pickle = pickle.loads(saved_model)

# Use the round-tripped model to make predictions on the test rows.
rfc_from_pickle.predict(X_test)

In [None]:
# First five rows of the diabetes frame, used to pick sample patients below.
diabetes_df.head(n=5)

In [None]:
# Last five rows of the diabetes frame, used to pick sample patients below.
diabetes_df.tail(n=5)

In [None]:
# Predict the outcome for one hand-entered patient record
# (original note: "4th patient" — presumably the row labelled 4 in diabetes_df; verify).
# The values are wrapped in a DataFrame carrying the training column names
# because rfc was fitted on a DataFrame: a bare nested list relies purely on
# positional order and makes scikit-learn warn about missing feature names.
patient_4 = pd.DataFrame(
    [[0, 137, 40, 35, 168, 43.1, 2.228, 33]],
    columns=X_train.columns,
)
rfc.predict(patient_4)

In [None]:
# Predict the outcome for one hand-entered patient record
# (original note: 763rd patient — presumably the row labelled 763 in diabetes_df; verify).
# The values are wrapped in a DataFrame carrying the training column names
# because rfc was fitted on a DataFrame: a bare nested list relies purely on
# positional order and makes scikit-learn warn about missing feature names.
patient_763 = pd.DataFrame(
    [[10, 101, 76, 48, 180, 32.9, 0.171, 63]],
    columns=X_train.columns,
)
rfc.predict(patient_763)