In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
def machine_learning(data,algorithm):
    """Train one classifier on a CSV dataset, plot its ROC curve, and return its result.

    The CSV must contain a ``Survival`` column holding exactly two distinct
    label values; every other column is used as a feature. Labels are
    re-encoded once, up front, so that the smaller value becomes 0 and the
    larger becomes 1 (e.g. 1/2 -> 0/1). Training, prediction, the ROC curve
    and the confusion matrix therefore all share one encoding.

    Parameters
    ----------
    data : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.
    algorithm : str
        One of ``"logistic_regression"``, ``"random_forest"``,
        ``"support_vector_machine"`` or ``"neural_network"``.

    Returns
    -------
    numpy.ndarray
        For the three scikit-learn algorithms: the 2x2 confusion matrix of
        the hold-out split (rows = true class, columns = predicted class,
        classes ordered 0, 1).
        For ``"neural_network"``: the softmax outputs of the network on the
        hold-out split, one row per sample and one column per class.

    Raises
    ------
    ValueError
        If ``algorithm`` is not one of the supported names, or if the
        ``Survival`` column does not contain exactly two distinct values.
    """
    supported = ("logistic_regression", "random_forest",
                 "support_vector_machine", "neural_network")
    # Fail fast, before any file I/O, instead of silently returning None.
    if algorithm not in supported:
        raise ValueError("unknown algorithm %r; expected one of %s"
                         % (algorithm, ", ".join(supported)))

    #reading dataset
    df = pd.read_csv(data)
    print("checking skewness of dataset...")
    print(df.skew())

    y = _binarize_labels(df.Survival.values)
    # NOTE: the scaling statistics are taken over all rows (train and test).
    x = _min_max_scale(df.drop(["Survival"],axis=1))

    # KFold is only used to carve out a single hold-out split: the last of
    # the 10 folds. The other folds are not evaluated, so the reported
    # numbers are a single train/test estimate, not a cross-validated one.
    kf = KFold(n_splits=10, random_state=None)
    train_index, test_index = list(kf.split(x))[-1]
    xtrain, xtest = x.iloc[train_index,:], x.iloc[test_index,:]
    ytrain, ytest = y[train_index], y[test_index]

    if algorithm=="neural_network":
        return _run_neural_network(xtrain, ytrain, xtest, ytest)

    if algorithm=="logistic_regression":
        model, model_label = LogisticRegression(), 'Logistic Regression'
    elif algorithm=="random_forest":
        model = RandomForestClassifier(n_estimators=20, random_state=0)
        model_label = 'Random Forest'
    else:
        model, model_label = SVC(probability=True), 'SVM'

    model.fit(xtrain,ytrain)
    y_pred = model.predict(xtest)
    # Column 1 of predict_proba is the probability of the positive class (1).
    positive_scores = model.predict_proba(xtest)[:, 1]
    _plot_roc(ytest, positive_scores, model_label)
    # labels=[0, 1] keeps the matrix 2x2 even if the hold-out split or the
    # predictions happen to contain only one of the classes.
    return metrics.confusion_matrix(ytest, y_pred, labels=[0, 1])


def _binarize_labels(labels):
    """Map a two-valued label array onto 0 (smaller value) and 1 (larger value)."""
    labels = np.asarray(labels)
    classes = np.unique(labels)
    if classes.size != 2:
        raise ValueError("expected exactly two distinct labels in 'Survival', "
                         "found %d" % classes.size)
    return (labels == classes[1]).astype(int)


def _min_max_scale(features):
    """Rescale every column of a DataFrame to [0, 1] using its own min and max."""
    # Use explicit column-wise reductions: np.min/np.max on a DataFrame
    # reduce over both axes (to a scalar) in recent pandas versions.
    col_min = features.min(axis=0)
    col_range = features.max(axis=0) - col_min
    # A constant column has range 0; divide by 1 so it becomes all zeros
    # rather than NaN.
    col_range = col_range.where(col_range != 0, 1)
    return (features - col_min) / col_range


def _plot_roc(y_true, scores, model_label):
    """Draw the ROC curve of ``scores`` next to a constant-score baseline."""
    # A constant score cannot rank samples, so it yields the chance diagonal.
    baseline = np.zeros(len(y_true))
    baseline_auc = roc_auc_score(y_true, baseline)
    model_auc = roc_auc_score(y_true, scores)
    base_fpr, base_tpr, _ = roc_curve(y_true, baseline)
    model_fpr, model_tpr, _ = roc_curve(y_true, scores)
    plt.plot(base_fpr, base_tpr, linestyle='--',
             label='Random prediction (AUROC = %0.3f)' % baseline_auc)
    plt.plot(model_fpr, model_tpr, marker='.',
             label='%s (AUROC = %0.3f)' % (model_label, model_auc))
    # Title
    plt.title('ROC Plot')
    # Axis labels
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Show legend
    plt.legend()
    # Show plot
    plt.show()


def _run_neural_network(xtrain, ytrain, xtest, ytest):
    """Fit the small dense network, report accuracy, plot ROC, return softmax outputs."""
    # Hand Keras plain float arrays rather than DataFrames.
    xtrain = xtrain.to_numpy(dtype="float32")
    xtest = xtest.to_numpy(dtype="float32")
    # Labels are already 0/1 here, so depth=2 one-hot encoding covers both
    # classes (an index >= depth would encode as an all-zero row).
    ytrain_onehot = tf.one_hot(ytrain, depth=2)
    ytest_onehot = tf.one_hot(ytest, depth=2)

    model_1 = tf.keras.Sequential([
               tf.keras.layers.Dense(3, activation='relu'),
               tf.keras.layers.Dense(3, activation='relu'),
               tf.keras.layers.Dense(4, activation='relu'),
               tf.keras.layers.Dense(2, activation='softmax')
    ])

    model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                     optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                     metrics=['accuracy'])

    model_1.fit(xtrain,
                ytrain_onehot,
                epochs=250,
                verbose = 1,
                validation_data=(xtest, ytest_onehot))

    _, acc = model_1.evaluate(xtest, ytest_onehot)
    print(acc)

    # Predict once; the last column is the score of the positive class (1).
    class_probs = model_1.predict(xtest)
    _plot_roc(ytest, class_probs[:, -1], 'NN')
    return class_probs
 

In [3]:
# Run each supported algorithm on the same dataset, in order, and print what
# it returns (a confusion matrix, or the class probabilities for the network).
for algorithm_name in ("logistic_regression",
                       "random_forest",
                       "support_vector_machine",
                       "neural_network"):
    print(machine_learning("cancer.csv", algorithm_name))


FileNotFoundError: [Errno 2] No such file or directory: 'cancer.csv'