# CS189 Final Project

Members:

Timothy Quang Nguyen, timotqn2@uci.edu

Aditeji (Fill), (Fill Email)

## Environment Setup

In [3]:
# For package installations
!pip install xgboost
!pip install scikit-optimize
!pip install pandas
!pip install numpy
!pip install seaborn
!pip install matplotlib



In [4]:
# Task: Predict cancer in patients given only data on their lung nodules

import numpy as np # for data organization
import pandas as pd # for data organization

import sklearn as sk # for machine learning
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score # For Evaluation
from sklearn.preprocessing import MinMaxScaler # To Scale Features
from sklearn.preprocessing import OneHotEncoder # To Encode Non-Numeric Columns
from sklearn.model_selection import GridSearchCV, train_test_split # For Hyper Parameter Tuning and Data Splitting
from skopt import BayesSearchCV # For Smart Hyper Parameter Tuning

from sklearn.linear_model import LogisticRegression # For Logistic Regression
from sklearn.ensemble import RandomForestClassifier # For Random Forest Classifier
from sklearn.svm import SVC # For Support Vector Classifier
from xgboost import XGBClassifier # For XGB Classifier

import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for plotting
import zipfile as zf # for unzipping data

In [5]:
# Code to extract dataset from zip (Don't need to run)
# The context manager closes the archive even if extraction raises,
# so the file handle is never leaked.

with zf.ZipFile('archive.zip', 'r') as files:
    files.extractall('dataset')

In [6]:
# Load the training and test feature tables from CSV into pandas DataFrames.
# NOTE(review): the training file is read from the top level of ./dataset while
# the test file is read from the nested 'Lung Nodule/Lung Nodule' folder --
# confirm both paths match the layout of the extracted archive.

train_data = pd.read_csv('./dataset/Features_Train.csv')
test_data = pd.read_csv('./dataset/Lung Nodule/Lung Nodule/Features_Test.csv')

In [7]:
# Display the training DataFrame as this cell's output
train_data

Unnamed: 0,ID,Annulus_N.voxels,Annulus_SNS_vol,Annulus_SNS_area,Annulus_SNS_s2v,Annulus_SNS_sph,Annulus_SNS_sph_dis,Annulus_SNS_com_1,Annulus_SNS_com_2,Annulus_SNS_max3d,...,Lesion_GLCM_sumEnt_HHH_25HUgl,Lesion_GLCM_difVar_HHH_25HUgl,Lesion_GLCM_difEnt_HHH_25HUgl,Lesion_GLCM_AutoCorrel_HHH_25HUgl,Lesion_GLCM_ClTend_HHH_25HUgl,Lesion_GLCM_Homoge1_HHH_25HUgl,Lesion_GLCM_IDMN_HHH_25HUgl,Lesion_GLCM_IDN_HHH_25HUgl,Lesion_GLCM_invVar_HHH_25HUgl,Outcome
0,0001_1,1504,1504,3364,2.236702,0.188709,5.299155,3.779511,0.006720,21,...,1.456343,0.516489,1.478822,12.359264,1.122325,0.672912,0.972564,0.898164,0.519051,0
1,0003_1,1007,1007,2224,2.208540,0.218459,4.577528,3.334505,0.010426,16,...,1.462379,0.535239,1.502895,12.134616,1.132018,0.675078,0.971933,0.898085,0.502191,0
2,0004_1,2160,2160,4322,2.000926,0.186969,5.348493,4.592921,0.006536,17,...,1.050196,0.245968,0.988000,2.261923,0.527141,0.763604,0.905442,0.842403,0.472791,1
3,0005_1,1568,1568,3276,2.089286,0.199237,5.019137,4.010592,0.007909,23,...,1.157852,0.299988,1.146312,6.241820,0.605958,0.733500,0.965438,0.891646,0.504706,1
4,0006_1,779,779,1702,2.184852,0.240556,4.157036,3.083113,0.013920,14,...,1.414994,0.484908,1.434273,6.282063,1.046868,0.692436,0.964913,0.889447,0.487179,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,0615_1,556,556,1124,2.021583,0.290916,3.437418,2.901721,0.024621,12,...,1.034586,0.239848,0.970242,2.343671,0.523414,0.763506,0.905402,0.842337,0.472988,0
605,0629_1,1087,1087,2036,1.873045,0.251107,3.982361,3.817708,0.015834,15,...,1.156824,0.303016,1.149835,6.254821,0.622207,0.741585,0.966023,0.894674,0.484972,0
606,0639_2,492,492,1056,2.146341,0.285406,3.503776,2.676790,0.023248,14,...,2.556892,7.991275,2.973143,210.145931,15.214457,0.405601,0.979103,0.909242,0.302455,0
607,0646_1,1476,1476,2874,1.947154,0.218133,4.584357,4.119586,0.010379,21,...,1.183595,0.317478,1.182046,6.145671,0.634903,0.726693,0.963519,0.888337,0.509707,1


In [8]:
# Display the test DataFrame as this cell's output
test_data

Unnamed: 0,ID,Annulus_N voxels,Annulus_SNS_vol,Annulus_SNS_area,Annulus_SNS_s2v,Annulus_SNS_sph,Annulus_SNS_sph_dis,Annulus_SNS_com_1,Annulus_SNS_com_2,Annulus_SNS_max3d,...,Lesion_GLCM_sumEnt_HHH_25HUgl,Lesion_GLCM_difVar_HHH_25HUgl,Lesion_GLCM_difEnt_HHH_25HUgl,Lesion_GLCM_AutoCorrel_HHH_25HUgl,Lesion_GLCM_ClTend_HHH_25HUgl,Lesion_GLCM_Homoge1_HHH_25HUgl,Lesion_GLCM_IDMN_HHH_25HUgl,Lesion_GLCM_IDN_HHH_25HUgl,Lesion_GLCM_invVar_HHH_25HUgl,Outcome
0,0454_1,2297,2297,4624,2.013061,0.182071,5.492369,4.669182,0.006036,23,...,1.039347,0.245617,0.987064,2.250009,0.512441,0.756273,0.902509,0.837515,0.487455,1
1,0455_1,1655,1655,3370,2.036254,0.200780,4.980586,4.154032,0.008094,22,...,1.044802,0.244690,0.984586,2.256110,0.522768,0.761412,0.904565,0.840941,0.477177,0
2,0455_2,1960,1960,4172,2.128571,0.181542,5.508366,4.266957,0.005983,22,...,1.056757,0.251109,1.008803,2.263776,0.522695,0.756632,0.951032,0.878203,0.484911,0
3,0455_3,1782,1782,3788,2.125701,0.187649,5.329102,4.137387,0.006608,21,...,1.047968,0.249339,0.998092,2.238642,0.513910,0.757087,0.902835,0.838058,0.485826,0
4,0456_1,600,600,1322,2.203333,0.260228,3.842789,2.810317,0.017622,15,...,1.310203,0.423329,1.351448,12.136032,0.850684,0.704249,0.969073,0.895660,0.499861,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,0669_1,549,549,1200,2.185792,0.270199,3.700970,2.742900,0.019727,12,...,1.049768,0.243226,0.980279,2.258705,0.534523,0.767546,0.907018,0.845031,0.464908,0
123,0670_1,987,987,2126,2.154002,0.225493,4.434735,3.367958,0.011466,19,...,1.055271,0.246555,0.990005,2.280151,0.532569,0.766461,0.906585,0.844308,0.467077,0
124,0671_1,415,415,912,2.197590,0.295022,3.389576,2.489678,0.025678,11,...,1.009543,0.235130,0.956455,2.056056,0.507207,0.764303,0.905721,0.842868,0.471395,0
125,0671_2,665,665,1420,2.135338,0.259464,3.854101,2.969759,0.017467,14,...,1.061595,0.244005,0.982547,2.184166,0.550580,0.777429,0.910971,0.851619,0.445143,0


## Function Declaration

In [10]:
# Used to produce confusion matrices and accuracy metrics for models
def confusion_matrix_and_metrics(y_te, y_pred, label_nums, label_names):
    """Plot a confusion-matrix heatmap and print summary classification metrics.

    Parameters
    ----------
    y_te : array-like
        True labels.
    y_pred : array-like
        Predicted labels to compare against ``y_te``.
    label_nums : list
        Label values forwarded to ``confusion_matrix`` as ``labels``.
    label_names : list
        Tick labels drawn on both heatmap axes
        (presumably in the same order as ``label_nums`` -- confirm at call sites).

    Returns
    -------
    None
        The figure is shown with ``plt.show()`` and the metrics are printed.
    """
    # Used to produce a pretty Confusion Matrix
    cm = confusion_matrix(y_te, y_pred, labels=label_nums)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=label_names, yticklabels=label_names)

    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.show()

    # Used to produce metric data: precision, recall, F1 score, accuracy.
    # Precision, recall and F1 are macro-averaged; accuracy is printed as a
    # percentage rounded to two decimals. Every label says "Model" because this
    # helper is model-agnostic (the old "MLP" wording was misleading).
    print("Precision of the Model :\t" + str(precision_score(y_te, y_pred, average="macro")))
    print("Recall of the Model :\t" + str(recall_score(y_te, y_pred, average="macro")))
    print("F1 Score of the Model :\t" + str(f1_score(y_te, y_pred, average="macro")))
    print("Accuracy Score of the Model :\t" + str(round(accuracy_score(y_te, y_pred) * 100, 2)) + "%")

In [18]:
# Search for all column names that don't match
def non_matching_columns(data_one, data_two):
    """Return the column names that differ between two frames, position by position.

    Parameters
    ----------
    data_one, data_two : objects exposing a ``columns`` sequence
        The two tables whose headers are compared.

    Returns
    -------
    list of tuple
        One ``(data_one_name, data_two_name)`` tuple per position where the
        names differ. When one frame has more columns than the other, each
        unpaired trailing column is reported with ``None`` on the missing side
        instead of raising ``IndexError`` or being silently skipped.
    """
    cols_one = list(data_one.columns)
    cols_two = list(data_two.columns)
    shared = min(len(cols_one), len(cols_two))

    # Positions present in both headers: keep only the pairs that disagree
    non_matching = [
        (name_one, name_two)
        for name_one, name_two in zip(cols_one, cols_two)
        if name_one != name_two
    ]

    # Trailing columns that exist in only one of the two frames
    non_matching.extend((name_one, None) for name_one in cols_one[shared:])
    non_matching.extend((None, name_two) for name_two in cols_two[shared:])

    return non_matching

In [34]:
# To showcase general data properties
def print_general_data_properties(data):
    """Print the row count, column count, column names and column dtypes of ``data``.

    ``data`` must expose a two-element ``shape`` plus ``columns`` and ``dtypes``.
    Nothing is returned; the report is written to standard output with a blank
    line between the three sections.
    """
    n_rows, n_cols = data.shape

    # Each entry is a (label, value) pair to print; None marks a blank separator line
    report = (
        ("Number of rows: ", n_rows),
        ("Number of columns: ", n_cols),
        None,
        ("All Columns:\n", data.columns),
        None,
        ("Column Types:\n", data.dtypes),
    )

    for entry in report:
        if entry is None:
            print()
        else:
            print(*entry)

## Data Exploration

In [21]:
# Check for column name inconsistency between the training and test data.
# non_matching_columns compares the two headers position by position and
# returns (train_name, test_name) tuples for the positions that differ.
non_matching = non_matching_columns(train_data, test_data)

# Report how many mismatches were found, then the mismatched pairs themselves
print("Number of Non-Matching Columns: ", len(non_matching))
print("Non-Matching Columns:\n", non_matching)

Number of Non-Matching Columns:  30
Non-Matching Columns:
 [('Annulus_N.voxels', 'Annulus_N voxels'), ('Annulus_AUC.CSH', 'Annulus_AUC-CSH'), ('Annulus_AUC.CSH_LLL', 'Annulus_AUC-CSH_LLL'), ('Annulus_AUC.CSH_LLH', 'Annulus_AUC-CSH_LLH'), ('Annulus_AUC.CSH_LHL', 'Annulus_AUC-CSH_LHL'), ('Annulus_AUC.CSH_LHH', 'Annulus_AUC-CSH_LHH'), ('Annulus_AUC.CSH_HLL', 'Annulus_AUC-CSH_HLL'), ('Annulus_AUC.CSH_HLH', 'Annulus_AUC-CSH_HLH'), ('Annulus_AUC.CSH_HHL', 'Annulus_AUC-CSH_HHL'), ('Annulus_AUC.CSH_HHH', 'Annulus_AUC-CSH_HHH'), ('Background_N.voxels', 'Background_N voxels'), ('Background_AUC.CSH', 'Background_AUC-CSH'), ('Background_AUC.CSH_LLL', 'Background_AUC-CSH_LLL'), ('Background_AUC.CSH_LLH', 'Background_AUC-CSH_LLH'), ('Background_AUC.CSH_LHL', 'Background_AUC-CSH_LHL'), ('Background_AUC.CSH_LHH', 'Background_AUC-CSH_LHH'), ('Background_AUC.CSH_HLL', 'Background_AUC-CSH_HLL'), ('Background_AUC.CSH_HLH', 'Background_AUC-CSH_HLH'), ('Background_AUC.CSH_HHL', 'Background_AUC-CSH_HHL'), ('

In [35]:
# General data properties: print shape, column names and dtypes for the
# training frame, then a blank separator line, then the same for the test frame
print_general_data_properties(train_data)
print()
print_general_data_properties(test_data)

Number of rows:  609
Number of columns:  2000

All Columns:
 Index(['ID', 'Annulus_N.voxels', 'Annulus_SNS_vol', 'Annulus_SNS_area',
       'Annulus_SNS_s2v', 'Annulus_SNS_sph', 'Annulus_SNS_sph_dis',
       'Annulus_SNS_com_1', 'Annulus_SNS_com_2', 'Annulus_SNS_max3d',
       ...
       'Lesion_GLCM_sumEnt_HHH_25HUgl', 'Lesion_GLCM_difVar_HHH_25HUgl',
       'Lesion_GLCM_difEnt_HHH_25HUgl', 'Lesion_GLCM_AutoCorrel_HHH_25HUgl',
       'Lesion_GLCM_ClTend_HHH_25HUgl', 'Lesion_GLCM_Homoge1_HHH_25HUgl',
       'Lesion_GLCM_IDMN_HHH_25HUgl', 'Lesion_GLCM_IDN_HHH_25HUgl',
       'Lesion_GLCM_invVar_HHH_25HUgl', 'Outcome'],
      dtype='object', length=2000)

Column Types:
 ID                                 object
Annulus_N.voxels                    int64
Annulus_SNS_vol                     int64
Annulus_SNS_area                    int64
Annulus_SNS_s2v                   float64
                                   ...   
Lesion_GLCM_Homoge1_HHH_25HUgl    float64
Lesion_GLCM_IDMN_HHH_25HUgl  