In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the cancer dataset from a CSV file in the current working directory.
dataset = pd.read_csv(filepath_or_buffer = 'Cancer_data.csv')

In [None]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
dataset.shape

(1500, 9)

In [None]:
# Preview the first rows of the dataset.
dataset.head()

Unnamed: 0,Age,Gender,BMI,Smoking,GeneticRisk,PhysicalActivity,AlcoholIntake,CancerHistory,Diagnosis
0,58,1,16.085313,0,1,8.146251,4.148219,1,1
1,71,0,30.828784,0,1,9.36163,3.519683,0,0
2,48,1,38.785084,0,2,5.135179,4.728368,0,1
3,34,0,30.040296,0,0,9.502792,2.044636,0,0
4,62,1,35.479721,0,0,5.35689,3.309849,0,1


In [None]:
# Preview the last rows of the dataset.
dataset.tail()

Unnamed: 0,Age,Gender,BMI,Smoking,GeneticRisk,PhysicalActivity,AlcoholIntake,CancerHistory,Diagnosis
1495,62,1,25.090025,0,0,9.892167,1.284158,0,1
1496,31,0,33.447125,0,1,1.668297,2.280636,1,1
1497,63,1,32.613861,1,1,0.466848,0.150101,0,1
1498,55,0,25.568216,0,0,7.795317,1.986138,1,1
1499,67,1,23.663104,0,0,2.52586,2.8566,1,0


In [None]:
# Print column names, non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               1500 non-null   int64  
 1   Gender            1500 non-null   int64  
 2   BMI               1500 non-null   float64
 3   Smoking           1500 non-null   int64  
 4   GeneticRisk       1500 non-null   int64  
 5   PhysicalActivity  1500 non-null   float64
 6   AlcoholIntake     1500 non-null   float64
 7   CancerHistory     1500 non-null   int64  
 8   Diagnosis         1500 non-null   int64  
dtypes: float64(3), int64(6)
memory usage: 105.6 KB


In [None]:
# Count missing values per column (isna is the same check as isnull).
dataset.isna().sum()

Age                 0
Gender              0
BMI                 0
Smoking             0
GeneticRisk         0
PhysicalActivity    0
AlcoholIntake       0
CancerHistory       0
Diagnosis           0
dtype: int64

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,Age,Gender,BMI,Smoking,GeneticRisk,PhysicalActivity,AlcoholIntake,CancerHistory,Diagnosis
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,50.32,0.490667,27.513321,0.269333,0.508667,4.897929,2.417987,0.144,0.371333
std,17.640968,0.50008,7.230012,0.443761,0.678895,2.866162,1.419318,0.351207,0.483322
min,20.0,0.0,15.000291,0.0,0.0,0.00241,0.001215,0.0,0.0
25%,35.0,0.0,21.483134,0.0,0.0,2.434609,1.210598,0.0,0.0
50%,51.0,0.0,27.598494,0.0,0.0,4.834316,2.382971,0.0,0.0
75%,66.0,1.0,33.850837,1.0,1.0,7.409896,3.585624,0.0,1.0
max,80.0,1.0,39.958688,1.0,2.0,9.994607,4.987115,1.0,1.0


In [None]:
# Class balance of the target column: number of rows per Diagnosis value.
dataset['Diagnosis'].value_counts()

Diagnosis
0    943
1    557
Name: count, dtype: int64

In [None]:
# Target vector: the Diagnosis column.
Y = dataset['Diagnosis']
# Feature matrix: every column except the target.
X = dataset.drop(columns = 'Diagnosis')

In [None]:
# Plain-text dump of the feature matrix (pandas truncates long frames).
print(X)

      Age  Gender        BMI  Smoking  GeneticRisk  PhysicalActivity  \
0      58       1  16.085313        0            1          8.146251   
1      71       0  30.828784        0            1          9.361630   
2      48       1  38.785084        0            2          5.135179   
3      34       0  30.040296        0            0          9.502792   
4      62       1  35.479721        0            0          5.356890   
...   ...     ...        ...      ...          ...               ...   
1495   62       1  25.090025        0            0          9.892167   
1496   31       0  33.447125        0            1          1.668297   
1497   63       1  32.613861        1            1          0.466848   
1498   55       0  25.568216        0            0          7.795317   
1499   67       1  23.663104        0            0          2.525860   

      AlcoholIntake  CancerHistory  
0          4.148219              1  
1          3.519683              0  
2          4.728368     

In [None]:
# Plain-text dump of the target Series.
print(Y)

0       1
1       0
2       1
3       0
4       1
       ..
1495    1
1496    1
1497    1
1498    1
1499    0
Name: Diagnosis, Length: 1500, dtype: int64


In [None]:
# Standardise every feature column using statistics learned from X.
# NOTE(review): the scaler is fitted on the full feature matrix here, i.e.
# before any train/test split is made, so its mean/std include all rows.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Split the raw features first, then fit the scaler on the training rows only,
# so that no test-set statistics (mean/std) leak into the model's inputs.
# test_size, stratify and random_state are unchanged, so the assignment of rows
# to the train and test partitions is the same as when splitting scaled data.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from the training rows
X_test = scaler.transform(X_test_raw)        # apply the training statistics to the test rows

In [None]:
# Sanity check: shapes of the full feature matrix and of the two partitions.
print(*(part.shape for part in (X, X_train, X_test)))

(1500, 8) (1200, 8) (300, 8)


In [None]:
# Fit a logistic-regression classifier on the training partition;
# max_iter is set to 1000 for the solver.
regressor = LogisticRegression(max_iter = 1000)
regressor.fit(X = X_train, y = Y_train)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the classifier on the training partition.
accuracies = cross_val_score(regressor, X_train, Y_train, cv = 10)

# Per-fold scores, then their mean and standard deviation.
for summary in (accuracies, accuracies.mean(), accuracies.std()):
    print(summary)

[0.84166667 0.81666667 0.86666667 0.825      0.83333333 0.88333333
 0.825      0.86666667 0.825      0.85833333]
0.8441666666666666
0.02174664725116649


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: four values of C crossed with six solvers.
parameters = [
    {
        'C': [0.25, 0.5, 0.75, 1],
        'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    }
]

# Exhaustive search scored by accuracy with 10-fold CV, using all CPU cores.
grid_search = GridSearchCV(regressor, parameters, scoring = 'accuracy', cv = 10, n_jobs = -1)
grid_search.fit(X_train, Y_train)

# Best mean cross-validated accuracy and the parameter combination that achieved it.
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print(best_accuracy)
print(best_parameters)

0.8474999999999999
{'C': 0.25, 'solver': 'liblinear'}


In [None]:
# Evaluate the fitted model on the data it was trained on.
training_data_prediction = regressor.predict(X_train)
# ROC AUC is a ranking metric: score it on the predicted probability of the
# positive class (column 1 of predict_proba), not on thresholded 0/1 labels,
# which would collapse the ROC curve to a single operating point.
training_data_proba = regressor.predict_proba(X_train)[:, 1]

training_data_accuracy_score = accuracy_score(Y_train, training_data_prediction)
precision = precision_score(Y_train, training_data_prediction)
recall = recall_score(Y_train, training_data_prediction)
f1 = f1_score(Y_train, training_data_prediction)
roc_auc = roc_auc_score(Y_train, training_data_proba)

print("Accuracy on training data:", training_data_accuracy_score)
print("Precision on training data:", precision)
print("Recall on training data:", recall)
print("F1 Score on training data:", f1)
print("ROC AUC Score on training data:", roc_auc)

In [None]:
# Evaluate the fitted model on the held-out test partition.
testing_data_prediction = regressor.predict(X_test)
# ROC AUC is a ranking metric: score it on the predicted probability of the
# positive class (column 1 of predict_proba), not on thresholded 0/1 labels,
# which would collapse the ROC curve to a single operating point.
testing_data_proba = regressor.predict_proba(X_test)[:, 1]

testing_data_accuracy_score = accuracy_score(Y_test, testing_data_prediction)
precision = precision_score(Y_test, testing_data_prediction)
recall = recall_score(Y_test, testing_data_prediction)
f1 = f1_score(Y_test, testing_data_prediction)
roc_auc = roc_auc_score(Y_test, testing_data_proba)

print("Accuracy on testing data:", testing_data_accuracy_score)
print("Precision on testing data:", precision)
print("Recall on testing data:", recall)
print("F1 Score on testing data:", f1)
print("ROC AUC Score on testing data:", roc_auc)

Building a Predictive System

In [None]:
# One raw (unscaled) observation, with values in the same order as the columns of X.
input_data = (71,0,30.828784389850558,0,1,9.361630415509964,3.519683335172577,0)

# Build a single-row frame with the training feature names so the fitted
# scaler can match columns by name.
input_data_frame = pd.DataFrame([input_data], columns = X.columns)

# The model was trained on standardised features, so the new observation must
# go through the same fitted scaler before prediction; feeding raw values
# would place it on a completely different scale from the training data.
input_data_scaled = scaler.transform(input_data_frame)
input_data_prediction = regressor.predict(input_data_scaled)

if input_data_prediction[0] == 0:
    print("This person does not have Cancer")
else:
    print("This person has Cancer")