# Homework # 1 - Logistic Regression
Data file: breast_cancer_diagnosis.csv

### Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

### Load data

In [2]:
# Read data from file (breast_cancer_diagnosis.csv) into dataframe
df = pd.read_csv('breast_cancer_diagnosis.csv')

### Examine data

In [3]:
# Review dataframe shape
df.shape

(569, 13)

In [4]:
# Display first 10 rows
df.head(10)

Unnamed: 0,id,name,radius,texture,perimeter,area,smoothness,compactness,concavity,symmetry,fractal_dimension,age,diagnosis
0,ID842302,Glynnis Munson,,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.2419,0.07871,35,1
1,ID842517,Lana Behrer,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.1812,0.05667,27,1
2,ID84300903,Devondra Vanvalkenburgh,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.2069,0.05999,31,1
3,ID84348301,Glory Maravalle,,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.2597,0.09744,49,1
4,ID84358402,Mellie Mccurdy,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1809,0.05883,20,1
5,ID843786,Merle Yelon,,15.7,82.57,477.1,0.1278,0.17,0.1578,0.2087,0.07613,39,1
6,ID844359,Corrianne Banzett,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.1794,0.05742,38,1
7,ID84458202,Noni Marcellino,,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.2196,0.07451,28,1
8,ID844981,Kacy Meltzer,,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.235,0.07389,50,1
9,ID84501001,Elka Ortolani,,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.203,0.08243,60,1


Note: NaN in radius column

### Prepare data

### Check for missing values

In [5]:
df.isnull().sum()

id                    0
name                  0
radius               71
texture               0
perimeter             0
area                  0
smoothness            0
compactness           0
concavity             0
symmetry              0
fractal_dimension     0
age                   0
diagnosis             0
dtype: int64

### Handle missing values
Remember to use "inplace=True"

In [6]:
# Figure out strategy to replace values where radius is NaN
average_radius = df['radius'].mean()
average_radius

14.32663453815262

In [7]:
# Replace values where radius is NaN based on above strategy
df['radius'].fillna(average_radius, inplace = True)
df.head()

Unnamed: 0,id,name,radius,texture,perimeter,area,smoothness,compactness,concavity,symmetry,fractal_dimension,age,diagnosis
0,ID842302,Glynnis Munson,14.326635,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.2419,0.07871,35,1
1,ID842517,Lana Behrer,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.1812,0.05667,27,1
2,ID84300903,Devondra Vanvalkenburgh,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.2069,0.05999,31,1
3,ID84348301,Glory Maravalle,14.326635,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.2597,0.09744,49,1
4,ID84358402,Mellie Mccurdy,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1809,0.05883,20,1


### Check for missing values again

In [8]:
df.isnull().sum()

id                   0
name                 0
radius               0
texture              0
perimeter            0
area                 0
smoothness           0
compactness          0
concavity            0
symmetry             0
fractal_dimension    0
age                  0
diagnosis            0
dtype: int64

### Drop non-numeric variables
Remember to use "inplace=True"

In [9]:
df.dtypes

id                    object
name                  object
radius               float64
texture              float64
perimeter            float64
area                 float64
smoothness           float64
compactness          float64
concavity            float64
symmetry             float64
fractal_dimension    float64
age                    int64
diagnosis              int64
dtype: object

In [10]:
df.drop(columns = ['id', 'name'], inplace = True)

### Review updated dataframe

In [11]:
# Display first 10 rows
df.head(10)

Unnamed: 0,radius,texture,perimeter,area,smoothness,compactness,concavity,symmetry,fractal_dimension,age,diagnosis
0,14.326635,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.2419,0.07871,35,1
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.1812,0.05667,27,1
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.2069,0.05999,31,1
3,14.326635,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.2597,0.09744,49,1
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1809,0.05883,20,1
5,14.326635,15.7,82.57,477.1,0.1278,0.17,0.1578,0.2087,0.07613,39,1
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.1794,0.05742,38,1
7,14.326635,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.2196,0.07451,28,1
8,14.326635,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.235,0.07389,50,1
9,14.326635,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.203,0.08243,60,1


### Separate independent and dependent variables
* Independent variables: All remaining variables except Diagnosis
* Dependent variable: Diagnosis

In [12]:
# Prepare dataset for model training
X = df.drop(columns = 'diagnosis')
y = df['diagnosis']

### Split data into training and test sets

In [13]:
# Split data into training data and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

### Train model

In [14]:
# Train model
model = LogisticRegression(max_iter = 150)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=150)

### If above results in error, review error message, look up documentation for LogisticRegression, and change model hyperparameter appropriately

In [15]:
# Train model again
model = LogisticRegression(max_iter = 150, solver = 'newton-cg')
model.fit(X_train, y_train)

LogisticRegression(max_iter=150, solver='newton-cg')

### Test model

In [16]:
# Generate predictions against the test set
predictions = model.predict(X_test)
predictions

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0])

### Model evaluation

In [17]:
# Print classification report
pd.DataFrame(classification_report(y_test, predictions, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.927273,0.918033,0.923977,0.922653,0.923815
recall,0.953271,0.875,0.923977,0.914136,0.923977
f1-score,0.940092,0.896,0.923977,0.918046,0.92359
support,107.0,64.0,0.923977,171.0,171.0


In [18]:
# Print model accuracy
accuracy_score(y_test, predictions)

0.9239766081871345