# Logistic Regression

In [41]:
# Import required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_validate, train_test_split

In [2]:
# Load the iris dataset through seaborn and display the resulting frame
data = sns.load_dataset(name="iris")
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [7]:
# Count how many rows fall under each value of the target column
data.loc[:, "species"].value_counts()

versicolor    50
setosa        50
virginica     50
Name: species, dtype: int64

In [3]:
# Separate the features (every column but the last) from the target (last column)
X = data.drop(columns=data.columns[-1])
y = data[data.columns[-1]]

In [39]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.
# Every piece gets a fresh 0..n-1 index as it is unpacked.
X_train, X_test, y_train, y_test = (
    piece.reset_index(drop=True)
    for piece in train_test_split(X, y, test_size=0.25, random_state=100)
)

# Report the shape of each piece
print("size of X_train is: ", X_train.shape)
print("size of X_test is: ", X_test.shape)
print("size of y_train is: ", y_train.shape)
print("size of y_test is: ", y_test.shape)

size of X_train is:  (112, 4)
size of X_test is:  (38, 4)
size of y_train is:  (112,)
size of y_test is:  (38,)


In [6]:
# Fit a logistic regression on the training split (fit returns the estimator itself)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model

LogisticRegression(max_iter=1000)

In [8]:
# Predictions of the fitted model on the rows it was trained on
y_train_pred = model.predict(X=X_train)

In [11]:
# Per-class precision / recall / f1 on the training set
print(classification_report(y_true=y_train, y_pred=y_train_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        36
  versicolor       0.95      0.95      0.95        40
   virginica       0.94      0.94      0.94        36

    accuracy                           0.96       112
   macro avg       0.96      0.96      0.96       112
weighted avg       0.96      0.96      0.96       112



In [14]:
# Confusion matrix of the training-set predictions
confusion_matrix(y_true=y_train, y_pred=y_train_pred)

array([[36,  0,  0],
       [ 0, 38,  2],
       [ 0,  2, 34]])

In [15]:
# Predictions of the fitted model on the held-out test rows
y_test_pred = model.predict(X=X_test)

In [16]:
# Per-class precision / recall / f1 on the testing set
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        14
  versicolor       0.90      0.90      0.90        10
   virginica       0.93      0.93      0.93        14

    accuracy                           0.95        38
   macro avg       0.94      0.94      0.94        38
weighted avg       0.95      0.95      0.95        38



In [17]:
# Confusion matrix of the testing-set predictions
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[14,  0,  0],
       [ 0,  9,  1],
       [ 0,  1, 13]])

In [22]:
# Replace the single train/test split with 4-fold cross validation on the full data,
# scoring accuracy on both the training and the held-out part of every fold
model = LogisticRegression(max_iter=1000)
scores = cross_validate(
    estimator=model,
    X=X,
    y=y,
    cv=4,
    scoring='accuracy',
    return_train_score=True,
)
scores

{'fit_time': array([0.03507376, 0.02719998, 0.01495004, 0.01268101]),
 'score_time': array([0.00256991, 0.00101995, 0.00084591, 0.00072503]),
 'test_score': array([0.97368421, 0.97368421, 0.94594595, 1.        ]),
 'train_score': array([0.96428571, 0.97321429, 0.98230088, 0.96460177])}

In [23]:
# Manual cross validation with KFold: fit a fresh model on each training fold
# and score it on the matching held-out fold.
# accuracy_score comes from sklearn.metrics (imports cell).
acc_score = []

cv = KFold(n_splits = 4, random_state = 100, shuffle = True)

# Fold-specific names are used so the hold-out split made earlier
# (X_train, X_test, y_train, y_test) is not overwritten by the last fold.
for train_index, test_index in cv.split(X):

    # cv.split yields positional row numbers, so select with .iloc on both X and y
    X_fold_train, X_fold_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    model = LogisticRegression(max_iter = 1000)
    model.fit(X_fold_train, y_fold_train)
    pred_values = model.predict(X_fold_test)

    acc = accuracy_score(y_fold_test, pred_values)
    acc_score.append(acc)

# Average over the folds actually scored instead of a hard-coded fold count
avg_acc_score = sum(acc_score) / len(acc_score)

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))

accuracy of each fold - [0.9473684210526315, 0.9736842105263158, 0.918918918918919, 1.0]
Avg accuracy : 0.9599928876244666


# Support Vector Machine

In [32]:
# Load the breast cancer dataset and print its feature and label names
cancer = load_breast_cancer()
print("Features: ", cancer["feature_names"])
print("Labels: ", cancer["target_names"])

Features:  ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Labels:  ['malignant' 'benign']


In [35]:
# Show the feature matrix as a frame labelled with the dataset's feature names
pd.DataFrame(data=cancer.data, columns=cancer.feature_names)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [36]:
# Show the target vector as a single-column frame
pd.DataFrame({"target": cancer.target})

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
564,0
565,0
566,0
567,0


In [37]:
# Count how many samples carry each target value
pd.DataFrame({"target": cancer.target}).value_counts()

target
1         357
0         212
dtype: int64

In [40]:
# Splitting into 70% train and 30% test; the fixed random_state keeps the split repeatable.
# The feature frame is labelled with cancer.feature_names so the columns stay
# identifiable instead of being numbered 0..29.
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(cancer.data, columns = cancer.feature_names),
    pd.Series(cancer.target),
    test_size = 0.3,
    random_state = 100,
)

# Give every piece a fresh 0..n-1 index after the shuffle
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

print("size of X_train is: ", X_train.shape)
print("size of X_test is: ", X_test.shape)
print("size of y_train is: ", y_train.shape)
print("size of y_test is: ", y_test.shape)

size of X_train is:  (398, 30)
size of X_test is:  (171, 30)
size of y_train is:  (398,)
size of y_test is:  (171,)


In [42]:
# Fit a support vector classifier with a linear kernel (fit returns the estimator itself)
clf = svm.SVC(kernel='linear').fit(X_train, y_train)
clf

SVC(kernel='linear')

In [44]:
# Predictions of the fitted SVM on the rows it was trained on
y_train_pred = clf.predict(X=X_train)

In [45]:
# Per-class precision / recall / f1 of the SVM on the training set
print(classification_report(y_true=y_train, y_pred=y_train_pred))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       143
           1       0.97      0.98      0.97       255

    accuracy                           0.97       398
   macro avg       0.97      0.96      0.96       398
weighted avg       0.97      0.97      0.97       398



In [46]:
# Confusion matrix of the SVM's training-set predictions
confusion_matrix(y_true=y_train, y_pred=y_train_pred)

array([[135,   8],
       [  5, 250]])

In [47]:
# Predictions of the fitted SVM on the held-out test rows
y_test_pred = clf.predict(X=X_test)

In [48]:
# Per-class precision / recall / f1 of the SVM on the testing set
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94        69
           1       0.95      0.97      0.96       102

    accuracy                           0.95       171
   macro avg       0.95      0.95      0.95       171
weighted avg       0.95      0.95      0.95       171



In [49]:
# Confusion matrix of the SVM's testing-set predictions
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[64,  5],
       [ 3, 99]])