# **Logistic Regression Classifier on Wisconsin Breast Cancer Dataset**

In [0]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [0]:
# Load the dataset from the working directory; row 0 of the CSV supplies the column names.
data = pd.read_csv(
    "Breast_Cancer.csv",
    header=0,
)

In [0]:
# Preview the first ten rows to sanity-check the load.
data.head(n=10)

# **We wish to predict the diagnosis, so drop the "Unnamed: 32" and "id" columns**

In [0]:
# Drop the "Unnamed: 32" and "id" columns; neither is used as a feature below.
# Rebinding `data` (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError on the already-removed columns.
data = data.drop(columns=["Unnamed: 32", "id"], errors="ignore")

# **Check if we have missing data**

In [0]:
# Count the NaN entries in each column and hold the counts in a one-column frame.
Missing_data = data.isna().sum().to_frame(name='Missing Values')

# Display the columns with the most missing values first.
Missing_data.sort_values(by='Missing Values', ascending=False)

# **Convert Malignant Class 'M' to 1 and Benign Class 'B' to 0**
# **Visualize the class distributions**

In [0]:
# Encode the target as integers: malignant ('M') -> 1, benign ('B') -> 0.
# NOTE(review): this overwrites the column, so running the cell a second time
# would look up 1/0 in the mapping and turn every label into NaN.
data['diagnosis'] = data['diagnosis'].map(
    {
        'M': 1,
        'B': 0,
    }
)

# Keep the encoded labels under their own name; they are the prediction target.
Output = data['diagnosis']


In [0]:
# Bar chart of how many samples fall in each diagnosis class (0 vs 1).
plt.figure(figsize=(12, 4), dpi=100)
# The Series must be passed as `x=`: seaborn 0.11 warns on a positional vector
# and from 0.12 the only positional parameter is `data`, so a positional
# Series is no longer treated as the variable to count.
# The trailing semicolon suppresses the Axes repr in the cell output.
sns.countplot(x=data['diagnosis'], label="Count");


In [0]:
# Remove the target from the feature table; the labels are already held in `Output`.
# Rebinding with errors="ignore" (rather than inplace=True) means re-running
# this cell does not raise KeyError once the column is gone.
data = data.drop(columns=["diagnosis"], errors="ignore")


In [0]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    Output,
    test_size=0.25,
    random_state=2,
)

In [0]:
# Report (rows, columns) for the training split, then the test split.
for split in (X_train, X_test):
    print(split.shape)

# **Always scale the data for models that calculate weights**

In [0]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler's statistics are learned from the
# training split only and then reused, unchanged, on the test split, so no
# information from the test rows influences the scaling.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics


## **Train Logistic Regression Model**

In [0]:
# Baseline classifier: logistic regression with the liblinear solver, a
# generous iteration cap, and a fixed seed for reproducibility.
model_logit = LogisticRegression(
    solver='liblinear',
    max_iter=5000,
    random_state=2,
)


In [0]:
# Fit the baseline model on the scaled training features and their labels.
model_logit.fit(X=X_train, y=Y_train)


In [0]:
# Predict on the same rows the model was trained on (used for training metrics below).
prediction_logit_train = model_logit.predict(X=X_train)

# **Get training accuracy**

In [0]:
# Fraction of training rows classified correctly.
# Arguments follow the documented (y_true, y_pred) order, matching the
# confusion_matrix call on the same data; accuracy is symmetric, so the
# value is the same as with the arguments reversed.
metrics.accuracy_score(Y_train, prediction_logit_train)


In [0]:
# Confusion matrix on the training split, built from the true labels and the
# model's predictions for the same rows.
cm_train = confusion_matrix(
    y_true=Y_train,
    y_pred=prediction_logit_train,
)
cm_train

# **Cross Validation**

In [0]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the baseline model on the training split.
# NOTE(review): X_train was already standardized using statistics from the
# whole training split, so each validation fold has influenced the scaler;
# wrapping scaler + model in a Pipeline would avoid that.
accu_logit = cross_val_score(
    model_logit,
    X_train,
    Y_train,
    cv=10,
)

# Summarize the per-fold scores.
print("The mean accuracy is:", accu_logit.mean())
print("The standard deviation of the accuracy is:", accu_logit.std())

# **Get Testing Accuracy**

# **Get the testing confusion matrix**

In [0]:
# Predict on the held-out test split with the baseline model.
prediction_logit = model_logit.predict(X_test)

# Confusion matrix from the true test labels and those predictions.
cm_test = confusion_matrix(Y_test, prediction_logit)
# End the cell with the matrix so it is actually displayed, as the training
# confusion-matrix cell does for cm_train.
cm_test

In [0]:
# Fraction of held-out test rows classified correctly by the baseline model.
# Arguments follow the documented (y_true, y_pred) order; accuracy is
# symmetric, so the value is the same as with the arguments reversed.
metrics.accuracy_score(Y_test, prediction_logit)


# **Let's tune the C parameter in logistic regression. C is the inverse of the regularization strength: lower values mean stronger regularization and higher values mean weaker regularization. C=1 by default in logistic regression.**

In [0]:
from sklearn.model_selection import GridSearchCV


In [0]:
# Search space for the grid search: every combination of regularization
# constant C and solver listed here is evaluated.
parameters_logit = [
    {
        'C': [0.01, 0.1, 0.2, 0.5, 1, 10],
        'solver': ['liblinear', 'saga', 'lbfgs'],
    },
]


In [0]:
# Exhaustive search over `parameters_logit`, scoring each candidate by
# accuracy under 10-fold cross-validation. The baseline estimator is used as
# the template whose C and solver are overridden per candidate.
grid_search_logit = GridSearchCV(
    estimator=model_logit,
    param_grid=parameters_logit,
    scoring='accuracy',
    cv=10,
)

In [0]:
# Run the grid search on the scaled training split.
grid_search_logit.fit(X=X_train, y=Y_train)

In [32]:
# Report the best cross-validated score found by the grid search.
print(
    "The best validation score achived was: ",
    grid_search_logit.best_score_,
)

The best validation score achived was:  0.9835680751173709


In [0]:
# Report the parameter combination that produced the best score.
print(
    "The above validation score was achieved under the parameters:",
    grid_search_logit.best_params_,
)

In [0]:
# Predict on the held-out test split using the fitted grid search object.
prediction_logit_tuned = grid_search_logit.predict(X_test)

# Arguments follow the documented (y_true, y_pred) order; accuracy is
# symmetric, so the printed value is the same as with the arguments reversed.
print("Testing accuracy after tuning", metrics.accuracy_score(Y_test, prediction_logit_tuned))