# logistic Regression

In [119]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

In [3]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer_wisconsin_original = fetch_ucirepo(id=15) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_original.data.features 
y = breast_cancer_wisconsin_original.data.targets 


In [4]:
X.columns

Index(['Clump_thickness', 'Uniformity_of_cell_size',
       'Uniformity_of_cell_shape', 'Marginal_adhesion',
       'Single_epithelial_cell_size', 'Bare_nuclei', 'Bland_chromatin',
       'Normal_nucleoli', 'Mitoses'],
      dtype='object')

In [5]:
y['Class'].unique()

array([2, 4], dtype=int64)

In [6]:
y['Class'] = np.where(y['Class'] == 2,0,1)
y['Class'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Class'] = np.where(y['Class'] == 2,0,1)


Class
0    458
1    241
Name: count, dtype: int64

In [7]:
X.drop('Bare_nuclei',axis=1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop('Bare_nuclei',axis=1, inplace = True)


In [8]:
X.isna().sum()

Clump_thickness                0
Uniformity_of_cell_size        0
Uniformity_of_cell_shape       0
Marginal_adhesion              0
Single_epithelial_cell_size    0
Bland_chromatin                0
Normal_nucleoli                0
Mitoses                        0
dtype: int64

In [9]:
lr = LogisticRegression()
lr.fit(X,y)
lr.coef_

  y = column_or_1d(y, warn=True)


array([[0.56914683, 0.00794454, 0.54752079, 0.30860388, 0.13050298,
        0.5656214 , 0.12514389, 0.54475134]])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=24, test_size= 0.3)

In [11]:
lr= LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
#pd.crosstab(y_test['Class'].values,y_pred)

  y = column_or_1d(y, warn=True)


In [12]:
print(confusion_matrix(y_test['Class'],y_pred))

[[135   3]
 [  4  68]]


In [13]:
accuracy_score(y_test['Class'],y_pred)

0.9666666666666667

In [14]:
print(y['Class'].value_counts(),'\n')
print(y['Class'].value_counts(normalize=True),'\n')
# print(y_test['Class'].value_counts(),'\n')
print(y_test['Class'].value_counts(),'\n')

Class
0    458
1    241
Name: count, dtype: int64 

Class
0    0.655222
1    0.344778
Name: proportion, dtype: float64 

Class
0    138
1     72
Name: count, dtype: int64 



#### Naive Rule /Baseline Model
-  Any ML Model score should better than score of the Naive/ baseline model.
-  Here, baseline(naive) model score = 0.6571428571428571 & ML Model score = 0.9666666666666667.
-  Hence we can say that this ML Model can be a relevant model. However if it is sufficient/enough or not, it would depend on the business or necessity.

In [16]:
y_pred = np.zeros(210)
y_test['Class'].value_counts()

Class
0    138
1     72
Name: count, dtype: int64

In [17]:
accuracy_score(y_test['Class'],y_pred)

0.6571428571428571

## another dataset

In [19]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_diagnostic.data.features 
y = breast_cancer_wisconsin_diagnostic.data.targets 
  
# metadata 
# print(breast_cancer_wisconsin_diagnostic.metadata) 
  
# # variable information 
# print(breast_cancer_wisconsin_diagnostic.variables) 

In [20]:
# y = np.where(y['Diagnosis']=='M',1 ,0)
lbl = LabelEncoder()
y.Diagnosis = lbl.fit_transform(y.Diagnosis)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.Diagnosis = lbl.fit_transform(y.Diagnosis)


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=24, test_size= 0.3)

In [22]:
X.shape

(569, 30)

In [23]:
lr= LogisticRegression(solver='saga', max_iter = 600)
lr.fit(X_train, y_train.Diagnosis)




In [24]:
y_pred = lr.predict(X_test)

In [25]:
print(confusion_matrix(y_test['Diagnosis'],y_pred))

[[104   2]
 [  9  56]]


In [26]:
accuracy_score(y_test['Diagnosis'],y_pred)

0.935672514619883

In [27]:
101/171 

0.5906432748538012

## iris DataSEt

In [29]:
# from ucimlrepo import fetch_ucirepo 
  
# # fetch dataset 
# iris = fetch_ucirepo(id=53) 
# X = iris.data.features 
# y = iris.data.targets 

df = pd.read_csv('Datasets/iris.csv')
y = df["Species"]
X = df.drop("Species", axis = 1)

In [30]:
X.head()


Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [31]:
y1 = lbl.fit_transform(y)

In [32]:
# y['class'] = y1

In [33]:
X_train, X_test,  y_train, y_test = train_test_split(X,y1,random_state = 24, test_size = .3 )

In [34]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [35]:
y_pred = lr.predict(X_test)

In [36]:
accuracy_score(y_test, y_pred)

0.9777777777777777

In [37]:
confusion_matrix(y_test, y_pred)

array([[15,  0,  0],
       [ 0, 11,  1],
       [ 0,  0, 18]], dtype=int64)

In [45]:
18+11+15+1

45

In [47]:
18/45

0.4

##### new dataset

In [80]:
df = pd.read_csv('Datasets/Default.csv')
df['default'].value_counts()

default
No     9667
Yes     333
Name: count, dtype: int64

In [63]:
y = df["default"]
X = df.drop("default", axis = 1)

In [65]:
y = lbl.fit_transform(y)

In [67]:
y

array([0, 0, 0, ..., 0, 0, 0])

In [71]:
X["student"] = lbl.fit_transform(X["student"])

In [73]:
X_train, X_test,  y_train, y_test = train_test_split(X,y,random_state = 24, test_size = .3 )
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)

0.97

In [104]:
x = confusion_matrix(y_test, y_pred)
x

array([[2885,   16],
       [  74,   25]], dtype=int64)

In [106]:
# 2885/(2885 + 16 + 74 + 25)
x.max()/x.sum()
# y_pred_nv = np.zeros(y_test.shape[0])

0.9616666666666667

### KFold CV

In [125]:
pipe = Pipeline([('LR', lr)])

In [127]:
results = cross_val_score(pipe, X, y)
results.mean()

0.9731

### wine dataset

In [132]:
df = pd.read_csv("Datasets/wine.csv")

In [138]:
df.head()

Unnamed: 0,Class,Alcohol,Malic,Ash,Alcalinity,Magnesium,Phenols,Flavanoids,Nonflavanoid,Proanthocyanins,Intensity,Hue,OD280,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [140]:
y = df["Class"]
X = df.drop("Class", axis = 1)

In [154]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state= 24, test_size = .3)

In [156]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9074074074074074

In [158]:
results = cross_val_score(pipe, X, y)
results.mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.9555555555555555