In [None]:
# ws01
from sklearn.datasets import load_breast_cancer

from sklearn import metrics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the breast-cancer dataset that ships with scikit-learn.
cancer = load_breast_cancer()

In [None]:
# List the top-level entries available on the loaded dataset object.
cancer.keys()

In [None]:
# Show the names of the feature columns.
cancer.feature_names

In [None]:
# Assemble one table: the feature matrix with the target appended as the
# last column, named 'class'.
df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names).assign(
    **{'class': cancer.target}
)
df.head()

In [None]:
# (rows, columns) of the assembled table.
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Side-by-side box plots of the two features, each titled with its column name.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(8, 3))

for axis, feature in ((ax1, 'mean radius'), (ax2, 'mean texture')):
    axis.boxplot(df[feature])
    axis.set_title(feature)
plt.show()

In [None]:
# Keep only rows with mean radius below 25 and mean texture below 38
# (cut-offs presumably read off the box plots -- confirm).
# Both conditions go into a single mask, and .copy() makes the result an
# independent frame so that later column assignments such as
# df['mean ra'] = ... do not trigger pandas' SettingWithCopyWarning.
within_limits = (df['mean radius'] < 25) & (df['mean texture'] < 38)
df = df[within_limits].copy()
df.shape

In [None]:
sns.set_style('whitegrid')

# Scatter of the two features, coloured and marked by class.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form raises a TypeError there.
sns.scatterplot(x='mean radius', y='mean texture', data=df, hue='class', style='class',
                alpha=0.9, edgecolor='w', s=80)

plt.grid(False)
# Anchor the legend's upper-left corner at axes coordinates (1, 1),
# i.e. just outside the top-right of the plot area.
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))

plt.show()

### ws02

In [None]:

# Derived feature 'mean ra': the radius of class-0 rows is shifted by +7.5,
# class-1 rows keep their original radius.
df['mean ra'] = df['mean radius'].mask(df['class'] == 0, df['mean radius'] + 7.5)
df['mean ra'].head()

In [None]:
# Scatter of the shifted radius against texture, coloured and marked by class.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form raises a TypeError there.
sns.scatterplot(x='mean ra', y='mean texture', data=df, hue='class', style='class',
                alpha=0.9, edgecolor='w', s=80)

plt.grid(False)
# Legend placed just outside the top-right corner of the axes.
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

In [None]:

# Overlay the per-class distributions of 'mean ra'.
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True and stat='density' is its documented replacement
# (density-normalised histogram plus a KDE curve).
sns.histplot(df.loc[df['class'] == 0, 'mean ra'], kde=True, stat='density',
             label='0', color='b')
sns.histplot(df.loc[df['class'] == 1, 'mean ra'], kde=True, stat='density',
             label='1', color='r')
plt.legend()
plt.grid(False)
plt.show()

In [None]:
# Two predictor columns and the class label as the target.
columns = ['mean ra', 'mean texture']
X, y = df[columns], df['class']
(X.shape, y.shape, y.unique())


In [None]:
# First three rows of the predictors.
X.head(3)

In [None]:
# First three target labels.
y.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Train a logistic-regression classifier on the training split.
model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train)

# Score on the held-out split.
test_score = model.score(X_test, y_test)
print(f"Score: {test_score:.3f}")

# Per-class metrics and the confusion matrix for the test predictions.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['No', 'Yes']))

cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Draw the confusion matrix of the test-set predictions with scikit-plot.
import scikitplot as skplot
skplot.metrics.plot_confusion_matrix(y_test, y_pred)
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# 10-fold cross-validation of a fresh logistic-regression model on all of X, y.
model = LogisticRegression(solver='lbfgs')
cvs = cross_val_score(model, X, y, cv=10)

print(f'cross val scores {cvs.round(2)}')
print(f'Average (%) = {cvs.mean() * 100:.2f}')

In [None]:
# Fit on the training split (the model above was re-created unfitted).
model.fit(X_train, y_train)

# Class probabilities for the test rows; column 1 (probability of class 1)
# is the score handed to roc_curve.
y_pred_prob = model.predict_proba(X_test)
fpr, tpr, thresholds = metrics.roc_curve(y_true=y_test, y_score=y_pred_prob[:, 1])

# Preview the first ten probability rows.
y_pred_prob[:10].round(3)

In [None]:
# Dimensions of the probability array returned by predict_proba.
y_pred_prob.shape

In [None]:
# Result table: true label, predicted label and both class probabilities,
# one row per test sample.
df_result = y_test.to_frame().assign(
    y_pred=y_pred,
    y_pred_prob0=y_pred_prob[:, 0],
    y_pred_prob1=y_pred_prob[:, 1],
)

df_result.head(5).round(3)


In [None]:
# Count the non-null entries of every result column, grouped by true class.
df_result.groupby('class').count()

In [None]:
from sklearn.metrics import roc_curve, auc
%matplotlib inline
ll=[0,1]
ur=[0,1]

y_pred_prob = model.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob[:,1])

plt.plot(fpr, tpr) #, label="AUC={:.2f}".format(a))
plt.plot(ll,ur, '--r' )   # color red
plt.title('ROC Curve')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.show()

In [None]:
# Area under the ROC curve.
# The result gets its own name so the imported `auc` function is not
# overwritten; rebinding `auc` to a float made this cell fail on re-run
# ("float is not callable").
roc_auc = metrics.auc(fpr, tpr)
roc_auc

## ws03

In [None]:
# ws03: set 'mean ra' to the unmodified radius (no shift for either class).
df['mean ra'] = df['mean radius']
df['mean ra'].head(3)

### ws04

In [None]:
# ws04: class-0 rows get their radius reduced by 5; class-1 rows keep theirs.
df['mean ra'] = df['mean radius'].mask(df['class'] == 0, df['mean radius'] - 5)
df['mean ra'].head(3)

### ws05


In [None]:
# ws05: build a data set whose two classes are copies of each other.
# 1. Keep only the rows that are not class 1. Boolean filtering plus .copy()
#    replaces the in-place drop and yields an independent frame.
df = df[df['class'] != 1].copy()
print(df.shape)

# 2. Duplicate the remaining rows and relabel the duplicate as class 1.
df2 = df.copy()
df2['class'] = 1

# 3. Stack both halves with a fresh 0..n-1 index. DataFrame.append was
#    removed in pandas 2.0; pd.concat is its replacement.
df = pd.concat([df, df2], ignore_index=True)

# 4. Shift the class-0 radius by +1 so the classes differ only by this offset.
df['mean ra'] = np.where(df['class'] == 0, df['mean radius'] + 1, df['mean radius'])
df.shape

In [None]:
# Box plot of the texture feature.
plt.boxplot(df['mean texture'])
plt.show()

In [None]:
# Scatter of 'mean ra' against texture, coloured and marked by class.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form raises a TypeError there.
sns.scatterplot(x='mean ra', y='mean texture', data=df, hue='class', style='class',
                alpha=0.9, edgecolor='w', s=80)
plt.grid(False)
plt.legend(bbox_to_anchor=(1, 1))
# Save before show(): the figure is written to 'case1' (matplotlib adds the
# default image extension) at 120 dpi.
plt.savefig('case1', dpi=120)
plt.show()

# Threshold

In [None]:
# ws06
# Set 'mean ra' to the unmodified radius (no shift for either class).
df['mean ra'] = df['mean radius']
df['mean ra'].head(3)

In [None]:

# Scatter of 'mean ra' against texture, coloured and marked by class.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form raises a TypeError there.
sns.scatterplot(x='mean ra', y='mean texture', data=df, hue='class', style='class',
                alpha=0.9, edgecolor='w', s=80)
# Legend placed just outside the top-right corner of the axes.
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Two predictor columns and the class label as the target.
columns = ['mean ra', 'mean texture']
X, y = df[columns], df['class']
(X.shape, y.shape, y.unique())

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed, then fit logistic regression on the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)

model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train)

# Custom decision threshold applied to the class-1 probability
# (0.98 is the other value tried in this worksheet).
threshold = .34
y_pred_prob = model.predict_proba(X_test)[:, 1]
# Cast the boolean comparison to 0/1 so the predictions use the same integer
# labels as y_test.
y_pred = (y_pred_prob > threshold).astype(int)

print(classification_report(y_test, y_pred, target_names=['No','Yes']))

cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Plot the confusion matrix of the thresholded predictions and save it to disk.
import scikitplot as skplot
# Apply seaborn's theme with a small default figure size for a compact figure.
sns.set(rc={'figure.figsize':(2.2,2)})
skplot.metrics.plot_confusion_matrix(y_test, y_pred)
plt.tight_layout()
# NOTE(review): the file name suggests the 0.98-threshold run -- confirm it
# matches the threshold that produced y_pred before saving.
plt.savefig('th2_98_confmatrix',dpi=120)
plt.show()

In [None]:
y_pred_prob = model.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob[:,1])
auc = metrics.auc(fpr, tpr)
print('AUC=',auc)
a=auc

from sklearn.metrics import roc_curve, auc
%matplotlib inline
ll=[0,1]
ur=[0,1]

# import matplotlib.pyplot as plt
plt.figure(figsize=(4,4))
plt.rcParams["axes.edgecolor"] = '140' # 180
plt.rcParams["axes.linewidth"]  = 1.25

y_pred_prob = model.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob[:,1])
plt.plot(fpr, tpr, label="AUC={:.2f}".format(a))
plt.plot(ll,ur, '--r' )   # color red
plt.title('ROC Curve')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')  # plt.grid(False)  # plt.savefig('case4_roc', dpi=120, bbox_inches = "tight")
plt.legend(loc="lower right")
plt.show()