In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn import preprocessing
import pandas as pd

## SA Heart

In [2]:
df = pd.read_csv("SAheart.data",sep=',')

In [3]:
df.columns

Index(['sbp', 'tobacco', 'ldl', 'adiposity', 'famhist', 'typea', 'obesity',
       'alcohol', 'age', 'chd'],
      dtype='object')

In [4]:
le = preprocessing.LabelEncoder()
df.loc[:,'famhist'] = le.fit_transform(df.famhist)
list(le.inverse_transform([0,1]))

['Absent', 'Present']

In [5]:
X = df.drop(['chd'],axis=1)
y = df['chd']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True)

In [6]:
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)

pred_train = sgd_clf.predict(X_train)
pred_test = sgd_clf.predict(X_test)

In [7]:
cm = confusion_matrix(y_test, pred_test)
print(cm)

[[30 18]
 [ 5 17]]


In [8]:
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       0.82      0.57      0.67       254
           1       0.50      0.78      0.60       138

    accuracy                           0.64       392
   macro avg       0.66      0.67      0.64       392
weighted avg       0.71      0.64      0.65       392



In [9]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.86      0.62      0.72        48
           1       0.49      0.77      0.60        22

    accuracy                           0.67        70
   macro avg       0.67      0.70      0.66        70
weighted avg       0.74      0.67      0.68        70



In [10]:
print(f1_score(y_train, pred_train))
print(f1_score(y_test, pred_test))

0.6045197740112994
0.5964912280701754


## PIMA Indians Diabetes 

In [11]:
df = pd.read_csv("pima-indians-diabetes.data.csv",sep=',')

In [24]:
df.Class.value_counts(0)

0    500
1    268
Name: Class, dtype: int64

In [14]:
X = df.drop(["Class"], axis=1)
y = df.Class

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True)

In [16]:
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)

pred_train = sgd_clf.predict(X_train)
pred_test = sgd_clf.predict(X_test)

In [17]:
cm_train = confusion_matrix(y_train, pred_train)
cm_train

array([[ 65, 357],
       [ 14, 216]])

In [18]:
cm_test = confusion_matrix(y_test, pred_test)
cm_test

array([[13, 65],
       [ 1, 37]])

In [19]:
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       0.82      0.15      0.26       422
           1       0.38      0.94      0.54       230

    accuracy                           0.43       652
   macro avg       0.60      0.55      0.40       652
weighted avg       0.67      0.43      0.36       652



In [20]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.93      0.17      0.28        78
           1       0.36      0.97      0.53        38

    accuracy                           0.43       116
   macro avg       0.65      0.57      0.41       116
weighted avg       0.74      0.43      0.36       116



In [21]:
print(f1_score(y_train, pred_train))

0.5379825653798256


In [22]:
print(f1_score(y_test, pred_test))

0.5285714285714286
