In [14]:
import pandas as pd
import numpy as np

# Load the balance-scale data set. Column names are supplied here, so every
# line of the file is read as a data row rather than as a header.
df = pd.read_csv(
    'balance-scale.data',
    header=None,
    names=['balance', 'var1', 'var2', 'var3', 'var4'],
)
df

Unnamed: 0,balance,var1,var2,var3,var4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5
5,R,1,1,2,1
6,R,1,1,2,2
7,R,1,1,2,3
8,R,1,1,2,4
9,R,1,1,2,5


In [15]:
# Count how many rows fall into each class of the target column.
df.balance.value_counts()

R    288
L    288
B     49
Name: balance, dtype: int64

In [21]:
# Collapse the target into a binary label: 1 for the 'B' class, 0 for anything else.
# The membership test also accepts 1, so re-running this cell on the already
# encoded column leaves it unchanged instead of resetting every label to 0.
df['balance'] = df['balance'].isin(['B', 1]).astype('int64')
df['balance'].value_counts()

0    576
1     49
Name: balance, dtype: int64

In [31]:
# The danger of imbalanced classes
#
# Baseline: fit a logistic regression on the imbalanced data and score it on
# the same rows it was trained on.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Target vector and feature matrix (every column except the target)
y = df.balance
X = df.drop('balance', axis=1)

classification_0 = LogisticRegression().fit(X, y)

pred_0 = classification_0.predict(X)

# accuracy_score takes the true labels first, then the predictions
accuracy = accuracy_score(y, pred_0)
print(accuracy)

# Which class labels does the model actually predict?
print(np.unique(pred_0))
# If only one label shows up here, the model never predicts the other class,
# so a high accuracy on its own does not mean the model is useful.


0.9216
[0]




In [42]:
# Up-sample the minority class
from sklearn.utils import resample

# Separate the majority (label 0) and minority (label 1) rows
df_majority = df[df.balance == 0]
df_minority = df[df.balance == 1]

# Draw minority rows with replacement until they match the majority count.
# The target size is taken from the data rather than hard-coded, and the seed
# (an arbitrary fixed value) makes the sample reproducible across runs.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=123,
)

# Recombine into a single frame and check the new class counts
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled.balance.value_counts()

1    576
0    576
Name: balance, dtype: int64

In [44]:
# Classifier trained on the up-sampled data

# Target vector and feature matrix taken from the up-sampled frame
y = df_upsampled.balance
X = df_upsampled.drop('balance', axis=1)

classification_2 = LogisticRegression().fit(X, y)
pred_1 = classification_2.predict(X)

# accuracy_score takes the true labels first, then the predictions
print(accuracy_score(y, pred_1))
# List the distinct labels the model predicts
print(np.unique(pred_1))


0.5052083333333334
[0 1]




In [50]:
# Down-sample the majority class
df_majority = df[df.balance == 0]
df_minority = df[df.balance == 1]

# Draw majority rows without replacement until they match the minority count.
# The target size is taken from the data rather than hard-coded, and the seed
# (an arbitrary fixed value) makes the sample reproducible across runs.
df_majority_down = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=123,
)

# Recombine into a single frame and display it
df_downsampled = pd.concat([df_majority_down, df_minority])
df_downsampled

Unnamed: 0,balance,var1,var2,var3,var4
501,0,5,1,1,2
164,0,2,2,3,5
555,0,5,3,2,1
483,0,4,5,2,4
179,0,2,3,1,5
32,0,1,2,2,3
601,0,5,5,1,2
287,0,3,2,3,3
80,0,1,4,2,1
602,0,5,5,1,3


In [57]:
# Classifier trained on the down-sampled data

# Target vector and feature matrix taken from the down-sampled frame
y = df_downsampled.balance
X = df_downsampled.drop('balance', axis=1)

classifier_2 = LogisticRegression().fit(X, y)

predict_2 = classifier_2.predict(X)
# accuracy_score takes the true labels first, then the predictions
print(accuracy_score(y, predict_2))
# List the distinct labels the model predicts
print(np.unique(predict_2))


0.5408163265306123
[0 1]




In [63]:
# Penalize algorithms (cost-sensitive training)

# Penalized SVM

from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score


# Train on the original (imbalanced) frame; class_weight='balanced' reweights
# the classes instead of resampling the rows.
y = df.balance
X = df.drop('balance', axis=1)
# probability=True fits the probability estimates with an internal, shuffled
# cross-validation, so a fixed seed (arbitrary value) is needed for
# predict_proba, and the AUROC below, to be reproducible across runs.
svc_01 = SVC(kernel='linear', class_weight='balanced', probability=True, random_state=123)
classifier_3 = svc_01.fit(X, y)
predict_3 = classifier_3.predict(X)

# accuracy_score takes the true labels first, then the predictions
print(accuracy_score(y, predict_3))
print(np.unique(predict_3))

# AUROC
# predict_proba columns follow classifier_3.classes_ (sorted labels), so
# column 1 holds the estimated probability of label 1.
# NOTE(review): these probability estimates can disagree with predict();
# consider decision_function scores for ranking metrics.
prob_y_3 = classifier_3.predict_proba(X)[:, 1]
print(roc_auc_score(y, prob_y_3))

0.688
[0 1]
0.46947633219954643
