In [1]:
import numpy as np
import pandas as pd

#### Dataset Information:

http://archive.ics.uci.edu/ml/datasets/balance+scale

The dataset contains information about whether a scale is balanced or not, based on weights and distances of the two arms

In [2]:
data = pd.read_csv('balance-scale.data',names=['Class','LW','LD','RW','RD'])

In [3]:
data.head(4)

Unnamed: 0,Class,LW,LD,RW,RD
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625 entries, 0 to 624
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Class   625 non-null    object
 1   LW      625 non-null    int64 
 2   LD      625 non-null    int64 
 3   RW      625 non-null    int64 
 4   RD      625 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 24.5+ KB


In [5]:
data['Class'].value_counts()

L    288
R    288
B     49
Name: Class, dtype: int64

We can convert Target variable into binary classes as follows :
    
    if Class = 'B' then 1 else 0

In [6]:
clist=[]
for c in data.Class:
    if c=='B':
        clist.append(1)
    else:
        clist.append(0)
        
data['Class'] = clist

data['Class'].value_counts()

0    576
1     49
Name: Class, dtype: int64

As we can see that there is huge imbalance in the target variable Class

In [7]:
model_details = pd.DataFrame(columns = ['Model','Strategy Used','Train accuracy','Test accuracy'])
model_details

Unnamed: 0,Model,Strategy Used,Train accuracy,Test accuracy


### Model buliding using imbalanced dataset:

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [9]:
x=data[data.columns.difference(['Class'])]
y=data['Class']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=100)

print("length of training set: ",len(x_train))
print("length of testing set: ",len(x_test))

clf_logReg1 = LogisticRegression(solver='liblinear').fit(x_train,y_train)

y_train_pred = clf_logReg1.predict(x_train)
y_test_pred =  clf_logReg1.predict(x_test)

train_accuracy = accuracy_score(y_train,y_train_pred)
test_accuracy = accuracy_score(y_test,y_test_pred)

print(np.unique(y_train_pred))
print(np.unique(y_test_pred))

length of training set:  468
length of testing set:  157
[0]
[0]


Here we can see that model is only predicting the majority class that is why it is giving more accuracy in both train and test set.

In [10]:
model_details = model_details.append({'Model':'clf_logReg1','Strategy Used':'NA','Train accuracy':train_accuracy,'Test accuracy':test_accuracy},
                     ignore_index=True)

model_details

Unnamed: 0,Model,Strategy Used,Train accuracy,Test accuracy
0,clf_logReg1,,0.916667,0.936306


Now we will implement different method of handling imbalanced datasets

### 1. Up-Sampling Minority Class

By Up-sampling minority class means that we will select those records which have target labeled as minority class by random resampling with replacement

In [11]:
from sklearn.utils import resample

In [20]:
data.Class.value_counts()

0    576
1     49
Name: Class, dtype: int64

In [12]:
##step-1 Separate out data points having minority and majority classes

data_majority = data[data['Class']==0] ## majority class
data_minority = data[data['Class']==1] ## minority class

## step-2 upsampling minority class

data_minority_resampled = resample(data_minority,replace = True, n_samples = 576, random_state=100)

## step-3 combining dataframe 

data_upsampled = pd.concat([data_majority,data_minority_resampled],axis=0)

data_upsampled['Class'].value_counts()

1    576
0    576
Name: Class, dtype: int64

Now we can see that both classes are balanced in dataset so we will  build model on this dataset

In [13]:
x=data_upsampled[data_upsampled.columns.difference(['Class'])]
y=data_upsampled['Class']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=100)

print("length of training set: ",len(x_train))
print("length of testing set: ",len(x_test))

clf_logReg2 = LogisticRegression(solver='liblinear').fit(x_train,y_train)

y_train_pred = clf_logReg2.predict(x_train)
y_test_pred =  clf_logReg2.predict(x_test)

train_accuracy = accuracy_score(y_train,y_train_pred)
test_accuracy = accuracy_score(y_test,y_test_pred)

print(np.unique(y_train_pred))
print(np.unique(y_test_pred))

length of training set:  864
length of testing set:  288
[0 1]
[0 1]


Now we can see that model is predicting both classes

In [14]:
model_details = model_details.append({'Model':'clf_logReg2','Strategy Used':'Up sampling minority class','Train accuracy':train_accuracy,'Test accuracy':test_accuracy},
                     ignore_index=True)

model_details

Unnamed: 0,Model,Strategy Used,Train accuracy,Test accuracy
0,clf_logReg1,,0.916667,0.936306
1,clf_logReg2,Up sampling minority class,0.481481,0.458333


### 2. Down-Sampling Majority Class

By down sample we means that randomly remove majority class labeled records to match number of reords with minority class

In [29]:
##step-1 : down sample majority class
data_majority_resampled  = resample(data_majority, replace=False, n_samples = 49, random_state=100)

##step-2 : combine downsampled majority class + minority class to form a new balanced dataset
data_downsampled = pd.concat([data_minority,data_majority_resampled],axis=0)

data_downsampled['Class'].value_counts()

1    49
0    49
Name: Class, dtype: int64

Now we can see that both classes are balanced in dataset so we will  build model on this dataset

In [16]:
x=data_downsampled[data_downsampled.columns.difference(['Class'])]
y=data_downsampled['Class']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=100)

print("length of training set: ",len(x_train))
print("length of testing set: ",len(x_test))

clf_logReg3 = LogisticRegression(solver='liblinear').fit(x_train,y_train)

y_train_pred = clf_logReg3.predict(x_train)
y_test_pred =  clf_logReg3.predict(x_test)

train_accuracy = accuracy_score(y_train,y_train_pred)
test_accuracy = accuracy_score(y_test,y_test_pred)

print(np.unique(y_train_pred))
print(np.unique(y_test_pred))

length of training set:  73
length of testing set:  25
[0 1]
[0]


In [17]:
model_details = model_details.append({'Model':'clf_logReg3','Strategy Used':'Down sampling majority class','Train accuracy':train_accuracy,'Test accuracy':test_accuracy},
                     ignore_index=True)

model_details

Unnamed: 0,Model,Strategy Used,Train accuracy,Test accuracy
0,clf_logReg1,,0.916667,0.936306
1,clf_logReg2,Up sampling minority class,0.481481,0.458333
2,clf_logReg3,Down sampling majority class,0.520548,0.4


### 3. Choose Different Evaluation Metric

We can select different metric such as <b>*AUC-ROC*</b> which is usually used to represent likelihood of a model distinguishing obersavations from two classes

In [18]:
from sklearn.metrics import roc_auc_score

In [19]:
y_train_aucroc = clf_logReg3.predict_proba(x_train)
y_train_score = [score[1] for score in y_train_aucroc]

print(roc_auc_score(y_train,y_train_score))

y_test_aucroc = clf_logReg3.predict_proba(x_test)
y_test_score = [score[1] for score in y_test_aucroc]

print(roc_auc_score(y_test,y_test_score))

0.5467571644042233
0.6133333333333333


In [20]:
model_details = model_details.append({'Model':'clf_logReg2','Strategy Used':'AUC-ROC Metric','Train accuracy':train_accuracy,'Test accuracy':test_accuracy},
                     ignore_index=True)

model_details

Unnamed: 0,Model,Strategy Used,Train accuracy,Test accuracy
0,clf_logReg1,,0.916667,0.936306
1,clf_logReg2,Up sampling minority class,0.481481,0.458333
2,clf_logReg3,Down sampling majority class,0.520548,0.4
3,clf_logReg2,AUC-ROC Metric,0.520548,0.4


### 4. Use Penalize Algorithm ( Cost-sensitive Training)

In [21]:
from sklearn.svm import SVC

In [22]:
data['Class'].value_counts()

0    576
1     49
Name: Class, dtype: int64

Now we will build Support vector machine classifier on imbalanced dataset 

In [23]:
x=data[data.columns.difference(['Class'])]
y=data['Class']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=100)

print("length of training set: ",len(x_train))
print("length of testing set: ",len(x_test))

## setting class_weight='balanced' -> it will increase cost of misclassification on minority class

clf_svc = SVC(class_weight='balanced',probability=True).fit(x_train,y_train)

y_train_pred = clf_svc.predict(x_train)
y_test_pred =  clf_svc.predict(x_test)

train_accuracy = accuracy_score(y_train,y_train_pred)
test_accuracy = accuracy_score(y_test,y_test_pred)

print(np.unique(y_train_pred))
print(np.unique(y_test_pred))


length of training set:  468
length of testing set:  157
[0 1]
[0 1]


In [24]:
model_details = model_details.append({'Model':'clf_svc','Strategy Used':'Penalized-SVM Classifier','Train accuracy':train_accuracy,'Test accuracy':test_accuracy},
                     ignore_index=True)

model_details

Unnamed: 0,Model,Strategy Used,Train accuracy,Test accuracy
0,clf_logReg1,,0.916667,0.936306
1,clf_logReg2,Up sampling minority class,0.481481,0.458333
2,clf_logReg3,Down sampling majority class,0.520548,0.4
3,clf_logReg2,AUC-ROC Metric,0.520548,0.4
4,clf_svc,Penalized-SVM Classifier,0.688034,0.713376


### 5.Use Tree-based Models

Now we can use ensemble learning such as random forest classifier 

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
clf_rf = RandomForestClassifier().fit(x_train,y_train)

y_train_pred = clf_rf.predict(x_train)
y_test_pred =  clf_rf.predict(x_test)

train_accuracy = accuracy_score(y_train,y_train_pred)
test_accuracy = accuracy_score(y_test,y_test_pred)

print(np.unique(y_train_pred))
print(np.unique(y_test_pred))

[0 1]
[0 1]


In [28]:
model_details = model_details.append({'Model':'clf_rf','Strategy Used':'Random forest classifier','Train accuracy':train_accuracy,'Test accuracy':test_accuracy},
                     ignore_index=True)

model_details

Unnamed: 0,Model,Strategy Used,Train accuracy,Test accuracy
0,clf_logReg1,,0.916667,0.936306
1,clf_logReg2,Up sampling minority class,0.481481,0.458333
2,clf_logReg3,Down sampling majority class,0.520548,0.4
3,clf_logReg2,AUC-ROC Metric,0.520548,0.4
4,clf_svc,Penalized-SVM Classifier,0.688034,0.713376
5,clf_rf,Random forest classifier,1.0,0.923567


Here we can see that ensemble technique has given better accuracy on test set