### 0. Load dependencies

In [1]:
import pandas as pd
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import pickle

### 1. Load data

In [2]:
# Load data
names = ['age','workclass','fnlwgt','education','education-num','marital-status',
         'occupation','relationship', 'race','sex','capital-gain','capital-loss',
         'hours-per-week','native-country','income']

train = pd.read_csv('data/adult/adult.data', sep=',\s', names=names, header=None)
test = pd.read_csv('data/adult/adult.test', sep=',\s', names=names, header=None)
train_len = train.shape[0]
data = pd.concat([train, test], axis=0)

print('original_train.shape', train.shape,'\n',
      'original_test.shape', test.shape,'\n',
      'original_data.shape', data.shape)

  train = pd.read_csv('data/adult/adult.data', sep=',\s', names=names, header=None)
  test = pd.read_csv('data/adult/adult.test', sep=',\s', names=names, header=None)


original_train.shape (32561, 15) 
 original_test.shape (16281, 15) 
 original_data.shape (48842, 15)


### 2. Preprocessing and data augmentation

In [3]:
# Preprocessing

# drop null values
data.replace('?', np.nan, inplace=True)
data.dropna(inplace=True)
# drop duplicated rows
data = data.drop_duplicates()
# replace "<=50k" and ">50k" with 0 and 1
data.replace(to_replace=['<=50K', '<=50K.', '>50K', '>50K.'], 
             value=[0, 0, 1, 1], inplace=True)
# encode categorical features
categorical_df = data.select_dtypes(object)
numerical_df = data.select_dtypes(int, float)
lab_enc = LabelEncoder()

for column in categorical_df:
    categorical_df[column] = lab_enc.fit_transform(categorical_df[column])

data = pd.concat([categorical_df, numerical_df], axis=1)

In [4]:
# Sanity check: data imbalanced

train, test = data[:train_len], data[train_len:]

print(sum(train['income'] == 0) / len(train))
print(sum(test['income'] == 0) / len(test))

0.7508676023463653
0.755085886171139


In [5]:
# train test split

train, test = data[:train_len], data[train_len:]
n_samples = 20
ratio = 0.80

n_trn_per_sample = 1000
n_trn0 = int(n_trn_per_sample * ratio)
n_trn1 = n_trn_per_sample - n_trn0

n_tst_per_sample = 500
n_tst0 = int(n_tst_per_sample * ratio)
n_tst1 = n_tst_per_sample - n_tst0

num_neighbors = 3

for i in range(n_samples):
    
    print(' ')
    print('sample', i+1, '...')
    
    train0 = train[train['income'] == 0]
    train1 = train[train['income'] == 1]
    
    test0 = test[test['income'] == 0]
    test1 = test[test['income'] == 1]
    
    train0_sample = train0.sample(n=n_trn0, replace=False, random_state=i)
    train1_sample = train1.sample(n=n_trn1, replace=False, random_state=i)
    train_sample = pd.concat([train0_sample, train1_sample])
    
    
    test0_sample = test0.sample(n=n_tst0, replace=False, random_state=i)
    test1_sample = test1.sample(n=n_tst1, replace=False, random_state=i)
    test_sample = pd.concat([test0_sample, test1_sample])
    
    X_trn, y_trn = train_sample[train_sample.columns.drop('income')], train_sample['income']
    scaler = MinMaxScaler()
    X_trn = scaler.fit_transform(X_trn)
    y_trn = np.asarray(y_trn)

    X_val, y_val = test_sample[test_sample.columns.drop('income')], test_sample['income']
    scaler = MinMaxScaler()
    X_val = scaler.fit_transform(X_val)
    y_val = np.asarray(y_val)

    print('X_trn.shape', X_trn.shape, 'y_trn.shape', y_trn.shape, 
          'X_val.shape', X_val.shape, 'y_val.shape', y_val.shape)
    
    # Data augmentation using BorderlineSMOTE
    sm = BorderlineSMOTE(random_state=i, k_neighbors=num_neighbors)
    X_res, y_res = sm.fit_resample(X_trn, y_trn)
    X_ori = X_res[:len(X_trn)]
    y_ori = y_res[:len(y_trn)]
    X_aug = X_res[len(X_trn):]
    y_aug = y_res[len(y_trn):]
    X_ori_aug = X_res
    y_ori_aug = y_res
    
    print('X_ori', X_ori.shape, 'y_ori', y_ori.shape, 
          'X_aug', X_aug.shape, 'y_aug', y_aug.shape,
          'X_ori_aug', X_ori_aug.shape, 'y_ori_aug', y_ori_aug.shape)
    
    mydict = {'X_ori': X_ori, 'X_aug': X_aug, 'X_ori_aug': X_ori_aug, 'X_val': X_val,
              'y_ori': y_ori, 'y_aug': y_aug, 'y_ori_aug': y_ori_aug, 'y_val': y_val}
    
    with open("data/adult_imbalanced_sample/augmented_sample{}_ratio={}_num_trn_per_sample={}_num_tst_per_sample={}.pickle".format(i+1, ratio, n_trn_per_sample, n_tst_per_sample), "wb") as fp:
        pickle.dump(mydict, fp)

 
sample 1 ...
X_trn.shape (1000, 14) y_trn.shape (1000,) X_val.shape (500, 14) y_val.shape (500,)
X_ori (1000, 14) y_ori (1000,) X_aug (600, 14) y_aug (600,) X_ori_aug (1600, 14) y_ori_aug (1600,)
 
sample 2 ...
X_trn.shape (1000, 14) y_trn.shape (1000,) X_val.shape (500, 14) y_val.shape (500,)
X_ori (1000, 14) y_ori (1000,) X_aug (600, 14) y_aug (600,) X_ori_aug (1600, 14) y_ori_aug (1600,)
 
sample 3 ...
X_trn.shape (1000, 14) y_trn.shape (1000,) X_val.shape (500, 14) y_val.shape (500,)
X_ori (1000, 14) y_ori (1000,) X_aug (600, 14) y_aug (600,) X_ori_aug (1600, 14) y_ori_aug (1600,)
 
sample 4 ...
X_trn.shape (1000, 14) y_trn.shape (1000,) X_val.shape (500, 14) y_val.shape (500,)
X_ori (1000, 14) y_ori (1000,) X_aug (600, 14) y_aug (600,) X_ori_aug (1600, 14) y_ori_aug (1600,)
 
sample 5 ...
X_trn.shape (1000, 14) y_trn.shape (1000,) X_val.shape (500, 14) y_val.shape (500,)
X_ori (1000, 14) y_ori (1000,) X_aug (600, 14) y_aug (600,) X_ori_aug (1600, 14) y_ori_aug (1600,)
 
sample 6

### 3. Test the performance of logistic regression

In [6]:
# Performance of the logistic regression

from sklearn.linear_model import LogisticRegression

for i in range(n_samples):
    
    with open("data/adult_imbalanced_sample/augmented_sample{}_ratio={}_num_trn_per_sample={}_num_tst_per_sample={}.pickle".format(i+1, ratio, n_trn_per_sample, n_tst_per_sample), "rb") as fp:
        mydict = pickle.load(fp)
    
    X_ori = mydict['X_ori']
    y_ori = mydict['y_ori']
    X_aug = mydict['X_aug']
    y_aug = mydict['y_aug']
    X_ori_aug = mydict['X_ori_aug']
    y_ori_aug = mydict['y_ori_aug']
    X_val = mydict['X_val']
    y_val = mydict['y_val']
    
    print(' ')
    print('Sample', i+1)
    
    lr = LogisticRegression(solver='liblinear', max_iter=5000, random_state=666)
    lr.fit(X_ori, y_ori)
    preds = lr.predict(X_val)
    print('AUC score for original dataset:', round(roc_auc_score(y_val, preds), 4))

    lr = LogisticRegression(solver='liblinear', max_iter=5000, random_state=666)
    lr.fit(X_ori_aug, y_ori_aug)
    preds = lr.predict(X_val)
    print('AUC score for original and augmented dataset:', round(roc_auc_score(y_val, preds), 4))

 
Sample 1
AUC score for original dataset: 0.6375
AUC score for original and augmented dataset: 0.7125
 
Sample 2
AUC score for original dataset: 0.56
AUC score for original and augmented dataset: 0.7012
 
Sample 3
AUC score for original dataset: 0.6325
AUC score for original and augmented dataset: 0.7337
 
Sample 4
AUC score for original dataset: 0.59
AUC score for original and augmented dataset: 0.7312
 
Sample 5
AUC score for original dataset: 0.655
AUC score for original and augmented dataset: 0.745
 
Sample 6
AUC score for original dataset: 0.6525
AUC score for original and augmented dataset: 0.755
 
Sample 7
AUC score for original dataset: 0.6575
AUC score for original and augmented dataset: 0.7813
 
Sample 8
AUC score for original dataset: 0.6175
AUC score for original and augmented dataset: 0.6838
 
Sample 9
AUC score for original dataset: 0.5913
AUC score for original and augmented dataset: 0.7388
 
Sample 10
AUC score for original dataset: 0.6
AUC score for original and augme