In [1]:
import numpy as np
from numpy.random import multivariate_normal
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
import plotly.plotly as py
from plotly.graph_objs import Scatter, Data

def plot_data(X, y):
    '''
    Builds a plotly figure payload for a 2-dimensional, two-class dataset.

    One marker trace is created per class label (0 first, then 1); the
    first column of X is used as the x coordinate and the second column
    as the y coordinate. The traces are returned wrapped in a plotly
    Data object and are not rendered here.
    '''
    traces = []
    for label in (0, 1):
        in_class = (y == label)
        traces.append(Scatter(x = X[:,0][in_class],
                              y = X[:,1][in_class],
                              mode = 'markers'))
    return Data(traces)

In [2]:
# Generate dataset
def generate_data(positive_qty, negative_qty):
    '''
    Samples a two-class dataset from two 2-dimensional Gaussians.

    The positive class (label 1) is centred at (-10, -10) with variance 10
    on each axis; the negative class (label 0) is centred at (10, 10) with
    variance 70 on each axis. Positive rows come first in the output.

    Returns X with shape (positive_qty + negative_qty, 2) and the matching
    label vector y.
    '''
    # (mean, per-axis variance, sample count), positives drawn first so the
    # random stream is consumed in the same order on every call.
    class_specs = (
        ((-10, -10), 10, positive_qty),
        ((10, 10), 70, negative_qty),
    )
    clusters = [multivariate_normal(mean, np.diag([variance, variance]), qty)
                for mean, variance, qty in class_specs]
    X = np.concatenate(clusters, axis=0)

    labels = [1]*positive_qty
    labels.extend([0]*negative_qty)
    y = np.array(labels)

    return X, y

# Training set: 10 positives against 100x as many negatives (1:100 imbalance).
positive_qty_train = 10
negative_qty_train = positive_qty_train*100
# Test set: same 1:100 ratio, but much larger (100,000 positives and
# 10,000,000 negatives) so the reported metrics are stable.
positive_qty_test = 100000
negative_qty_test = positive_qty_test*100
# NOTE(review): no RNG seed is set in the visible cells, so the sampled data
# (and every score below) changes from run to run.
X, y = generate_data(positive_qty_train, negative_qty_train)
X_test, y_test = generate_data(positive_qty_test, negative_qty_test)

# Scatter plot of the training set, one trace per class.
data = plot_data(X, y)
py.iplot(data)

In [3]:
# Baseline: logistic regression with default settings, fit on the imbalanced
# training set as-is and scored on the large test set.
model = LogisticRegression()
model.fit(X, y)
# y_pred is reassigned by each later experiment cell.
y_pred = model.predict(X_test)
print 'Basic Logistic Regression Precision', precision_score(y_test, y_pred)
print 'Basic Logistic Regression Recall', recall_score(y_test, y_pred)
print "Basic Logistic Regression F1 Score", f1_score(y_test, y_pred)

Basic Logistic Regression Precision 0.841010711234
Basic Logistic Regression Recall 0.84641
Basic Logistic Regression F1 Score 0.843701717487


In [4]:
# Logistic regression with undersampling
def undersample(X, y, majority_weight=.5):
    '''
    Randomly discards observations from the majority class so that the
    majority class makes up roughly `majority_weight` of the output.

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
    y : numpy array, shape (n_samples,)
        Binary labels encoded as 0/1.
    majority_weight : float in [0, 1)
        Desired share of majority-class rows in the output.

    Returns
    -------
    X, y : the retained majority rows (random subset, drawn without
        replacement) followed by every minority row in its original order.
        If the requested share would need more majority rows than exist,
        all of them are kept.

    Raises
    ------
    ValueError
        If `majority_weight` is outside [0, 1).
    '''
    if not 0 <= majority_weight < 1:
        raise ValueError('majority_weight must be in [0, 1)')
    if len(y) == 0:
        # Nothing to sample from.
        return X, y

    # With 0/1 labels the mean is the share of 1s, so the majority label is
    # 1 exactly when that share reaches one half.
    class_ratio = y.sum()/float(len(y))
    majority_class_label = 1 if class_ratio >= .5 else 0
    is_majority = (y == majority_class_label)
    X_majority = X[is_majority]
    y_majority = y[is_majority]
    maj_count = len(X_majority)
    min_count = len(X) - maj_count

    # keep/(keep + min_count) == majority_weight  =>  keep = min_count*w/(1-w).
    # np.random.choice needs an integer size, and round() yields a float on
    # Python 2, hence the explicit int(). Undersampling cannot create rows,
    # so the request is capped at the number of majority rows available.
    keep_count = int(round(min_count*(majority_weight/(1. - majority_weight))))
    keep_count = min(keep_count, maj_count)
    sample_indices = np.random.choice(maj_count,
                                      size=keep_count,
                                      replace=False)
    X_majority = X_majority[sample_indices]
    y_majority = y_majority[sample_indices]
    X = np.vstack((X_majority, X[~is_majority]))
    y = np.hstack((y_majority, y[~is_majority]))
    return X, y

# Rebalance the training set to a 50/50 class mix by dropping majority rows,
# then fit and score a fresh logistic regression on the untouched test set.
X_undersampled, y_undersampled = undersample(X, y, majority_weight=.5)
undersampled_model = LogisticRegression()
undersampled_model.fit(X_undersampled, y_undersampled)
y_pred = undersampled_model.predict(X_test)
print 'Undersampled Logistic Regression Precision', precision_score(y_test, y_pred)
print 'Undersampled Logistic Regression Recall', recall_score(y_test, y_pred)
print "Undersampled Logistic Regression F1 Score", f1_score(y_test, y_pred)

# Plot what is left of the training data after undersampling.
data = plot_data(X_undersampled, y_undersampled)
py.iplot(data)

Undersampled Logistic Regression Precision 0.124622034545
Undersampled Logistic Regression Recall 0.99987
Undersampled Logistic Regression F1 Score 0.221621549735



using a non-integer number instead of an integer will result in an error in the future



In [5]:
# Logistic regression with oversampling
def oversample(X, y, minority_weight=.5):
    '''
    Duplicates observations from the minority class so that the minority
    class makes up roughly `minority_weight` of the output.

    The whole minority class is repeated an integer number of times, so the
    achieved share is only approximately the requested one. The minority
    class is never shrunk: at least one copy is always kept.

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
    y : numpy array, shape (n_samples,)
        Binary labels encoded as 0/1.
    minority_weight : float in [0, 1)
        Desired share of minority-class rows in the output.

    Returns
    -------
    X, y : the repeated minority rows followed by every majority row in its
        original order.

    Raises
    ------
    ValueError
        If `minority_weight` is outside [0, 1) or there are no minority
        observations to duplicate.
    '''
    if not 0 <= minority_weight < 1:
        raise ValueError('minority_weight must be in [0, 1)')
    if len(y) == 0:
        raise ValueError('cannot oversample an empty dataset')

    # With 0/1 labels the mean is the share of 1s, so the majority label is
    # 1 exactly when that share reaches one half.
    class_ratio = y.sum()/float(len(y))
    majority_class_label = 1 if class_ratio >= .5 else 0
    is_minority = (y != majority_class_label)
    X_minority = X[is_minority]
    y_minority = y[is_minority]
    min_count = len(X_minority)
    maj_count = len(X) - min_count
    if min_count == 0:
        raise ValueError('no minority observations to duplicate')

    # np.tile needs integer repetition counts, and round() yields a float on
    # Python 2, hence the explicit int(). A count of 0 would silently delete
    # the minority class, so it is floored at 1.
    copies = int(round((maj_count/float(min_count))*(minority_weight/(1. - minority_weight))))
    copies = max(copies, 1)
    X_minority = np.tile(X_minority, (copies, 1))
    y_minority = np.tile(y_minority, copies)
    X = np.vstack((X_minority, X[~is_minority]))
    y = np.hstack((y_minority, y[~is_minority]))
    return X, y

# Rebalance the training set to a 50/50 class mix by repeating minority rows,
# then fit and score a fresh logistic regression on the untouched test set.
X_oversampled, y_oversampled = oversample(X, y, minority_weight=.5)
oversampled_model = LogisticRegression()
oversampled_model.fit(X_oversampled, y_oversampled)

y_pred = oversampled_model.predict(X_test)
print 'Oversampled Logistic Regression Precision', precision_score(y_test, y_pred)
print 'Oversampled Logistic Regression Recall', recall_score(y_test, y_pred)
print "Oversampled Logistic Regression F1 Score", f1_score(y_test, y_pred)


Oversampled Logistic Regression Precision 0.450304003464
Oversampled Logistic Regression Recall 0.99836
Oversampled Logistic Regression F1 Score 0.620662215425


In [6]:
# Logistic regression with SMOTE (Synthetic Minority Oversampling Technique)

def smote(X, y, minority_weight=.5):
    '''
    Generates synthetic minority-class observations so that the minority
    class makes up roughly `minority_weight` of the output.

    Each synthetic point is built from a minority "seed" point and one of
    its nearest minority neighbours: every feature is moved from the seed
    towards the neighbour by its own random fraction in [0, 1), so the new
    point lies inside the axis-aligned box spanned by the two points.

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
    y : numpy array, shape (n_samples,)
        Binary labels encoded as 0/1.
    minority_weight : float in [0, 1)
        Desired share of minority-class rows in the output.

    Returns
    -------
    X, y : the original observations followed by the synthetic ones, which
        carry the minority label. The inputs are returned unchanged when
        the minority class already meets the requested share.

    Raises
    ------
    ValueError
        If `minority_weight` is outside [0, 1) or there are no minority
        observations to interpolate between.
    '''
    if not 0 <= minority_weight < 1:
        raise ValueError('minority_weight must be in [0, 1)')
    if len(y) == 0:
        raise ValueError('cannot apply SMOTE to an empty dataset')

    # compute number of new examples required
    # With 0/1 labels the mean is the share of 1s, so the majority label is
    # 1 exactly when that share reaches one half.
    class_ratio = y.sum()/float(len(y))
    majority_class_label = 1 if class_ratio >= .5 else 0
    is_minority = (y != majority_class_label)
    X_minority = X[is_minority]
    y_minority = y[is_minority]
    min_count = len(X_minority)
    maj_count = len(X) - min_count
    if min_count == 0:
        raise ValueError('no minority observations to interpolate between')
    scaling_factor = (maj_count/float(min_count))*(minority_weight/(1. - minority_weight))
    # int() because round() yields a float on Python 2 and the value is used
    # as an array size below.
    new_observations_target = int(round(scaling_factor*min_count)) - min_count
    if new_observations_target <= 0:
        return X, y

    # train KNN (only used for its neighbour search, on minority rows alone)
    knn_model = KNeighborsClassifier(n_neighbors=int(round(min_count**.5)))
    knn_model.fit(X_minority, y_minority)
    if new_observations_target < min_count:
        # Fewer new points than minority rows: use a random subset as seeds.
        sample_indices = np.random.choice(min_count,
                                          size=new_observations_target,
                                          replace=False)
        smote_samples = X_minority[sample_indices]
    else:
        smote_samples = X_minority
    # Row i holds indices into X_minority (the fitted data) of the nearest
    # neighbours of smote_samples[i]. The seeds are themselves fitted
    # points, so a seed can appear among its own neighbours; the synthetic
    # point then coincides with the seed.
    neighbors = knn_model.kneighbors(smote_samples)[1]

    # generate new samples
    # Cycle through the seeds until the target count is reached, picking one
    # random neighbour per synthetic point.
    seed_rows = np.arange(new_observations_target) % len(smote_samples)
    neighbor_cols = np.random.randint(neighbors.shape[1],
                                      size=new_observations_target)
    seed_points = smote_samples[seed_rows]
    neighbor_points = X_minority[neighbors[seed_rows, neighbor_cols]]
    gaps = np.random.random(size=seed_points.shape)
    new_observations = seed_points + (neighbor_points - seed_points)*gaps

    X = np.vstack((X, new_observations))
    # Repeat an existing minority label so y keeps its original dtype.
    y = np.hstack((y, np.repeat(y_minority[0], new_observations_target)))

    return X, y

# Rebalance the training set to a 50/50 class mix by adding synthetic
# minority points, then fit and score a fresh logistic regression on the
# untouched test set.
X_smote, y_smote = smote(X, y, minority_weight=.5)
smote_model = LogisticRegression()
smote_model.fit(X_smote, y_smote)
y_pred = smote_model.predict(X_test)
print 'SMOTE Logistic Regression Precision', precision_score(y_test, y_pred)
print 'SMOTE Logistic Regression Recall', recall_score(y_test, y_pred)
print "SMOTE Logistic Regression F1 Score", f1_score(y_test, y_pred)

# Plot the augmented training data (original plus synthetic points).
data = plot_data(X_smote, y_smote)
py.iplot(data)

SMOTE Logistic Regression Precision 0.460689406844
SMOTE Logistic Regression Recall 0.99795
SMOTE Logistic Regression F1 Score 0.630375117254
