In [1]:
from sklearn.datasets import make_classification
from matplotlib import pyplot as plt
import skoot
%matplotlib inline

# Record which skoot build produced the outputs shown in this notebook
skoot_version = skoot.__version__
print("Skoot version: %s" % skoot_version)

Skoot version: 0.19.2-dev1


# Create some imbalanced data

In [3]:
from sklearn.model_selection import train_test_split

# Single seed reused by every stochastic step in the notebook
seed = 42

# Synthetic two-feature, two-class problem (no redundant or repeated
# features) with the class weights skewed heavily toward the first class
X, y = make_classification(
    n_samples=100000,
    n_features=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    weights=[.999],
    random_state=seed,
)

# Hold out 20% for testing; stratify so both splits keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=seed)

# Sanity check: both splits must contain some positive samples
n_positive_train = (y_train == 1).sum()
n_positive_test = (y_test == 1).sum()
print("n_positive (train): %i" % n_positive_train)
print("n_positive (test): %i" % n_positive_test)

n_positive (train): 467
n_positive (test): 117


# Model with no balancing

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: train directly on the imbalanced training split
clf = LogisticRegression()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# Report accuracy next to the count of predicted positives so the two
# numbers can be read together
baseline_accuracy = accuracy_score(y_test, preds)
n_predicted_positive = (preds == 1).sum()
print("Accuracy: %.3f" % baseline_accuracy)
print("Num positive predictions: %i" % n_predicted_positive)

Accuracy: 0.994
Num positive predictions: 0


## Balance our classes

In [9]:
from skoot.balance import under_sample_balance
from skoot.balance import over_sample_balance
from skoot.balance import smote_balance

# Keyword arguments shared by all three balancing strategies
balance_kwargs = dict(balance_ratio=0.3, random_state=seed)

# Under-sampled training set
X_train_under, y_train_under = under_sample_balance(
    X_train, y_train, **balance_kwargs)

# Over-sampled training set
X_train_over, y_train_over = over_sample_balance(
    X_train, y_train, **balance_kwargs)

# SMOTE training set (additionally takes a neighbor count)
X_train_smote, y_train_smote = smote_balance(
    X_train, y_train, n_neighbors=10, **balance_kwargs)

### Fit & evaluate balanced datasets/models

In [12]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

def do_fit(train_array, train_labels):
    """Cross-validate and fit a logistic regression on one training set.

    Prints the 3-fold stratified cross-validation accuracy scores computed
    on the supplied training data, then fits a fresh model on all of it and
    prints how many samples of the notebook-level ``X_test`` are predicted
    as class 1, followed by a blank line.

    Parameters
    ----------
    train_array : array-like
        Training features.
    train_labels : array-like
        Training labels aligned with ``train_array``.
    """
    # StratifiedKFold only consults random_state when shuffle=True. The
    # original call passed random_state without shuffling, which had no
    # effect on the folds and is rejected with a ValueError by recent
    # scikit-learn releases. Leaving it out keeps the same deterministic,
    # unshuffled folds and lets the cell run on current versions.
    folds = StratifiedKFold(n_splits=3)

    # Get CV scores
    print(cross_val_score(LogisticRegression(), train_array, train_labels,
                          scoring="accuracy", cv=folds))

    # Refit on the full training set and count positive test predictions
    clf = LogisticRegression().fit(train_array, train_labels)
    preds = clf.predict(X_test)
    print("Num positive predictions: %i" % (preds == 1).sum())
    print()
    
# Evaluate each balanced training set in turn: under-sampled, over-sampled, SMOTE
balanced_sets = (
    (X_train_under, y_train_under),
    (X_train_over, y_train_over),
    (X_train_smote, y_train_smote),
)
for balanced_X, balanced_y in balanced_sets:
    do_fit(balanced_X, balanced_y)

[ 0.77777778  0.78518519  0.77860327]
Num positive predictions: 34

[ 0.77730385  0.77689763  0.77747795]
Num positive predictions: 16

[ 0.77846448  0.77741992  0.77797122]
Num positive predictions: 12

