In [16]:
from collections import Counter
import math
import pandas as pd

def print_hi():
    '''Write a fixed greeting to stdout and return None.'''
    greeting = 'hi baby'
    print(greeting)

def oversample_minority_class(data, outcome, p_minority, random_state=None):
    '''
    Oversample the minority class of a binary outcome.

    Rows belonging to the less frequent class are drawn with replacement and
    appended to ``data`` until that class makes up (approximately, because the
    target row count is rounded up) ``p_minority`` of all rows. Rows are only
    ever added, never removed: if the minority class already meets or exceeds
    the requested share, the data is returned unchanged.

    Parameters
    ----------
    data: pandas dataframe
        dataframe to be resampled
    outcome: str
        name of the outcome column
    p_minority: float
        proportion of the rows that we want the minority class
        to make up after we are done resampling. Must satisfy
        ``0 <= p_minority < 1``.
    random_state: optional
        forwarded to ``DataFrame.sample`` so the draw can be made
        reproducible. Defaults to None (non-deterministic draw).

    Returns
    -------
    pandas dataframe
        a new dataframe holding the original rows followed by the extra
        minority-class rows. The original index labels are kept, so the
        index of the result may contain duplicates.

    Raises
    ------
    ValueError
        if ``p_minority`` is outside the allowed range, or if the outcome
        column does not contain exactly two distinct classes.
    '''
    # `not (0 <= p <= 1)` matches the original `p > 1 or p < 0` test for
    # ordinary numbers and additionally rejects NaN.
    if not 0 <= p_minority <= 1:
        msg = f'Proportion out of bounds ! p_minority must be between 0 and 1, but value passed was {p_minority}.'
        raise ValueError(msg)
    # A share of exactly 1 would divide by zero below and can never be reached
    # by adding minority rows while majority rows are still present.
    if p_minority == 1:
        msg = (
            'Proportion out of bounds ! p_minority must be strictly less than 1, '
            f'but value passed was {p_minority}.'
        )
        raise ValueError(msg)

    outcome_counts = Counter(data[outcome])
    n_outcomes = len(outcome_counts)
    if n_outcomes != 2:
        msg = f'Binary outcome expected but specified outcome has {n_outcomes} classes'
        raise ValueError(msg)

    # Exactly two classes are present here, so the first entry is the most
    # frequent class and the second is the least frequent one.
    (_, majority_count), (minority_class, minority_count) = outcome_counts.most_common()

    # Solve majority_count / total == 1 - p_minority for the total row count.
    desired_total_count = math.ceil(majority_count / (1 - p_minority))
    n_samples = desired_total_count - majority_count - minority_count

    # The minority class already meets the target: nothing to add.
    if n_samples <= 0:
        return data.copy()

    minority_rows = data.loc[data[outcome] == minority_class]
    samples = minority_rows.sample(n_samples, replace=True, random_state=random_state)
    return pd.concat([data, samples])

In [17]:
# from collections import Counter
# import math
import pandas as pd
import pytest

# @pytest.fixture
def data():
    '''Build the small two-column frame used to exercise the resampling code.

    The outcome column ``y`` holds seven 0s followed by three 1s; ``x1`` is a
    single numeric feature column.
    '''
    outcome_values = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1]
    feature_values = [1, 2, 1, 2, 1, 2, 1, 5, 6, 5]
    return pd.DataFrame({'y': outcome_values, 'x1': feature_values})


import bank_deposit_classifier.sample as sample

def test_oversample_minority_class(data):
    '''With p_minority=0.5 the call returns a pandas DataFrame.'''
    resampled = sample.oversample_minority_class(data, 'y', 0.5)
    assert isinstance(resampled, pd.DataFrame)

def test_oversample_minority_class_high_p(data):
    '''A p_minority above 1 is rejected with the out-of-bounds ValueError.'''
    too_high = 1.5
    with pytest.raises(ValueError) as excinfo:
        sample.oversample_minority_class(data, 'y', too_high)
    message = str(excinfo.value)
    assert "Proportion out of bounds" in message

def test_oversample_minority_class_binary(data):
    '''An outcome column with a single class is rejected as non-binary.'''
    # Build the single-class frame outside the raises block so that an
    # unrelated ValueError during setup cannot satisfy pytest.raises.
    single_class = data.loc[data['y'] == 1]
    with pytest.raises(ValueError) as excinfo:
        sample.oversample_minority_class(single_class, 'y', 0.5)
    assert "Binary outcome expected" in str(excinfo.value)

In [18]:
# Build the toy frame (seven rows with y=0, three with y=1) and display it.
df = data()
df

Unnamed: 0,y,x1
0,0,1
1,0,2
2,0,1
3,0,2
4,0,1
5,0,2
6,0,1
7,1,5
8,1,6
9,1,5


In [19]:
# Oversample the minority class of the toy frame towards a 50% share.
oversample_minority_class(data=df, outcome='y', p_minority=0.5)

Unnamed: 0,y,x1
0,0,1
1,0,2
2,0,1
3,0,2
4,0,1
5,0,2
6,0,1
7,1,5
8,1,6
9,1,5


In [20]:
# Deliberately pass a proportion above 1 to show the bounds-check ValueError.
oversample_minority_class(data=df, outcome='y', p_minority=1.5)

ValueError: Proportion out of bounds ! p_minority must be between 0 and 1, but value passed was 1.5.