# Simple upscaling for unbalanced Datasets
- Author(s): Thomas Glanzer
- Creation : Nov 2021

In [163]:
# Import libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, average_precision_score, make_scorer
from sklearn.preprocessing import OneHotEncoder
# Define seed for random states
SEED = 42

In [164]:
# Define functions

def precision_at_recall(y_true, y_proba, target_recall=0.5, return_all=False):
    """Scorer that return the precision for a given target recall value.

    Args:
        y_true:  numpy array - true y values
        y_proba: numpy array - predicted y value probabilities
        target_recall: float - target recall to achieve
        return_all: bool - whether to also return the actual target value and threshold used
    Returns:
        precision: float - the precision score at the given target recall
        recall   : float - (optional) the actual recall score (>= target recall)
        threshold: float - (optional) the threshold used for the recall/precision scores
    """
    # Store y true and proba values in a df ...
    scores = pd.DataFrame({'y_true': y_true, 'y_proba': y_proba})
    # ... sort values by descending y probas (and reset index)
    scores.sort_values(by=['y_proba', 'y_true'], ascending=False, inplace=True, ignore_index=True)
    # Create col including cummulated y true values
    scores['y_true_sum'] = scores['y_true'].cumsum()
    # Reduce df size by only keeping last value for a given proba thresh
    scores = scores.drop_duplicates(subset='y_proba', keep='last').drop('y_true', axis=1)
    # Calculate precision / recall for relevant entries
    scores['precision'] = scores.y_true_sum / (scores.index + 1)
    scores['recall'] = scores.y_true_sum / sum(y_true)

    try:
        # Get result by getting score from dataframe that meets target requirements (and downcast to float32)
        result = (scores.loc[scores.recall >= target_recall]).iloc[0].astype('float32')
        recall = result.recall
        precision = result.precision
        thresh = result.y_proba
    except IndexError:
        print('No solution could be found for given target value.')
        recall = None
        precision = None
        thresh = None

    if return_all:
        return precision, recall, thresh
    else:
        return precision


def recall_at_precision(y_true, y_proba, target_precision=0.5, return_all=False):
    """Scorer that return the precision for a given target recall value.

    Args:
        y_true:  numpy array - true y values
        y_proba: numpy array - predicted y value probabilities
        target_precision: float - target precision to achieve
        return_all: bool - whether to also return the actual target value and threshold used
    Returns:
        recall   : float - the recall score at the given target precision
        precision: float - (optional) the actual precision score (>= target precision)
        threshold: float - (optional) the threshold used for the recall/precision scores
    """
    # Store y true and proba values in a df ...
    scores = pd.DataFrame({'y_true': y_true, 'y_proba': y_proba})
    # ... sort values by descending y probas (and reset index)
    scores.sort_values(by=['y_proba', 'y_true'], ascending=False, inplace=True, ignore_index=True)
    # Create col including cummulated y true values
    scores['y_true_sum'] = scores['y_true'].cumsum()
    # Reduce df size by only keeping last value for a given proba thresh
    scores = scores.drop_duplicates(subset='y_proba', keep='last').drop('y_true', axis=1)
    # Calculate precision / recall for relevant entries
    scores['precision'] = scores.y_true_sum / (scores.index + 1)
    scores['recall'] = scores.y_true_sum / sum(y_true)

    # Get result by getting score from dataframe that meets target requirements (and downcast to float32)
    try:
        result = (scores.loc[scores.precision >= target_precision]).iloc[-1].astype('float32')
        recall = result.recall
        precision = result.precision
        thresh = result.y_proba
    except IndexError:
        print('No solution could be found for given target value.')
        recall = None
        precision = None
        thresh = None

    if return_all:
        return recall, precision, thresh
    else:
        return recall

In [165]:
# Load Healtcare-Stroke dataset (Source: Kaggle) and drop the included id column
df = pd.read_csv('.%sdata%shealthcare-dataset-stroke-data.csv' %(os.sep, os.sep), index_col='id')
# .... now show first three entries with column names
print(df.head(3).T)

id                           9046           51676         31112
gender                        Male         Female          Male
age                           67.0           61.0          80.0
hypertension                     0              0             0
heart_disease                    1              0             1
ever_married                   Yes            Yes           Yes
work_type                  Private  Self-employed       Private
Residence_type               Urban          Rural         Rural
avg_glucose_level           228.69         202.21        105.92
bmi                           36.6            NaN          32.5
smoking_status     formerly smoked   never smoked  never smoked
stroke                           1              1             1


In [166]:
# Lets define the stroke column as label 
label = 'stroke'
# ... now show the label-balance of the dataset
print('Label Balance in per Cent:')
print(df[label].value_counts()*100/(len(df)))
# ... also print NA values (to check if imputation has to be performed)
print('\nNA Values:')
print(df.isna().sum())


Label Balance in per Cent:
0    95.127202
1     4.872798
Name: stroke, dtype: float64

NA Values:
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


In [167]:
# Create base y, X ...
y = df[label]
X = df.drop([label], axis=1)
# ... then make a Train and Test sets by applying a stratified train test split with a 70:30 distribution
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=SEED)
# ... apply one-hot-encoding
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Identify columns that have low information value / may cause overfitting
drop_col_list = []
for col in X_train.select_dtypes(exclude='float').columns.tolist():
    if (X_train[col].value_counts()[1] <= 5):
        drop_col_list.append(col)
# Drop them from both train and test data
X_train = X_train.drop(drop_col_list, axis=1, errors='ignore')
X_test = X_test.drop(drop_col_list, axis=1, errors='ignore')
# Fill NA Values (because we use a RF, we let the RF handle the meaning of a missing value):
X_train = X_train.fillna(-1)
X_test = X_test.fillna(-1)


In [183]:
# Define a classifier with default config and train it
clf = RandomForestClassifier(random_state=SEED, n_jobs=-1)
clf.fit(X_train, y_train)

# Make predictions for the classifier, using a 0.5 thresh
thresh = 0.2
y_proba = clf.predict_proba(X_test)[:,1]
y_pred = (y_proba >= thresh)
# Show scores for the test set
for scorer in [balanced_accuracy_score, precision_score, recall_score, average_precision_score]:
    print(str(scorer.__name__), round(scorer(y_test,y_pred),3))
# Create custom classifier
print('precision_at_0_5_recall', precision_at_recall(y_test, y_proba, target_recall=0.10))
print('recall_at_0_5_precision', recall_at_precision(y_test, y_proba, target_precision=0.3))

balanced_accuracy_score 0.602
precision_score 0.18
recall_score 0.267
average_precision_score 0.084
precision_at_0_5_recall 0.30769232
recall_at_0_5_precision 0.10666667


Unnamed: 0_level_0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Male,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
35602,52.0,0,0,107.84,22.0,0,1,0,0,0,0,0,1,0,0,0
56137,62.0,0,0,88.32,36.3,0,1,0,1,0,0,1,0,0,0,0
14063,81.0,0,1,95.49,29.4,1,0,0,0,1,0,0,0,0,0,0
12857,55.0,0,0,73.57,28.0,1,1,0,0,1,0,0,0,0,1,0
5875,37.0,0,0,103.66,36.1,0,1,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67217,45.0,0,0,92.86,35.1,0,1,0,1,0,0,1,1,0,0,0
34376,16.0,0,0,113.47,19.5,0,0,0,0,0,1,0,0,0,0,0
10752,61.0,0,0,78.65,36.2,0,1,0,1,0,0,0,1,0,0,0
32503,80.0,0,0,76.57,34.1,0,1,0,0,1,0,1,0,1,0,1


In [229]:
# TODO not working yet
# Data Generation and Upscaling
# ... set upscale label (which values should be upscaled)
upsc_label = 1
# ... set upscale factor
upsc_factor = 15
# ... generate synthethic data
df_ext = pd.concat([X_train, y_train], axis=1)
df_ext = pd.concat([df_ext.loc[df_ext[label] == upsc_label]] * upsc_factor, ignore_index=True)
# ... identify suitable columns
float_cols = list(X_train.select_dtypes(include='float').columns)

# ..Generate gaussian noise mask with center 1 and a given stdev with shape of given df
noise = np.random.normal(1, 0.1, X_train_ext[float_cols].shape)
# Keep first iteration of input data unchanged (mask value = 1)
noise[:len(X_train),] = 1
# Show noise and max/min values of distribution
X_train_ext[float_cols] = X_train_ext[float_cols] * noise
print(X_train_ext)
#pd.DataFrame(noise).plot.hist(bins=20)

             age  hypertension  heart_disease  avg_glucose_level        bmi  \
0      52.000000             0              0         107.840000  22.000000   
1      62.000000             0              0          88.320000  36.300000   
2      81.000000             0              1          95.490000  29.400000   
3      55.000000             0              0          73.570000  28.000000   
4      37.000000             0              0         103.660000  36.100000   
...          ...           ...            ...                ...        ...   
53650  50.280739             0              0          89.586885  38.047127   
53651  14.507317             0              0         121.660465  21.405410   
53652  63.254259             0              0          73.965026  35.059362   
53653  64.658763             0              0          81.554152  39.948909   
53654   7.287312             0              0          82.802560  17.085427   

       gender_Male  ever_married_Yes  work_type_Nev