#### **Optional**: Upsampling

**CAUTION**: The following cell takes $\approx 6$ minutes to run.

In [None]:
# Toggle for the (slow) SMOTE-NC upsampling. When False, nothing below runs and
# X_smote / y_smote are expected to come from the CSV files loaded further down.
run_smote = False
if run_smote:
    # Resample ONLY the rows that are not held out for validation.
    # fit_resample on shuffled chunks followed by concat(ignore_index=True) yields
    # a fresh 0..N-1 index, so validation rows can no longer be identified (and
    # removed) afterwards; upsampling them would leak validation data -- and
    # synthetic points interpolated from it -- into the training set.
    # NOTE(review): assumes val_idx holds index labels of X, the same convention
    # the downstream `index.isin(val_idx)` filter relies on -- confirm.
    train_mask = ~X.index.isin(val_idx)
    X_fit, y_fit = X.loc[train_mask], y.loc[train_mask]

    # define categorical features for SMOTE-NC: every column that is not numerical.
    # Built in column order (instead of via a set difference) so the result is
    # identical across kernel sessions.
    numerical_set = set(numerical_columns)
    categorical_columns = [col for col in X_fit.columns if col not in numerical_set]
    # convert categorical feature names to positions, as SMOTENC expects indices
    categorical_indices = [
        pos for pos, col in enumerate(X_fit.columns) if col not in numerical_set
    ]

    # split the training rows into two chunks that are resampled separately
    X_1, X_2, y_1, y_2 = train_test_split(X_fit, y_fit, test_size=0.5, random_state=42)

    # define SMOTE-NC; the same configuration is re-fitted on each chunk
    smote_nc = SMOTENC(categorical_features=categorical_indices, random_state=42)
    # apply SMOTE-NC to first chunk
    X_smote_1, y_smote_1 = smote_nc.fit_resample(X_1, y_1)
    # apply SMOTE-NC to second chunk
    X_smote_2, y_smote_2 = smote_nc.fit_resample(X_2, y_2)

    # concatenate the results; the combined data gets a fresh RangeIndex
    X_smote = pd.concat([X_smote_1, X_smote_2], ignore_index=True)
    y_smote = pd.concat([y_smote_1, y_smote_2], ignore_index=True)

    # report the class balance of the rows that were actually resampled
    print(f"\n Target class distribution before SMOTENC upsampling: {y_fit.value_counts()}")
    print(f"\n Target class distribution after SMOTENC upsampling: {y_smote.value_counts()}")
    print(f"\nNumber of NaN values: {X_smote.isnull().sum().sum()}")

- Target class distribution before SMOTENC upsampling: TARGET
  - 0:    279864
  - 1:     24667
  - Name: count, dtype: int64
- Target class distribution after SMOTENC upsampling: TARGET
  - 0:    279864
  - 1:    279864
  - Name: count, dtype: int64
- Number of NaN values: 0

In [None]:
# optionally save data for easier access (uncomment after running SMOTE-NC)
# X_smote.to_csv(data_dir + "X_smote.csv", index=False)
# y_smote.to_csv(data_dir + "y_smote.csv", index=False)

# Load the cached SMOTE data only when it was not recomputed in this session.
# Reading unconditionally would overwrite a fresh result with a possibly stale
# cache, and would fail outright if the CSV files have not been written yet.
if not run_smote:
    # The files are written with index=False, so both frames get a fresh RangeIndex.
    X_smote = pd.read_csv(data_dir + "X_smote.csv")
    y_smote = pd.read_csv(data_dir + "y_smote.csv")

In [None]:
# Keep only the upsampled rows whose index label does not appear in val_idx.
# NOTE(review): X_smote / y_smote carry a fresh 0..N-1 index (concat with
# ignore_index=True, or a CSV written with index=False), so these labels
# presumably no longer correspond to rows of the original data -- confirm that
# this filter really removes the validation samples.
smote_feature_rows_kept = ~X_smote.index.isin(val_idx)
smote_target_rows_kept = ~y_smote.index.isin(val_idx)
X_train_smote = X_smote.loc[smote_feature_rows_kept]
y_train_smote = y_smote.loc[smote_target_rows_kept]

In [None]:
# Logistic Regression fitted on the upsampled training set and scored on the
# original validation data. C is scikit-learn's inverse regularization
# strength, so C=0.0001 means a heavily regularized model.
clf_smote, metrics_clf_smote, conf_matrix_clf_smote = trainEvaluate(
    model=LogisticRegression(C=0.0001, random_state=0),
    X_train=X_train_smote,
    y_train=y_train_smote,
    X_val=X_val,
    y_val=y_val,
    plot_results=True,
)

**CAUTION**: The following cell takes $\approx 6$ minutes to run.

In [None]:
# Random Forest (100 trees, fixed seed) fitted on the upsampled training set
# and scored on the original validation data.
rfc_smote, metrics_rfc_smote, conf_matrix_rfc_smote = trainEvaluate(
    model=RandomForestClassifier(n_estimators=100, random_state=0),
    X_train=X_train_smote,
    y_train=y_train_smote,
    X_val=X_val,
    y_val=y_val,
    plot_results=True,
)

With the upsampled data, the performance of both the *Logistic Regression* and *Random Forest* models is much better.