# Kaggle - Categorical Feature Encoding Challenge - Model Improvement
**Author: Chris Shin**

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
train = pd.read_csv('./data/train.csv', index_col='id')
test = pd.read_csv('./data/test.csv', index_col='id')
submission = pd.read_csv('./data/sample_submission.csv', index_col='id')

### Feature engineering with train/test combined or separately?

It is generally not recommended to perform feature engineering on the combined train and test data. This is because doing so can lead to data leakage, where the model inadvertently learns patterns or relationships between the features and the target variable that it should not have access to during training.

Instead, it is recommended to perform feature engineering separately on the train and test data sets. This ensures that the model only learns patterns from the training data, and that the test data remains truly unseen until model evaluation.

However, it is important to keep the feature engineering process consistent across both the train and test data sets, to ensure that the model can generalize well to new, unseen data.

To ensure that train and test have the same columns after feature engineering, you can follow these steps:

1. Perform all feature engineering steps on the train and test sets separately.
2. Identify the columns that were created in the train set after feature engineering.
3. Check if these columns exist in the test set after feature engineering. If a column does not exist in the test set, create that column in the test set with all zeros or some default value.
4. Repeat steps 2-3 for any new columns that were created in the test set after feature engineering.
5. Finally, reorder the columns in the test set to match the order of the columns in the train set.

In [3]:
train.dtypes

bin_0      int64
bin_1      int64
bin_2      int64
bin_3     object
bin_4     object
nom_0     object
nom_1     object
nom_2     object
nom_3     object
nom_4     object
nom_5     object
nom_6     object
nom_7     object
nom_8     object
nom_9     object
ord_0      int64
ord_1     object
ord_2     object
ord_3     object
ord_4     object
ord_5     object
day        int64
month      int64
target     int64
dtype: object

In [4]:
test.dtypes

bin_0     int64
bin_1     int64
bin_2     int64
bin_3    object
bin_4    object
nom_0    object
nom_1    object
nom_2    object
nom_3    object
nom_4    object
nom_5    object
nom_6    object
nom_7    object
nom_8    object
nom_9    object
ord_0     int64
ord_1    object
ord_2    object
ord_3    object
ord_4    object
ord_5    object
day       int64
month     int64
dtype: object

### Feature Engineering

In [5]:
pd.options.display.max_columns = 50

In [6]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy import sparse

def feature_engineer(data):
    df = data.copy()
    
    # Binary Encoding
    df['bin_3'] = df['bin_3'].map({'F':0, 'T':1})
    df['bin_4'] = df['bin_4'].map({'N':0, 'Y':1})

    # Ordinal Encoding
    ord1dict = {'Novice':0, 'Contributor':1, 
            'Expert':2, 'Master':3, 'Grandmaster':4}
    ord2dict = {'Freezing':0, 'Cold':1, 'Warm':2, 
                'Hot':3, 'Boiling Hot':4, 'Lava Hot':5}

    df['ord_1'] = df['ord_1'].map(ord1dict)
    df['ord_2'] = df['ord_2'].map(ord2dict)
    ord_345 = ['ord_3', 'ord_4', 'ord_5']
    ord_encoder = OrdinalEncoder()
    df[ord_345] = ord_encoder.fit_transform(df[ord_345])


    # create an instance of the encoder with categorical feature indices
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    categorical_cols = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'day', 'month']

    # fit and transform the encoded values to the DataFrame
    encoded_values = encoder.fit_transform(df[categorical_cols])
    feature_names = encoder.get_feature_names_out(input_features=categorical_cols)
    df_encoded = pd.DataFrame(encoded_values, columns=feature_names)
    df_encoded = df_encoded.set_index(df.index)
    # drop original categorical columns and join the encoded DataFrame to the original
    df = df.drop(categorical_cols, axis=1)
    df = pd.concat([df, df_encoded], axis=1)

    # # Norminal Encoding
    # nom_features = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']
    # onehot_encoder = OneHotEncoder()
    # encoded_nom_matrix = onehot_encoder.fit_transform(df[nom_features])
    # df = df.drop(nom_features, axis=1)

    # # Date Encoding
    # date_features  = ['day', 'month']
    # encoded_date_matrix = onehot_encoder.fit_transform(df[date_features])
    # df = df.drop(date_features, axis=1)
    
    # Ordinal features scaling
    ord_features = ['ord_' + str(i) for i in range(6)]
    df[ord_features] = MinMaxScaler().fit_transform(df[ord_features])

    hex_df = df.loc[:,"nom_5":"nom_9"]
    display(hex_df)
    hex_1 = lambda x: int(bin(int(x,16))[2:].zfill(36)[:9],2)
    hex_2 = lambda x: int(bin(int(x,16))[2:].zfill(36)[9:18],2)
    hex_3 = lambda x: int(bin(int(x,16))[2:].zfill(36)[18:27],2)
    hex_4 = lambda x: int(bin(int(x,16))[2:].zfill(36)[27:],2)
    new_ord_df = pd.DataFrame()
    for col in hex_df:
        new_ord_df['%s_1'%col] = hex_df[col].apply(hex_1)
        new_ord_df['%s_2'%col] = hex_df[col].apply(hex_2)
        new_ord_df['%s_3'%col] = hex_df[col].apply(hex_3)
        new_ord_df['%s_4'%col] = hex_df[col].apply(hex_4)
    df.drop(hex_df.columns,axis=1,inplace=True)
    new_ord_df = new_ord_df.set_index(hex_df.index)

    df = pd.concat([df, new_ord_df],axis=1)
    display(df)
    # display(encoded_nom_matrix)
    # display(encoded_date_matrix)
    # df = sparse.hstack([sparse.csr_matrix(df),
    #                                encoded_nom_matrix,
    #                                encoded_date_matrix],
    #                                format='csr')

    return df

In [7]:
X_train = train.drop('target', axis=1)
y_train = train['target']
X_train = feature_engineer(X_train)



Unnamed: 0_level_0,nom_5,nom_6,nom_7,nom_8,nom_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51
1,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21
2,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0
3,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71
4,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7
...,...,...,...,...,...
299995,35f65a9bf,788ba7aea,86a8e4ca0,7508f4ef1,e027decef
299996,472efea17,3b9693870,c4455f4a8,397dd0274,80f1411c8
299997,0dee9b39a,6046454de,ba9901303,5d7806f53,314dcc15b
299998,e1558b071,0000ee65f,c8ae4ea14,1f820c7ce,ab0ce192b


Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,nom_0_Blue,nom_0_Green,nom_0_Red,nom_1_Circle,nom_1_Polygon,nom_1_Square,nom_1_Star,nom_1_Trapezoid,nom_1_Triangle,nom_2_Axolotl,nom_2_Cat,nom_2_Dog,nom_2_Hamster,nom_2_Lion,...,month_8,month_9,month_10,month_11,month_12,nom_5_1,nom_5_2,nom_5_3,nom_5_4,nom_6_1,nom_6_2,nom_6_3,nom_6_4,nom_7_1,nom_7_2,nom_7_3,nom_7_4,nom_8_1,nom_8_2,nom_8_3,nom_8_4,nom_9_1,nom_9_2,nom_9_3,nom_9_4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
0,0,0,0,1,1,0.5,1.00,0.2,0.500000,0.12,0.712042,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,161,452,181,463,117,262,452,20,209,474,361,489,391,36,0,171,94,306,414,337
1,0,1,0,1,1,0.0,1.00,0.6,0.000000,0.00,0.486911,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,359,211,146,464,503,301,135,449,118,439,171,18,153,356,257,81,496,241,182,33
2,0,0,0,0,1,0.0,0.50,1.0,0.500000,0.68,0.162304,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,100,398,494,229,18,139,286,184,333,141,378,295,445,114,251,132,348,416,6,464
3,0,1,0,0,1,0.0,1.00,0.8,0.571429,0.12,0.701571,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,482,137,50,402,161,350,362,106,472,420,283,235,149,377,341,361,260,451,390,369
4,0,0,0,0,0,0.0,1.00,0.0,0.000000,0.68,0.827225,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,182,61,214,213,63,389,464,509,9,374,353,190,406,270,344,373,354,402,441,167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0,0,0,1,0,0.0,0.25,0.0,0.714286,0.40,0.565445,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,107,473,212,447,241,46,317,234,269,163,294,160,234,35,423,241,448,159,246,239
299996,0,0,0,0,1,0.5,0.00,0.0,0.500000,0.88,0.209424,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,142,187,501,23,119,90,156,112,392,277,250,168,114,503,129,116,257,453,8,456
299997,0,0,0,0,1,1.0,0.00,0.8,1.000000,0.00,0.036649,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,27,442,217,410,192,281,42,222,373,100,9,259,186,480,55,339,98,311,96,347
299998,0,1,0,0,1,0.0,0.75,0.8,0.500000,0.88,0.910995,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,450,342,88,113,0,3,371,95,401,185,117,20,63,8,99,462,342,51,268,299


In [8]:
X_train

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,nom_0_Blue,nom_0_Green,nom_0_Red,nom_1_Circle,nom_1_Polygon,nom_1_Square,nom_1_Star,nom_1_Trapezoid,nom_1_Triangle,nom_2_Axolotl,nom_2_Cat,nom_2_Dog,nom_2_Hamster,nom_2_Lion,...,month_8,month_9,month_10,month_11,month_12,nom_5_1,nom_5_2,nom_5_3,nom_5_4,nom_6_1,nom_6_2,nom_6_3,nom_6_4,nom_7_1,nom_7_2,nom_7_3,nom_7_4,nom_8_1,nom_8_2,nom_8_3,nom_8_4,nom_9_1,nom_9_2,nom_9_3,nom_9_4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
0,0,0,0,1,1,0.5,1.00,0.2,0.500000,0.12,0.712042,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,161,452,181,463,117,262,452,20,209,474,361,489,391,36,0,171,94,306,414,337
1,0,1,0,1,1,0.0,1.00,0.6,0.000000,0.00,0.486911,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,359,211,146,464,503,301,135,449,118,439,171,18,153,356,257,81,496,241,182,33
2,0,0,0,0,1,0.0,0.50,1.0,0.500000,0.68,0.162304,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,100,398,494,229,18,139,286,184,333,141,378,295,445,114,251,132,348,416,6,464
3,0,1,0,0,1,0.0,1.00,0.8,0.571429,0.12,0.701571,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,482,137,50,402,161,350,362,106,472,420,283,235,149,377,341,361,260,451,390,369
4,0,0,0,0,0,0.0,1.00,0.0,0.000000,0.68,0.827225,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,182,61,214,213,63,389,464,509,9,374,353,190,406,270,344,373,354,402,441,167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0,0,0,1,0,0.0,0.25,0.0,0.714286,0.40,0.565445,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,107,473,212,447,241,46,317,234,269,163,294,160,234,35,423,241,448,159,246,239
299996,0,0,0,0,1,0.5,0.00,0.0,0.500000,0.88,0.209424,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,142,187,501,23,119,90,156,112,392,277,250,168,114,503,129,116,257,453,8,456
299997,0,0,0,0,1,1.0,0.00,0.8,1.000000,0.00,0.036649,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,27,442,217,410,192,281,42,222,373,100,9,259,186,480,55,339,98,311,96,347
299998,0,1,0,0,1,0.0,0.75,0.8,0.500000,0.88,0.910995,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,450,342,88,113,0,3,371,95,401,185,117,20,63,8,99,462,342,51,268,299


There are several resampling techniques for imbalanced data, including:

1. Random Under-Sampling: This involves randomly removing samples from the majority class to balance the dataset. The main advantage is that it can be fast, but the downside is that it can lead to a loss of information.

2. Random Over-Sampling: This involves randomly duplicating samples from the minority class to balance the dataset. The main advantage is that it can be fast, but the downside is that it can lead to overfitting.

3. Synthetic Minority Over-Sampling Technique (SMOTE): This involves generating new synthetic samples from the minority class to balance the dataset. The main advantage is that it preserves information from the minority class, but the downside is that it can lead to overfitting.

4. Adaptive Synthetic Sampling (ADASYN): This is an extension of SMOTE that generates more synthetic samples for harder-to-learn minority class examples. The main advantage is that it can handle more complex datasets, but the downside is that it can be slower.

5. Tomek Links: This is a method that identifies pairs of nearest neighbors from different classes and removes the majority class samples. The main advantage is that it can improve the decision boundary, but the downside is that it can lead to a loss of information.

6. Edited Nearest Neighbors (ENN): This is a method that removes examples from the majority class that are misclassified by their nearest neighbors from the minority class. The main advantage is that it can improve the decision boundary, but the downside is that it can lead to a loss of information.

7. Combination of Over-Sampling and Under-Sampling (SMOTEENN and SMOTETomek): These are hybrid methods that combine SMOTE with either ENN or Tomek Links to remove noise and balance the dataset.

The choice of resampling technique depends on the specific dataset and the machine learning algorithm being used. Random Under-Sampling and Random Over-Sampling can be useful when there is a large amount of data, while SMOTE and ADASYN can be more effective when there is a smaller amount of data. Tomek Links and ENN can be useful when there is a clear boundary between classes, while combination methods can be useful when there is noise in the dataset. It is important to evaluate the performance of the model using different resampling techniques and choose the one that results in the best performance.

Undersampling techniques can be useful to remove some of the majority class samples, making the class distribution more balanced. However, it may also lead to loss of useful information, especially if the dataset is already limited.

Oversampling techniques can be used to increase the number of minority class samples, making the class distribution more balanced. However, if not done carefully, it can also lead to overfitting and poor generalization performance.

A combination of these techniques can be a good approach to balance the class distribution and preserve important information. For example, you can try undersampling the majority class and then oversampling the minority class.

In general, the choice of sampling technique depends on the specific characteristics of the dataset and the problem at hand. It is important to carefully evaluate the performance of different sampling techniques and choose the one that works best for your problem.

There are several techniques that are recommended to use together to handle imbalanced data:

1. Stratified sampling with resampling: This technique combines the benefits of stratification and resampling to create a more balanced dataset for training. The stratified sampling ensures that the class distribution is preserved in the subsample, and the resampling method creates more balanced classes by either oversampling the minority class or undersampling the majority class.

2. Ensemble methods with resampling: Ensemble methods such as Random Forest and XGBoost can be combined with resampling techniques to create more robust and accurate models. By resampling the data and training multiple models on the different subsamples, the ensemble can take advantage of the strengths of each model while reducing the risk of overfitting to the minority class.

3. Feature selection with resampling: Feature selection can help to identify the most important features for predicting the minority class. By combining feature selection with resampling techniques, it is possible to create more accurate models with fewer features. This can improve the interpretability of the model while reducing the risk of overfitting to the minority class.

4. Model tuning with resampling: Model tuning can help to identify the optimal hyperparameters for a given model. By combining model tuning with resampling techniques, it is possible to create more accurate models that are better tuned to the minority class.

Overall, there is no one-size-fits-all solution for handling imbalanced data. It is important to experiment with different techniques and combinations of techniques to find the approach that works best for a particular dataset and problem.

Some examples : 
1. SMOTE with stratified k-fold cross-validation: SMOTE can be used to generate synthetic samples of minority classes, while stratified k-fold cross-validation can be used to ensure that the same class distribution is maintained across different folds.

2. Random undersampling with ensemble methods: Random undersampling can be used to reduce the number of majority class samples, while ensemble methods like bagging and boosting can be used to create multiple models on different subsamples of the training data.

3. ADASYN with cost-sensitive learning: ADASYN can be used to generate synthetic samples of minority classes, while cost-sensitive learning can be used to assign different costs to different types of classification errors, based on the relative importance of each class.

It is important to note that the choice of techniques and their combination depends on the specific problem and data at hand. It is recommended to try different techniques and combinations and evaluate their performance to determine the most effective approach.

In [9]:
# from imblearn.over_sampling import SMOTE

# smote = SMOTE(random_state=42)
# X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [10]:
# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import RandomUnderSampler

# # Perform undersampling on the majority class
# under_sampler = RandomUnderSampler(random_state=42)
# X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)

# # Use SMOTE on the resulting data
# smote = SMOTE(random_state=42)
# X_train_res, y_train_res = smote.fit_resample(X_train_under, y_train_under)

### Using stratified sampling technique in combination with other sampling techniques

Using stratified sampling technique in combination with other sampling techniques can be a good approach for imbalanced classification problems.

Stratified sampling helps to ensure that the class distribution in the sample is representative of the class distribution in the population, which is particularly important for imbalanced datasets. By using stratified sampling, you can ensure that the rare class (in your case, target 1) is represented in the training data.

In addition to stratified sampling, you can also consider using other sampling techniques such as oversampling or undersampling to further balance the class distribution. For example, you could oversample the minority class (target 1) using techniques such as SMOTE or ADASYN, or undersample the majority class (target 0) using techniques such as random undersampling or Tomek links.

Ultimately, the choice of sampling technique will depend on the specifics of your dataset and the performance of the different techniques on your problem. It's a good idea to experiment with different techniques and evaluate their performance using appropriate metrics such as precision, recall, and F1-score.

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                      test_size=0.1,
                                                      stratify=y_train,
                                                      random_state=10)

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression()

lr_params = {'C':[0.1, 0.125, 0.2], 'max_iter':[800, 900, 1000], 
             'solver':['liblinear'], 'random_state':[42]}

gridsearch_logistic_model = GridSearchCV(estimator=logistic_model,
                                         param_grid=lr_params,
                                         scoring='roc_auc',
                                         cv=5)
gridsearch_logistic_model.fit(X_train, y_train)

print('Optimal parameters:', gridsearch_logistic_model.best_params_)

Optimal parameters: {'C': 0.125, 'max_iter': 800, 'random_state': 42, 'solver': 'liblinear'}


In [13]:
y_valid_preds = gridsearch_logistic_model.predict_proba(X_valid)[:, 1]

In [14]:
from sklearn.metrics import roc_auc_score

roc_auc = roc_auc_score(y_valid, y_valid_preds)

print(f'Validation data ROC AUC : {roc_auc:.4f}')

Validation data ROC AUC : 0.7703


In [15]:
test

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
300000,0,0,1,T,Y,Blue,Triangle,Axolotl,Finland,Piano,0870b0a5d,9ceb19dd6,530f8ecc3,9d117320c,3c49b42b8,2,Novice,Warm,j,P,be,5,11
300001,0,0,0,T,N,Red,Square,Lion,Canada,Piano,a5c276589,1ad744242,12e6161c9,46ae3059c,285771075,1,Master,Lava Hot,l,A,RP,7,5
300002,1,0,1,F,Y,Blue,Square,Dog,China,Piano,568550f04,1fe17a1fd,27d6df03f,b759e21f0,6f323c53f,2,Expert,Freezing,a,G,tP,1,12
300003,0,0,1,T,Y,Red,Star,Cat,China,Piano,c5725677e,a6542cec0,30c63bd0c,0b6ec68ff,b5de3dcc4,1,Contributor,Lava Hot,b,Q,ke,2,3
300004,0,1,1,F,N,Red,Trapezoid,Dog,China,Piano,e70a6270d,97b6a3518,a42386065,f91f3b1ee,967cfa9c9,3,Grandmaster,Lava Hot,l,W,qK,4,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,0,0,0,F,N,Green,Square,Lion,Canada,Theremin,6c90f81cd,a406a5f12,d02a6b0ba,9e4b23160,acc31291f,1,Novice,Lava Hot,j,A,Gb,1,3
499996,1,0,0,F,Y,Green,Trapezoid,Lion,China,Piano,2d61990e2,520806ce2,d2d8eabdb,cfbd87ed0,eae3446d0,1,Contributor,Lava Hot,f,S,Ed,2,2
499997,0,1,1,T,Y,Green,Trapezoid,Lion,Canada,Oboe,488406659,28645754b,22831fffe,1108bcd6c,33dd3cf4b,1,Novice,Boiling Hot,g,V,TR,3,1
499998,1,0,0,T,Y,Blue,Star,Hamster,Costa Rica,Bassoon,f9d17bb93,2eadb68c5,fef807a3e,606ac930b,d4cf587dd,2,Grandmaster,Boiling Hot,g,X,Ye,2,1


In [16]:
X_test = feature_engineer(test)



Unnamed: 0_level_0,nom_5,nom_6,nom_7,nom_8,nom_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
300000,0870b0a5d,9ceb19dd6,530f8ecc3,9d117320c,3c49b42b8
300001,a5c276589,1ad744242,12e6161c9,46ae3059c,285771075
300002,568550f04,1fe17a1fd,27d6df03f,b759e21f0,6f323c53f
300003,c5725677e,a6542cec0,30c63bd0c,0b6ec68ff,b5de3dcc4
300004,e70a6270d,97b6a3518,a42386065,f91f3b1ee,967cfa9c9
...,...,...,...,...,...
499995,6c90f81cd,a406a5f12,d02a6b0ba,9e4b23160,acc31291f
499996,2d61990e2,520806ce2,d2d8eabdb,cfbd87ed0,eae3446d0
499997,488406659,28645754b,22831fffe,1108bcd6c,33dd3cf4b
499998,f9d17bb93,2eadb68c5,fef807a3e,606ac930b,d4cf587dd


Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,nom_0_Blue,nom_0_Green,nom_0_Red,nom_1_Circle,nom_1_Polygon,nom_1_Square,nom_1_Star,nom_1_Trapezoid,nom_1_Triangle,nom_2_Axolotl,nom_2_Cat,nom_2_Dog,nom_2_Hamster,nom_2_Lion,...,month_8,month_9,month_10,month_11,month_12,nom_5_1,nom_5_2,nom_5_3,nom_5_4,nom_6_1,nom_6_2,nom_6_3,nom_6_4,nom_7_1,nom_7_2,nom_7_3,nom_7_4,nom_8_1,nom_8_2,nom_8_3,nom_8_4,nom_9_1,nom_9_2,nom_9_3,nom_9_4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
300000,0,0,1,1,1,0.5,0.00,0.4,0.642857,0.60,0.497382,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,16,450,389,93,313,428,206,470,166,62,118,195,314,69,409,12,120,294,417,184
300001,0,0,0,1,0,0.0,0.75,1.0,0.785714,0.00,0.319372,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,331,265,434,393,53,349,33,66,37,408,176,457,141,184,386,412,80,349,392,117
300002,1,0,1,0,1,0.5,0.50,0.0,0.000000,0.24,0.900524,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,173,21,135,260,63,389,464,509,79,347,248,63,366,359,272,496,222,200,482,319
300003,0,0,1,1,1,0.0,0.25,1.0,0.071429,0.64,0.706806,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,394,457,179,382,332,336,359,192,97,280,478,268,22,443,52,255,363,376,494,196
300004,0,1,1,0,0,1.0,1.00,1.0,0.785714,0.88,0.821990,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,462,41,275,269,303,218,282,280,328,142,48,101,498,124,472,494,300,499,468,457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,0,0,0,0,0,0.0,0.00,1.0,0.642857,0.00,0.130890,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,217,67,448,461,328,26,303,274,416,169,344,186,316,300,280,352,345,268,148,287
499996,1,0,0,0,1,0.0,0.25,1.0,0.357143,0.72,0.089005,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,90,390,200,226,164,32,54,226,421,355,341,475,415,246,63,208,469,397,35,208
499997,0,1,1,1,1,0.0,0.00,0.8,0.428571,0.84,0.345550,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,145,16,51,89,80,401,186,331,69,12,255,510,34,34,486,364,103,372,487,331
499998,1,0,0,1,1,0.5,1.00,0.8,0.428571,0.92,0.429319,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,499,325,477,403,93,182,436,197,509,480,61,62,192,427,73,267,425,317,195,477


In [17]:
X_test

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,nom_0_Blue,nom_0_Green,nom_0_Red,nom_1_Circle,nom_1_Polygon,nom_1_Square,nom_1_Star,nom_1_Trapezoid,nom_1_Triangle,nom_2_Axolotl,nom_2_Cat,nom_2_Dog,nom_2_Hamster,nom_2_Lion,...,month_8,month_9,month_10,month_11,month_12,nom_5_1,nom_5_2,nom_5_3,nom_5_4,nom_6_1,nom_6_2,nom_6_3,nom_6_4,nom_7_1,nom_7_2,nom_7_3,nom_7_4,nom_8_1,nom_8_2,nom_8_3,nom_8_4,nom_9_1,nom_9_2,nom_9_3,nom_9_4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
300000,0,0,1,1,1,0.5,0.00,0.4,0.642857,0.60,0.497382,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,16,450,389,93,313,428,206,470,166,62,118,195,314,69,409,12,120,294,417,184
300001,0,0,0,1,0,0.0,0.75,1.0,0.785714,0.00,0.319372,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,331,265,434,393,53,349,33,66,37,408,176,457,141,184,386,412,80,349,392,117
300002,1,0,1,0,1,0.5,0.50,0.0,0.000000,0.24,0.900524,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,173,21,135,260,63,389,464,509,79,347,248,63,366,359,272,496,222,200,482,319
300003,0,0,1,1,1,0.0,0.25,1.0,0.071429,0.64,0.706806,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,394,457,179,382,332,336,359,192,97,280,478,268,22,443,52,255,363,376,494,196
300004,0,1,1,0,0,1.0,1.00,1.0,0.785714,0.88,0.821990,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,462,41,275,269,303,218,282,280,328,142,48,101,498,124,472,494,300,499,468,457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,0,0,0,0,0,0.0,0.00,1.0,0.642857,0.00,0.130890,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,217,67,448,461,328,26,303,274,416,169,344,186,316,300,280,352,345,268,148,287
499996,1,0,0,0,1,0.0,0.25,1.0,0.357143,0.72,0.089005,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,90,390,200,226,164,32,54,226,421,355,341,475,415,246,63,208,469,397,35,208
499997,0,1,1,1,1,0.0,0.00,0.8,0.428571,0.84,0.345550,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,145,16,51,89,80,401,186,331,69,12,255,510,34,34,486,364,103,372,487,331
499998,1,0,0,1,1,0.5,1.00,0.8,0.428571,0.92,0.429319,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,499,325,477,403,93,182,436,197,509,480,61,62,192,427,73,267,425,317,195,477


In [18]:
y_preds = gridsearch_logistic_model.best_estimator_.predict_proba(X_test)[:,1]

submission['target'] = y_preds
submission.to_csv('submission.csv')

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

# define the classifier
rf = RandomForestClassifier()

# define the parameter distribution to sample from
param_dist = {
    'n_estimators': sp_randint(50, 200),
    'max_depth': sp_randint(3, 20),
    'min_samples_split': sp_randint(2, 20),
    'min_samples_leaf': sp_randint(1, 10),
    'max_features': ['sqrt', 'log2', None]
}

# run the randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(
    rf, param_distributions=param_dist, n_iter=n_iter_search, cv=5)

# fit the model on the training data
random_search.fit(X_train, y_train)

In [20]:
y_valid_preds = random_search.predict_proba(X_valid)[:, 1]

In [21]:
from sklearn.metrics import roc_auc_score

roc_auc = roc_auc_score(y_valid, y_valid_preds)

print(f'Validation data ROC AUC : {roc_auc:.4f}')

Validation data ROC AUC : 0.7385


In [22]:
# make predictions on the test data
y_pred = random_search.predict(X_test)