# Kaggle - Categorical Feature Encoding Challenge - Baseline
**Author: Chris Shin**

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Read the three competition files in one pass; every CSV uses its `id`
# column as the index.
train, test, submission = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('./data/train.csv',
                     './data/test.csv',
                     './data/sample_submission.csv')
)

### Feature engineering with train/test combined or separately?

It is generally not recommended to perform feature engineering on the combined train and test data. This is because doing so can lead to data leakage, where the model inadvertently learns patterns or relationships between the features and the target variable that it should not have access to during training.

Instead, it is recommended to perform feature engineering separately on the train and test data sets. This ensures that the model only learns patterns from the training data, and that the test data remains truly unseen until model evaluation.

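However, it is important to keep the feature engineering process consistent across both the train and test data sets, to ensure that the model can generalize well to new, unseen data. In practice this means fitting every encoder and scaler on the training data only and then applying those same fitted transformers to the test data, so that a given category or value is always mapped to the same feature.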

To ensure that train and test have the same columns after feature engineering, you can follow these steps:

1. Perform all feature engineering steps on the train and test sets separately.
2. Identify the columns that were created in the train set after feature engineering.
3. Check if these columns exist in the test set after feature engineering. If a column does not exist in the test set, create that column in the test set with all zeros or some default value.
4. Repeat steps 2-3 for any new columns that were created in the test set after feature engineering.
5. Finally, reorder the columns in the test set to match the order of the columns in the train set.

### Feature Engineering

In [3]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy import sparse

def feature_engineer(data, encoders=None):
    """Encode the raw categorical columns and return one sparse feature matrix.

    Steps, in order:

    * ``bin_3`` / ``bin_4`` letter flags are mapped to 0/1.
    * ``ord_1`` / ``ord_2`` are mapped through hand-written rank tables;
      ``ord_3``..``ord_5`` go through an ``OrdinalEncoder``.
    * ``nom_0``..``nom_5`` are one-hot encoded. All of ``nom_0``..``nom_9``
      are then dropped from the dense part, so ``nom_6``..``nom_9`` do not
      contribute any feature.
    * ``day`` / ``month`` are one-hot encoded and dropped from the dense part.
    * ``ord_0``..``ord_5`` are rescaled with a ``MinMaxScaler``.
    * The remaining dense columns and the two one-hot matrices are stacked
      horizontally, in that order.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns named above. It is copied, not modified.
    encoders : dict, optional
        Store for the fitted transformers, so that several frames can share
        one encoding (same category-to-column mapping, same scaling range).

        * ``None`` (default): new transformers are fitted on ``data`` and
          discarded afterwards.
        * empty ``dict``: new transformers are fitted on ``data`` and saved
          into the dict under the keys ``'ordinal'``, ``'nominal'``,
          ``'date'`` and ``'scaler'``.
        * ``dict`` previously filled by this function: the stored
          transformers are applied with ``transform`` only; nothing is
          re-fitted.

        Typical use::

            encoders = {}
            X_train = feature_engineer(train_features, encoders)  # fit
            X_test = feature_engineer(test, encoders)             # reuse

    Returns
    -------
    scipy.sparse.csr_matrix
        Dense columns followed by the nominal and date one-hot columns.

    Notes
    -----
    Labels absent from the hand-written mapping dicts become NaN
    (``Series.map`` behaviour). When stored transformers are reused, a
    category unseen at fit time yields an all-zero one-hot row, while the
    ``OrdinalEncoder`` keeps its default behaviour of raising on unseen
    categories.
    """
    df = data.copy()

    # None or an empty dict both mean "fit new transformers on this frame".
    fitting = not encoders
    store = {} if encoders is None else encoders

    def encode(key, make_transformer, frame):
        """Fit-and-transform `frame` when fitting, else reuse store[key]."""
        if fitting:
            store[key] = make_transformer()
            return store[key].fit_transform(frame)
        return store[key].transform(frame)

    def make_onehot():
        # 'ignore' only matters on the transform-only path: unseen categories
        # are encoded as all zeros instead of raising.
        return OneHotEncoder(handle_unknown='ignore')

    # Binary encoding
    df['bin_3'] = df['bin_3'].map({'F': 0, 'T': 1})
    df['bin_4'] = df['bin_4'].map({'N': 0, 'Y': 1})

    # Ordinal encoding: explicit rank tables where the order is spelled out,
    # OrdinalEncoder (sorted category order) for the remaining ordinal columns.
    ord1dict = {'Novice': 0, 'Contributor': 1,
                'Expert': 2, 'Master': 3, 'Grandmaster': 4}
    ord2dict = {'Freezing': 0, 'Cold': 1, 'Warm': 2,
                'Hot': 3, 'Boiling Hot': 4, 'Lava Hot': 5}

    df['ord_1'] = df['ord_1'].map(ord1dict)
    df['ord_2'] = df['ord_2'].map(ord2dict)
    ord_345 = ['ord_3', 'ord_4', 'ord_5']
    df[ord_345] = encode('ordinal', OrdinalEncoder, df[ord_345])

    # Nominal encoding: only nom_0..nom_5 are one-hot encoded, but all ten
    # nom_* columns are removed from the dense frame.
    nom_features = ['nom_' + str(i) for i in range(6)]
    encoded_nom_matrix = encode('nominal', make_onehot, df[nom_features])
    df = df.drop(['nom_' + str(i) for i in range(10)], axis=1)

    # Date encoding
    date_features = ['day', 'month']
    encoded_date_matrix = encode('date', make_onehot, df[date_features])
    df = df.drop(date_features, axis=1)

    # Ordinal features scaling
    ord_features = ['ord_' + str(i) for i in range(6)]
    df[ord_features] = encode('scaler', MinMaxScaler, df[ord_features])

    # Dense remainder first, then the nominal and date one-hot blocks.
    return sparse.hstack([sparse.csr_matrix(df),
                          encoded_nom_matrix,
                          encoded_date_matrix],
                         format='csr')

In [4]:
# Pull the label out first, then encode everything except the label.
y_train = train['target']
X_train = feature_engineer(train.drop(columns='target'))

In [5]:
# Last expression of the cell: displays the repr of the encoded training matrix.
X_train

<300000x277 sparse matrix of type '<class 'numpy.float64'>'
	with 4297975 stored elements in Compressed Sparse Row format>

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation, stratified on the label, with a
# fixed seed so the split is repeatable.
# NOTE(review): this rebinds X_train / y_train to the reduced training part,
# so running the cell a second time splits the already-split data again.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train,
    random_state=10,
    stratify=y_train,
    test_size=0.1,
)

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Candidate hyper-parameters; solver and random_state each have a single
# value, so only C and max_iter are actually searched.
lr_params = {
    'C': [0.1, 0.125, 0.2],
    'max_iter': [800, 900, 1000],
    'solver': ['liblinear'],
    'random_state': [42],
}

logistic_model = LogisticRegression()

# Exhaustive search over lr_params, scored by ROC AUC with cv=5.
gridsearch_logistic_model = GridSearchCV(
    logistic_model,
    lr_params,
    scoring='roc_auc',
    cv=5,
)
gridsearch_logistic_model.fit(X_train, y_train)

print('Optimal parameters:', gridsearch_logistic_model.best_params_)

Optimal parameters: {'C': 0.125, 'max_iter': 800, 'random_state': 42, 'solver': 'liblinear'}


In [8]:
# Keep only column 1 of the predicted probabilities for the held-out rows
# (presumably the probability of target == 1 -- confirm against classes_).
y_valid_preds = gridsearch_logistic_model.predict_proba(X_valid)[:, 1]

In [9]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the predicted probabilities against the held-out labels.
roc_auc = roc_auc_score(y_true=y_valid, y_score=y_valid_preds)
print(f'Validation data ROC AUC : {roc_auc:.4f}')

Validation data ROC AUC : 0.7795


In [10]:
# NOTE(review): feature_engineer fits its encoders and scaler on whatever
# frame it receives, so this call encodes the test set independently of the
# training set; the column layout only lines up if both frames contain the
# same categories -- verify before trusting the predictions.
X_test = feature_engineer(test)

In [11]:
# Last expression of the cell: displays the repr of the encoded test matrix
# (its column count should match the training matrix).
X_test

<200000x277 sparse matrix of type '<class 'numpy.float64'>'
	with 2865743 stored elements in Compressed Sparse Row format>

In [12]:
# Column 1 of the refit best estimator's probabilities for the test rows.
y_preds = gridsearch_logistic_model.best_estimator_.predict_proba(X_test)[:, 1]

# Values are assigned positionally, so this assumes the rows of `test` and of
# the sample submission are in the same order -- TODO confirm.
submission['target'] = y_preds
submission.to_csv(path_or_buf='submission.csv')