In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [81]:
# Read the three competition files (train, test, submission) from the data
# folder, in that order.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'AnalyticsOlympiad2022Data/train.csv',
        'AnalyticsOlympiad2022Data/test.csv',
        'AnalyticsOlympiad2022Data/submission.csv',
    )
)

## 1. Label encoding for all categorical columns:

(The label encoder is applied to the same column in both the train and the test dataframe.)

In [None]:
# Encode labels in column.
def encode_fn(col):
    """Label-encode ``col`` in place in both ``train_df`` and ``test_df``.

    The shared ``label_encoder`` is fitted once on the values of ``col`` taken
    from both frames, and that single mapping is then used to transform each
    frame. Fitting the encoder separately per frame would give the same
    category different integer codes whenever the two frames do not contain
    exactly the same set of labels, making the encoded columns incomparable.

    Parameters
    ----------
    col : str
        Name of a column that exists in both module-level frames.

    Returns
    -------
    None
        ``train_df[col]`` and ``test_df[col]`` are overwritten with the codes.
    """
    # One fit over the union of labels => one consistent label -> code mapping.
    label_encoder.fit(pd.concat([train_df[col], test_df[col]], ignore_index=True))
    train_df[col] = label_encoder.transform(train_df[col])
    test_df[col] = label_encoder.transform(test_df[col])

In [None]:
## Label encode all the categorical columns
categorical_cols = (
    'AGE',
    'GENDER',
    'DRIVING_EXPERIENCE',
    'EDUCATION',
    'INCOME',
    'VEHICLE_YEAR',
    'TYPE_OF_VEHICLE',
)
# Same calls as before, in the same order, driven by the list above.
for col_name in categorical_cols:
    encode_fn(col_name)


## 2. Covariate shift evaluation


In [82]:
# Tag every row with its origin: 0 for rows of the test frame, 1 for rows of
# the train frame. This flag is the label the shift classifier will predict.
for frame, origin_flag in ((test_df, 0), (train_df, 1)):
    frame['is_train'] = origin_flag

In [121]:
# Stack train on top of test, keeping only the feature under inspection and
# the origin flag. No target column is selected here, so there is nothing
# further to drop before modelling.
shift_cols = ['TYPE_OF_VEHICLE', 'is_train']
df_combine = pd.concat(
    [train_df[shift_cols], test_df[shift_cols]],
    axis=0,
    ignore_index=True,
)

# Labels for the classifier: the origin flag of each stacked row.
y = df_combine['is_train'].values
# Covariates: everything that remains once the origin flag is removed.
x = df_combine.drop('is_train', axis=1).values

# Raw values of the same feature, per split.
trn = train_df['TYPE_OF_VEHICLE'].values
tst = test_df['TYPE_OF_VEHICLE'].values

In [122]:
# Shallow random forest used to discriminate train rows from test rows.
# random_state is pinned so that re-running the notebook reproduces the same
# AUC; without it every run trains a differently-seeded forest.
m = RandomForestClassifier(
    n_jobs=-1,
    max_depth=5,
    min_samples_leaf=5,
    random_state=100,
)
# One slot per stacked row; filled with out-of-fold probabilities later.
predictions = np.zeros(y.shape)

In [123]:
# 20-fold stratified CV: each row gets a prediction from a model that did not
# see it during fitting.
skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=100)
for fold, (train_idx, test_idx) in enumerate(skf.split(x, y)):
    X_train, y_train = x[train_idx], y[train_idx]
    X_test, y_test = x[test_idx], y[test_idx]
    m.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of class 1 (is_train == 1).
    predictions[test_idx] = m.predict_proba(X_test)[:, 1]

In [124]:
# AUC of the train-vs-test classifier over all out-of-fold predictions.
shift_auc = roc_auc_score(y, predictions)
print('ROC-AUC for train and test distributions:', shift_auc)

ROC-AUC for train and test distributions: 0.4973115343915344


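#### Inference for covariate shift:
The train-vs-test classifier reaches an AUC of about 0.5 on every column, i.e. it cannot tell the two sets apart, so there is no covariate shift between train and test data in any of the columns: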
    1. DUIS (AUC value: 0.50)
    2. PAST_ACCIDENTS (0.4986)
    3. SPEEDING_VIOLATIONS (0.498)
    4. ANNUAL_MILEAGE (0.4978)
    5. MARRIED (0.49775)
    6. CHILDREN (0.49945)
    7. VEHICLE_OWNERSHIP (0.495)
    8. CREDIT_SCORE (0.50)
    9. ID (0.497795)
    10. POSTAL_CODE (0.495597)
    11. AGE (0.4989)
    12. GENDER (0.4974)
    13. DRIVING_EXPERIENCE (0.499867)
    14. EDUCATION (0.495)
    15. INCOME (0.49577)
    16. VEHICLE_YEAR (0.492)
    17. TYPE_OF_VEHICLE (0.4973)

### However, we still need to verify that every categorical column contains the same set of categories in both datasets

## 3. Verify category types for all categorical columns in both datasets

In [126]:
# Read the three CSV files again, rebinding train_df / test_df / submission_df
# to freshly loaded frames (presumably to discard the in-place changes made
# in earlier cells -- confirm).
train_df, test_df, submission_df = map(
    pd.read_csv,
    [
        'AnalyticsOlympiad2022Data/train.csv',
        'AnalyticsOlympiad2022Data/test.csv',
        'AnalyticsOlympiad2022Data/submission.csv',
    ],
)

In [140]:
# Row count per vehicle type in the train frame; dropna=False makes missing
# values show up as their own entry instead of being silently excluded.
train_df['TYPE_OF_VEHICLE'].value_counts(dropna=False)

Sports Car    34592
Sedan         28120
HatchBack     24900
SUV           17388
Name: TYPE_OF_VEHICLE, dtype: int64

In [139]:
# Row count per vehicle type in the test frame; dropna=False makes missing
# values show up as their own entry instead of being silently excluded.
test_df['TYPE_OF_VEHICLE'].value_counts(dropna=False)

Sports Car    14811
Sedan         11968
HatchBack     10720
SUV            7501
Name: TYPE_OF_VEHICLE, dtype: int64