In [2]:
import pandas as pd
import numpy as np

In [3]:
train_df = pd.read_csv('AnalyticsOlympiad2022Data/train.csv')
test_df = pd.read_csv('AnalyticsOlympiad2022Data/test.csv')
submission_df = pd.read_csv('AnalyticsOlympiad2022Data/submission.csv')

## SECTION 1: DATA UNDERSTANDING

In [4]:
# Display top 5 records 
train_df.head()

Unnamed: 0,ID,AGE,GENDER,DRIVING_EXPERIENCE,EDUCATION,INCOME,CREDIT_SCORE,VEHICLE_OWNERSHIP,VEHICLE_YEAR,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,OUTCOME,TYPE_OF_VEHICLE
0,816393,40-64,female,20-29y,university,middle class,0.63805,0.0,after 2015,0.0,0.0,37379,11000.0,0,0,0,0.0,Sports Car
1,251762,26-39,male,20-29y,high school,middle class,0.475741,1.0,before 2015,1.0,0.0,10238,9000.0,0,0,0,1.0,HatchBack
2,481952,40-64,male,20-29y,none,middle class,0.839817,1.0,before 2015,1.0,1.0,10238,12000.0,0,0,0,1.0,Sedan
3,3506,40-64,male,20-29y,high school,upper class,0.682527,1.0,before 2015,0.0,1.0,92099,6000.0,1,0,0,1.0,Sedan
4,498013,40-64,female,20-29y,none,working class,0.572184,1.0,after 2015,1.0,1.0,32122,15000.0,0,0,1,0.0,Sedan


In [5]:
# Check the dataframe rows & columns
print("Train", train_df.shape)
print("Test", test_df.shape)

Train (105000, 18)
Test (45000, 17)


There are 18 columns and approximately 1 lakh records

In [6]:
# Check for datatypes and nulls
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105000 entries, 0 to 104999
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   105000 non-null  int64  
 1   AGE                  105000 non-null  object 
 2   GENDER               105000 non-null  object 
 3   DRIVING_EXPERIENCE   105000 non-null  object 
 4   EDUCATION            105000 non-null  object 
 5   INCOME               105000 non-null  object 
 6   CREDIT_SCORE         105000 non-null  float64
 7   VEHICLE_OWNERSHIP    105000 non-null  float64
 8   VEHICLE_YEAR         105000 non-null  object 
 9   MARRIED              105000 non-null  float64
 10  CHILDREN             105000 non-null  float64
 11  POSTAL_CODE          105000 non-null  int64  
 12  ANNUAL_MILEAGE       105000 non-null  float64
 13  SPEEDING_VIOLATIONS  105000 non-null  int64  
 14  DUIS                 105000 non-null  int64  
 15  PAST_ACCIDENTS   

In [7]:
# 1. Check for duplicates in ID column
# 2. Check for age group, disguised nulls, outliers, skewness
# 3. Check for disguised nulls in Gender column, skewness, perform dummy variable creation for this column
# 4. Driving_experience: Check for disguised nulls, is this numerical categorical variable? How to handle this column? Also, skew
# 5. education: Check for disguised nulls, label encode this columns (ordinal categorical), outliers and skew check
# 6. Credit score: Check for disguised nulls, outliers and skew check
# 7. Vehicle_ownership: convert to int , check for disguised nulls, outliers and skew check
# 8. vehicle_year: how to handle vehicle year , check for disguised nulls, outliers and skew check
# 9. Married: convert to int, check for disguised nulls, outliers and skew check
# 10. children: convert to int, outliers and skew check
# 11. postal_code: check and map the postal codes (5 digit postal code)
# 12. annual_mileage: can we bin this variable, convert to int
# 13. speeding_violations: Should we check for skewness and outliers, check for disguised records
# 14. duis: what is the meaning of this?
# 15. past_accidents:
# 16. outcome: target variable, check for class imbalance
# 17. type_of_vehicle: any feature that can be derived? 


In [8]:
# Check if train_df and test_df has same number of columns
train_cols = list(train_df.columns)
test_cols = list(test_df.columns)
train_cols.remove('OUTCOME')
sorted(train_cols) == sorted(test_cols)

True

In [9]:
# Check the descriptive statistics
train_df.describe()

Unnamed: 0,ID,CREDIT_SCORE,VEHICLE_OWNERSHIP,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,OUTCOME
count,105000.0,105000.0,105000.0,105000.0,105000.0,105000.0,105000.0,105000.0,105000.0,105000.0,105000.0
mean,394931.295905,0.602173,0.827038,0.584143,0.52,18045.439267,11061.228571,0.675676,0.129733,0.549314,0.422648
std,279694.106053,0.138045,0.378216,0.492871,0.499602,16709.040449,2972.355482,1.383678,0.589714,1.402809,0.493983
min,101.0,0.06688,0.0,0.0,0.0,10238.0,2000.0,0.0,0.0,0.0,0.0
25%,156351.75,0.514876,1.0,0.0,0.0,10238.0,9000.0,0.0,0.0,0.0,0.0
50%,354679.5,0.601112,1.0,1.0,1.0,10238.0,11000.0,0.0,0.0,0.0,0.0
75%,598602.25,0.703216,1.0,1.0,1.0,22957.75,13000.0,1.0,0.0,0.0,1.0
max,999976.0,0.954075,1.0,1.0,1.0,92101.0,21000.0,20.0,6.0,15.0,1.0


## SECTION 2: DATA UNDERSTANDING

In [10]:
# check for duplicates
dup_rows = train_df[train_df.duplicated(subset=['ID'],keep='last')]
dup_rows.shape


(6515, 18)

In [11]:
train_df.head()

Unnamed: 0,ID,AGE,GENDER,DRIVING_EXPERIENCE,EDUCATION,INCOME,CREDIT_SCORE,VEHICLE_OWNERSHIP,VEHICLE_YEAR,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,OUTCOME,TYPE_OF_VEHICLE
0,816393,40-64,female,20-29y,university,middle class,0.63805,0.0,after 2015,0.0,0.0,37379,11000.0,0,0,0,0.0,Sports Car
1,251762,26-39,male,20-29y,high school,middle class,0.475741,1.0,before 2015,1.0,0.0,10238,9000.0,0,0,0,1.0,HatchBack
2,481952,40-64,male,20-29y,none,middle class,0.839817,1.0,before 2015,1.0,1.0,10238,12000.0,0,0,0,1.0,Sedan
3,3506,40-64,male,20-29y,high school,upper class,0.682527,1.0,before 2015,0.0,1.0,92099,6000.0,1,0,0,1.0,Sedan
4,498013,40-64,female,20-29y,none,working class,0.572184,1.0,after 2015,1.0,1.0,32122,15000.0,0,0,1,0.0,Sedan


In [12]:
class textProperty(object):
    bold = "\033[1m"
    end = "\033[0m"
    
# Classify Numerical vs Categorical Columns 
num_cols = train_df.describe().columns
cat_cols = set(train_df.columns) - set(train_df.describe().columns)
 
print(textProperty.bold + "Numerical Columns:\n" + textProperty.end + "\n" + ', '.join(num_cols) + "\n")
print(textProperty.bold + "Categorical Columns:\n" + textProperty.end + "\n" + ', '.join(cat_cols))

[1mNumerical Columns:
[0m
ID, CREDIT_SCORE, VEHICLE_OWNERSHIP, MARRIED, CHILDREN, POSTAL_CODE, ANNUAL_MILEAGE, SPEEDING_VIOLATIONS, DUIS, PAST_ACCIDENTS, OUTCOME

[1mCategorical Columns:
[0m
TYPE_OF_VEHICLE, AGE, GENDER, DRIVING_EXPERIENCE, VEHICLE_YEAR, INCOME, EDUCATION


In [13]:
train_df.isnull().mean()

ID                     0.0
AGE                    0.0
GENDER                 0.0
DRIVING_EXPERIENCE     0.0
EDUCATION              0.0
INCOME                 0.0
CREDIT_SCORE           0.0
VEHICLE_OWNERSHIP      0.0
VEHICLE_YEAR           0.0
MARRIED                0.0
CHILDREN               0.0
POSTAL_CODE            0.0
ANNUAL_MILEAGE         0.0
SPEEDING_VIOLATIONS    0.0
DUIS                   0.0
PAST_ACCIDENTS         0.0
OUTCOME                0.0
TYPE_OF_VEHICLE        0.0
dtype: float64

In [14]:
# Vehicle_ownership
train_df['GENDER'].value_counts(dropna=False)

male      65317
female    39683
Name: GENDER, dtype: int64

In [15]:
train_df['EDUCATION'].value_counts(dropna=False) # Deal this column

high school    46590
university     31220
none           27190
Name: EDUCATION, dtype: int64

In [16]:
train_df['DRIVING_EXPERIENCE'].value_counts(dropna=False)

20-29y    37493
0-9y      33111
10-19y    25101
30y+       9295
Name: DRIVING_EXPERIENCE, dtype: int64

In [17]:
train_df['AGE'].value_counts(dropna=False)

40-64    33716
65+      32962
26-39    20018
16-25    18304
Name: AGE, dtype: int64

In [18]:
train_df['INCOME'].value_counts(dropna=False) # Is data skewed. but is it obvious?

upper class      51271
working class    24454
middle class     14738
poverty          14537
Name: INCOME, dtype: int64

In [19]:
train_df['TYPE_OF_VEHICLE'].value_counts(dropna=False)

Sports Car    34592
Sedan         28120
HatchBack     24900
SUV           17388
Name: TYPE_OF_VEHICLE, dtype: int64

In [20]:
train_df['VEHICLE_YEAR'].value_counts(dropna=False)

before 2015    57511
after 2015     47489
Name: VEHICLE_YEAR, dtype: int64

In [21]:
train_df['MARRIED'].value_counts(dropna=False)

1.0    61335
0.0    43665
Name: MARRIED, dtype: int64

In [22]:
train_df['CHILDREN'].value_counts(dropna=False)

1.0    54600
0.0    50400
Name: CHILDREN, dtype: int64

In [23]:
train_df['VEHICLE_OWNERSHIP'].value_counts(dropna=False) #tHIS DATA IS HIGHLY SKEWED. DEAL WITH THIS COLUMN

1.0    86839
0.0    18161
Name: VEHICLE_OWNERSHIP, dtype: int64

In [24]:
train_df['ANNUAL_MILEAGE'].value_counts(dropna=False) # Requires further analysis

14000.0    16266
10000.0    16205
13000.0    15750
9000.0     11986
11000.0    10474
12000.0     8564
7000.0      6043
15000.0     5191
6000.0      3652
8000.0      3160
16000.0     2365
3000.0      1515
4000.0      1180
5000.0       921
17000.0      620
2000.0       552
18000.0      315
19000.0      158
20000.0       78
21000.0        5
Name: ANNUAL_MILEAGE, dtype: int64

In [25]:
train_df['SPEEDING_VIOLATIONS'].value_counts(dropna=False) # Requires further analysis

0     69163
1     19867
2      8749
3      2990
4      1688
5       911
6       483
7       351
8       260
9       172
10      109
11       87
13       58
12       56
14       25
15       15
16        7
17        4
18        3
20        1
19        1
Name: SPEEDING_VIOLATIONS, dtype: int64

In [26]:
# Requires further analysis
train_df['DUIS'].value_counts(dropna=False)

0    98246
1     3430
2     1219
3     1069
4      709
5      251
6       76
Name: DUIS, dtype: int64

In [27]:
# Requires further analysis
train_df['PAST_ACCIDENTS'].value_counts(dropna=False)

0     78853
1     13762
2      6454
3      1749
4      1317
5       906
6       534
7       413
8       340
9       236
10      157
11       92
12       75
13       50
14       38
15       24
Name: PAST_ACCIDENTS, dtype: int64

In [28]:

train_df['OUTCOME'].value_counts(dropna=False)

0.0    60622
1.0    44378
Name: OUTCOME, dtype: int64

In [29]:
# AGE, INCOME, GENDER, EDUCATION, DRIVING_EXPERIENCE, TYPE_OF_VEHICLE, VEHICLE_YEAR

In [30]:
# Numerical continuous variable - CREDIT_SCORE
# ID,CREDIT_SCORE, POSTAL_CODE

NameError: name 'ID' is not defined

In [None]:
train_df.head()

In [None]:
# Plot to understand the distribution in LotFrontage column
import matplotlib.pyplot as plt
import seaborn as sns

# Global variable declaration
title_font = {'family': 'Serif', 'color': 'darkblue', 'size': 20,}
label_font = {'family': 'monospace', 'color': 'brown', 'size': 16,}

# Numerical variables analysis for outliers
num_cols = train_df.select_dtypes(exclude='object').columns.tolist()
num_cols.remove('MARRIED')
num_cols.remove('CHILDREN')
num_cols.remove('OUTCOME')
num_cols.remove('VEHICLE_OWNERSHIP')

num_cols.remove('PAST_ACCIDENTS')
num_cols.remove('SPEEDING_VIOLATIONS')
num_cols.remove('DUIS')
num_cols.remove('POSTAL_CODE')
fig = plt.figure(figsize=(15,20))
rows = int((len(num_cols)/2)+1)
cols = 2
inc = 0
for col in num_cols:
    inc = inc + 1
    ax = fig.add_subplot(rows,cols,inc)
    ax.set_title(col,fontsize=10)
    sns.boxplot(train_df[col])
    ax.tick_params(axis='both')
    ax.set_xlabel('')

INFERENCE:
    1. ANNUAL_MILEAGE RANGE: 2000 to 21000  (outliers in 2000 and 20000,21000 range)
    2. 
    
NOMINAL: gender, 
    
ORDINAL:
age, driving_experience, education, income

CONTINUOUS:
CREDIT_SCORE, ANNUAL_MILEAGE

BINARY DISCRETE NUMERICAL:
VEHICLE_OWNERSHIP, MARRIED, CHILDREN

NON-BINARY DISCRETE NUMERICAL:
SPEEDING_VIOLATIONS, DUIS, PAST_ACCIDENTS,  

POSTAL_CODE, ID ??

In [None]:
# Check for normality
def checknormality(col_name,title_txt):
    """
    Function to check normality for column
    
    Args:
        col_name: Column name from the given dataset
        
    Returns:
        None
    """
    plt.figure(figsize=(10,7))
    plt.title(title_txt,title_font)
    plt.xlabel(col_name, label_font)
    sns.distplot(train_df[col_name],color = 'orange')
    plt.show()

In [None]:
# Plot to understand the distribution in income column for normality
checknormality("POSTAL_CODE","PP: Data Distribution")

In [None]:
def analyze_skew(col_lst):
    """
    Function to analyze the values present in different columns
    
    Args: 
        col_lst: List of columns to analyze the unique values present in them
        
    Returns: 
        None
    """
    for col in col_lst: 
        records = train_df[col].value_counts(dropna=False,normalize=True).mul(100).round(2).sort_values(ascending=False)
        display(records.to_frame().T)

In [None]:
analyze_skew(train_df.columns)

In [None]:
train_df.head()

### DUMMY VARIABLES FOR TRAIN & TEST_DF

In [31]:
dum_cols = ['INCOME', 'AGE','EDUCATION','DRIVING_EXPERIENCE','TYPE_OF_VEHICLE','GENDER','VEHICLE_YEAR']
df_cat_cols = pd.get_dummies(data=train_df[dum_cols], columns= dum_cols,drop_first = True)
df_cat_cols.head()

Unnamed: 0,INCOME_poverty,INCOME_upper class,INCOME_working class,AGE_26-39,AGE_40-64,AGE_65+,EDUCATION_none,EDUCATION_university,DRIVING_EXPERIENCE_10-19y,DRIVING_EXPERIENCE_20-29y,DRIVING_EXPERIENCE_30y+,TYPE_OF_VEHICLE_SUV,TYPE_OF_VEHICLE_Sedan,TYPE_OF_VEHICLE_Sports Car,GENDER_male,VEHICLE_YEAR_before 2015
0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0
1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1
2,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,1
3,0,1,0,0,1,0,0,0,0,1,0,0,1,0,1,1
4,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0


In [32]:
dum_test_df = pd.get_dummies(data=test_df[dum_cols], columns= dum_cols,drop_first = True)
dum_test_df.head()

Unnamed: 0,INCOME_poverty,INCOME_upper class,INCOME_working class,AGE_26-39,AGE_40-64,AGE_65+,EDUCATION_none,EDUCATION_university,DRIVING_EXPERIENCE_10-19y,DRIVING_EXPERIENCE_20-29y,DRIVING_EXPERIENCE_30y+,TYPE_OF_VEHICLE_SUV,TYPE_OF_VEHICLE_Sedan,TYPE_OF_VEHICLE_Sports Car,GENDER_male,VEHICLE_YEAR_before 2015
0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,1,0
1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1
2,0,1,0,0,1,0,1,0,1,0,0,1,0,0,1,0
3,0,1,0,1,0,0,0,0,0,1,0,1,0,0,1,0
4,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0


In [33]:
print(dum_test_df.shape)
print(df_cat_cols.shape)

(45000, 16)
(105000, 16)


In [34]:
inter_train_df = train_df.drop(columns=dum_cols,axis=1)
inter_test_df = test_df.drop(columns=dum_cols,axis=1)


In [35]:
new_train_df = pd.concat([inter_train_df,df_cat_cols],axis=1)
new_train_df.shape

(105000, 27)

In [36]:
new_test_df = pd.concat([inter_test_df,dum_test_df],axis=1)
new_test_df.shape

(45000, 26)

In [None]:
# Plot the correlation matrix for all variables
corr_matrix = new_train_df.corr()
plt.figure(figsize=(20,10))
sns.heatmap(corr_matrix,annot=True)
plt.show()

### SPLIT INTO TRAIN AND VALIDATION SET


In [37]:
# Split into independent and dependent variables
X = new_train_df.drop('OUTCOME',axis=1)
y = new_train_df['OUTCOME']
print(X.shape,y.shape)

(105000, 26) (105000,)


In [38]:
# Split into train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,train_size = 0.7, random_state = 100)
print("Train dataset:",X_train.shape)
print("Test dataset:",X_test.shape)

Train dataset: (73500, 26)
Test dataset: (31500, 26)


In [39]:
# Scaling numerical columns for train dataset
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
num_cols = X_train.select_dtypes(include=['float64','int64']).columns
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_train.head()

Unnamed: 0,ID,CREDIT_SCORE,VEHICLE_OWNERSHIP,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,...,EDUCATION_none,EDUCATION_university,DRIVING_EXPERIENCE_10-19y,DRIVING_EXPERIENCE_20-29y,DRIVING_EXPERIENCE_30y+,TYPE_OF_VEHICLE_SUV,TYPE_OF_VEHICLE_Sedan,TYPE_OF_VEHICLE_Sports Car,GENDER_male,VEHICLE_YEAR_before 2015
41575,1.177489,1.008126,0.455643,-1.188031,-1.039217,-0.465401,-1.027366,0.962342,-0.219669,1.041035,...,0,1,1,0,0,0,1,0,0,0
20113,-0.319762,-0.561903,0.455643,0.841729,0.962263,-0.465401,-0.69121,-0.487994,-0.219669,-0.390644,...,1,0,0,0,0,0,1,0,1,0
97947,0.556973,-0.452617,0.455643,0.841729,-1.039217,-0.465401,-0.355054,-0.487994,-0.219669,0.325196,...,1,0,0,0,0,1,0,0,1,1
60886,-0.849443,1.724859,0.455643,-1.188031,0.962263,-0.465401,1.325726,0.962342,6.607241,-0.390644,...,0,1,0,0,0,0,0,0,1,0
56259,0.980015,2.346702,0.455643,0.841729,-1.039217,0.943138,-0.018898,0.962342,1.487059,0.325196,...,0,1,0,1,0,1,0,0,0,1


In [40]:
# Apply Scaling to test dataset
num_cols = X_test.select_dtypes(include=['float64','int64']).columns
X_test[num_cols] = scaler.transform(X_test[num_cols])
X_test.head()

Unnamed: 0,ID,CREDIT_SCORE,VEHICLE_OWNERSHIP,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,...,EDUCATION_none,EDUCATION_university,DRIVING_EXPERIENCE_10-19y,DRIVING_EXPERIENCE_20-29y,DRIVING_EXPERIENCE_30y+,TYPE_OF_VEHICLE_SUV,TYPE_OF_VEHICLE_Sedan,TYPE_OF_VEHICLE_Sports Car,GENDER_male,VEHICLE_YEAR_before 2015
53620,-0.416194,-0.812902,0.455643,-1.188031,0.962263,-0.465401,1.325726,1.68751,-0.219669,-0.390644,...,0,1,0,1,0,0,1,0,0,1
60370,-0.713459,0.696816,-2.194701,0.841729,0.962263,0.710961,0.98957,-0.487994,-0.219669,0.325196,...,1,0,0,1,0,0,1,0,0,0
44788,1.782839,-0.352202,0.455643,0.841729,-1.039217,-0.465401,-1.363522,-0.487994,-0.219669,-0.390644,...,1,0,0,1,0,0,0,1,1,0
29000,0.569906,-0.337783,0.455643,0.841729,-1.039217,-0.196232,-0.018898,-0.487994,-0.219669,-0.390644,...,0,0,0,1,0,0,0,0,1,1
19049,-1.323538,0.25237,-2.194701,0.841729,-1.039217,-0.465401,-2.035834,-0.487994,-0.219669,-0.390644,...,1,0,0,0,0,0,0,0,1,1


In [43]:
# Apply Scaling to test dataset
# num_cols = X_test.select_dtypes(include=['float64','int64']).columns
new_test_df[num_cols] = scaler.transform(new_test_df[num_cols])
new_test_df.head()

Unnamed: 0,ID,CREDIT_SCORE,VEHICLE_OWNERSHIP,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,...,EDUCATION_none,EDUCATION_university,DRIVING_EXPERIENCE_10-19y,DRIVING_EXPERIENCE_20-29y,DRIVING_EXPERIENCE_30y+,TYPE_OF_VEHICLE_SUV,TYPE_OF_VEHICLE_Sedan,TYPE_OF_VEHICLE_Sports Car,GENDER_male,VEHICLE_YEAR_before 2015
0,-0.150251,1.537174,0.455643,-1.188031,0.962263,-0.465401,-0.355054,-0.487994,-0.219669,-0.390644,...,1,0,1,0,0,0,0,0,1,0
1,-1.272999,-1.87057,0.455643,-1.188031,-1.039217,0.959035,0.653414,-0.487994,-0.219669,-0.390644,...,0,0,0,1,0,0,1,0,0,1
2,0.389405,0.132416,-2.194701,0.841729,-1.039217,-0.465401,0.317258,-0.487994,-0.219669,-0.390644,...,1,0,1,0,0,1,0,0,1,0
3,-0.687872,0.631769,0.455643,0.841729,0.962263,-0.465401,1.661882,0.237174,-0.219669,-0.390644,...,0,0,0,1,0,1,0,0,1,0
4,0.474736,1.109694,-2.194701,-1.188031,0.962263,4.426925,0.98957,0.962342,-0.219669,-0.390644,...,1,0,1,0,0,0,0,0,1,0


In [45]:
from sklearn.linear_model import LogisticRegression


### Training

In [None]:
slr = LogisticRegression(random_state=0)

In [46]:
slr.fit(X_train,y_train)

LogisticRegression(random_state=0)

In [64]:
new_train_df['OUTCOME'].value_counts(dropna=False)

0.0    60622
1.0    44378
Name: OUTCOME, dtype: int64

### Predict with test data

In [47]:
y_pred = slr.predict(X_train)

In [49]:
from sklearn.metrics import log_loss
log_loss(y_test,slr.predict_proba(X_test))

0.6807772770178983

In [54]:
y_test_pred = slr.predict(X_test)

In [55]:
pd.DataFrame(y_test_pred).to_csv("submission_28102022.csv",index=False,header=False)

### Evaluate using unseen dataset

In [78]:
y_exa = slr.predict_proba(new_test_df)

In [79]:
pd.DataFrame(y_exa).to_csv("submission_4.csv",index=False,header=False)

In [82]:
y_exa.shape

(45000, 2)

In [53]:
new_test_df.head()

Unnamed: 0,ID,CREDIT_SCORE,VEHICLE_OWNERSHIP,MARRIED,CHILDREN,POSTAL_CODE,ANNUAL_MILEAGE,SPEEDING_VIOLATIONS,DUIS,PAST_ACCIDENTS,...,EDUCATION_none,EDUCATION_university,DRIVING_EXPERIENCE_10-19y,DRIVING_EXPERIENCE_20-29y,DRIVING_EXPERIENCE_30y+,TYPE_OF_VEHICLE_SUV,TYPE_OF_VEHICLE_Sedan,TYPE_OF_VEHICLE_Sports Car,GENDER_male,VEHICLE_YEAR_before 2015
0,-0.150251,1.537174,0.455643,-1.188031,0.962263,-0.465401,-0.355054,-0.487994,-0.219669,-0.390644,...,1,0,1,0,0,0,0,0,1,0
1,-1.272999,-1.87057,0.455643,-1.188031,-1.039217,0.959035,0.653414,-0.487994,-0.219669,-0.390644,...,0,0,0,1,0,0,1,0,0,1
2,0.389405,0.132416,-2.194701,0.841729,-1.039217,-0.465401,0.317258,-0.487994,-0.219669,-0.390644,...,1,0,1,0,0,1,0,0,1,0
3,-0.687872,0.631769,0.455643,0.841729,0.962263,-0.465401,1.661882,0.237174,-0.219669,-0.390644,...,0,0,0,1,0,1,0,0,1,0
4,0.474736,1.109694,-2.194701,-1.188031,0.962263,4.426925,0.98957,0.962342,-0.219669,-0.390644,...,1,0,1,0,0,0,0,0,1,0


## Baseline XGB classifier


In [66]:
import xgboost as xgb

xgb_cl = xgb.XGBClassifier()

In [67]:
xgb_cl.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [69]:
preds = xgb_cl.predict(X_test)


In [70]:
# Score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, preds)

0.561047619047619

In [76]:
unknown_pred1 = xgb_cl.predict_proba(new_test_df)

In [77]:
pd.DataFrame(unknown_pred1).to_csv("submission_3.csv",index=False,header=False)