In [203]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm as tqdm
from collections import Counter

In [204]:
train_real = pd.read_csv('/content/Train.csv')
test_real = pd.read_csv('/content/Test.csv')
sample_submission = pd.read_csv('/content/SampleSubmission.csv')
data_description = pd.read_csv('/content/VariableDefinitions.csv')

In [205]:
train = train_real.copy()
test = test_real.copy()

In [206]:
train

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
0,ID_0040R73,2010-05-14,2011-05-13,Male,30,2010-05-14,1,Saloon,Black,TOYOTA,,,Car Classic,0
1,ID_0046BNK,2010-11-29,2011-11-28,Female,79,2010-11-29,1,JEEP,Grey,TOYOTA,,,Car Classic,1
2,ID_005QMC3,2010-03-21,2011-03-20,Male,43,2010-03-21,1,Saloon,Red,TOYOTA,,,Car Classic,0
3,ID_0079OHW,2010-08-21,2011-08-20,Male,2,2010-08-21,1,,,,,,CarSafe,0
4,ID_00BRP63,2010-08-29,2010-12-31,Entity,20,2010-08-29,3,,,,Lagos,Lagos,Muuve,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12074,ID_ZZA1SES,2010-05-25,2011-05-24,Female,30,2010-05-25,1,,Black,Range Rover,Ibeju/Lekki,Ibeju-Lekki,Car Classic,1
12075,ID_ZZDAC3K,2010-10-03,2011-10-02,Female,59,2010-10-03,1,,,,,,Car Classic,0
12076,ID_ZZIU2XC,2010-10-10,2011-10-08,Male,34,2010-10-10,1,,,,,,CarSafe,0
12077,ID_ZZRQ1NF,2010-02-27,2011-02-26,,120,2010-02-27,2,,White,TOYOTA,Victoria Island,Lagos,CVTP,0


Let's Start by dropping the ID Column

In [207]:
train.drop(['ID'], axis=1, inplace=True)
test.drop(['ID'], axis=1, inplace=True)

In [208]:
train.describe()

Unnamed: 0,Age,No_Pol,target
count,12079.0,12079.0,12079.0
mean,42.234539,1.307227,0.120457
std,97.492565,0.733085,0.325509
min,-6099.0,1.0,0.0
25%,35.0,1.0,0.0
50%,41.0,1.0,0.0
75%,50.0,1.0,0.0
max,320.0,10.0,1.0


In [209]:
age_median = np.median(train['Age'])  # Value of age_median is 41.0 
for i in range(len(train)):
    if train['Age'][i] > 80 or train['Age'][i] < 18:
        train['Age'][i] = age_median
for i in range(len(test)):
    if test['Age'][i] > 80 or test['Age'][i] < 18:
        test['Age'][i] = age_median

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [210]:
train.isnull().sum()

Policy Start Date            0
Policy End Date              0
Gender                     359
Age                          0
First Transaction Date       0
No_Pol                       0
Car_Category              3738
Subject_Car_Colour        6962
Subject_Car_Make          2476
LGA_Name                  6476
State                     6488
ProductName                  0
target                       0
dtype: int64

--------------------------------------------------------------------------------
Since, The colour of the car is not (directly) related to whether the person would purchase the insurance or not, as well as due to a lot of null values (Almost 50%). Therefore I am dropping this column.
--------------------------------------------------------------------------------

In [211]:
train.drop(['Subject_Car_Colour'], axis=1, inplace=True)
test.drop(['Subject_Car_Colour'], axis=1, inplace=True)

In [212]:
train

Unnamed: 0,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Make,LGA_Name,State,ProductName,target
0,2010-05-14,2011-05-13,Male,30,2010-05-14,1,Saloon,TOYOTA,,,Car Classic,0
1,2010-11-29,2011-11-28,Female,79,2010-11-29,1,JEEP,TOYOTA,,,Car Classic,1
2,2010-03-21,2011-03-20,Male,43,2010-03-21,1,Saloon,TOYOTA,,,Car Classic,0
3,2010-08-21,2011-08-20,Male,41,2010-08-21,1,,,,,CarSafe,0
4,2010-08-29,2010-12-31,Entity,20,2010-08-29,3,,,Lagos,Lagos,Muuve,1
...,...,...,...,...,...,...,...,...,...,...,...,...
12074,2010-05-25,2011-05-24,Female,30,2010-05-25,1,,Range Rover,Ibeju/Lekki,Ibeju-Lekki,Car Classic,1
12075,2010-10-03,2011-10-02,Female,59,2010-10-03,1,,,,,Car Classic,0
12076,2010-10-10,2011-10-08,Male,34,2010-10-10,1,,,,,CarSafe,0
12077,2010-02-27,2011-02-26,,41,2010-02-27,2,,TOYOTA,Victoria Island,Lagos,CVTP,0


In [213]:
sum(train['First Transaction Date'] == train['Policy Start Date'])

12079

--------------------------------------------------------------------------------
Another Observation from the above analysis shows that Policy Start Date and First Transaction Date are same. Therefore we should drop one of them otherwise it would act as a reduntant feature and add unnessacary complexity to our model.
--------------------------------------------------------------------------------

In [214]:
train.drop(['First Transaction Date'], axis=1, inplace=True)
test.drop(['First Transaction Date'], axis=1, inplace=True)

In [215]:
train

Unnamed: 0,Policy Start Date,Policy End Date,Gender,Age,No_Pol,Car_Category,Subject_Car_Make,LGA_Name,State,ProductName,target
0,2010-05-14,2011-05-13,Male,30,1,Saloon,TOYOTA,,,Car Classic,0
1,2010-11-29,2011-11-28,Female,79,1,JEEP,TOYOTA,,,Car Classic,1
2,2010-03-21,2011-03-20,Male,43,1,Saloon,TOYOTA,,,Car Classic,0
3,2010-08-21,2011-08-20,Male,41,1,,,,,CarSafe,0
4,2010-08-29,2010-12-31,Entity,20,3,,,Lagos,Lagos,Muuve,1
...,...,...,...,...,...,...,...,...,...,...,...
12074,2010-05-25,2011-05-24,Female,30,1,,Range Rover,Ibeju/Lekki,Ibeju-Lekki,Car Classic,1
12075,2010-10-03,2011-10-02,Female,59,1,,,,,Car Classic,0
12076,2010-10-10,2011-10-08,Male,34,1,,,,,CarSafe,0
12077,2010-02-27,2011-02-26,,41,2,,TOYOTA,Victoria Island,Lagos,CVTP,0


In [216]:
train['Gender'].value_counts()

Male            7617
Female          3327
Entity           277
Joint Gender     223
NOT STATED       175
NO GENDER         66
SEX               35
Name: Gender, dtype: int64

In [217]:
train['Gender'].dtype

dtype('O')

In [218]:
train['Gender'].fillna('Male', inplace=True)

In [219]:
train['Gender'].value_counts()

Male            7976
Female          3327
Entity           277
Joint Gender     223
NOT STATED       175
NO GENDER         66
SEX               35
Name: Gender, dtype: int64

In [220]:
train['Gender'].replace(['Entity', 'Joint Gender', 'NOT STATED', 'NO GENDER', 'SEX'], 'Not Stated', inplace=True)
test['Gender'].replace(['Entity', 'Joint Gender', 'NOT STATED', 'NO GENDER', 'SEX'], 'Not Stated', inplace=True)

In [221]:
train['Gender'].value_counts()

Male          7976
Female        3327
Not Stated     776
Name: Gender, dtype: int64

In [222]:
train.isnull().sum()

Policy Start Date       0
Policy End Date         0
Gender                  0
Age                     0
No_Pol                  0
Car_Category         3738
Subject_Car_Make     2476
LGA_Name             6476
State                6488
ProductName             0
target                  0
dtype: int64

In [223]:
train['Car_Category'].value_counts()

Saloon                     6034
JEEP                       2021
Truck                        97
Bus                          54
Mini Bus                     36
Pick Up                      30
Motorcycle                   17
Mini Van                     13
Sedan                        12
Wagon                        10
Shape Of Vehicle Chasis       6
Station 4 Wheel               5
Van                           3
Pick Up > 3 Tons              1
CAMRY CAR HIRE                1
Tipper Truck                  1
Name: Car_Category, dtype: int64

In [224]:
train['Car_Category'].replace(['Shape Of Vehicle Chasis', 'Station 4 Wheel', 'Van', 'Pick Up > 3 Tons', 'CAMRY CAR HIRE', 'Tipper Truck'], 'Saloon', inplace=True)
test['Car_Category'].replace(['Shape Of Vehicle Chasis', 'Station 4 Wheel', 'Van', 'Pick Up > 3 Tons', 'CAMRY CAR HIRE', 'Tipper Truck'], 'Saloon', inplace=True)

In [225]:
train['Car_Category'].fillna(method='ffill', inplace=True)
test['Car_Category'].fillna(method='ffill', inplace=True)

In [226]:
train.isnull().sum()

Policy Start Date       0
Policy End Date         0
Gender                  0
Age                     0
No_Pol                  0
Car_Category            0
Subject_Car_Make     2476
LGA_Name             6476
State                6488
ProductName             0
target                  0
dtype: int64

In [227]:
train['Subject_Car_Make'].value_counts().sort_values()[:45]

Jincheng           1
Buik               1
Rols Royce         1
KA                 1
ABG                1
Tata               1
Raston             1
COMMANDER          1
CHANGAN            1
Yamaha             1
Ashok Leyland      1
ZOYTE              1
Motorcycle         1
MG                 1
Bajaj              1
BRILLIANCE         1
Geely              1
Howo               1
Caddillac          1
REXTON             1
Lincoln            1
Black              2
Fiat               2
Astra              2
FOTON              2
Wrangler Jeep      2
Chrysler           2
Hummer             2
Seat               2
Grand Cherokee     2
Innson             2
MINI COOPER        3
LIBERTY            3
Jaguar             4
Subaru             4
Renault            4
GAC                5
GMC                5
Man                5
Land Rover.        6
Dodge              6
Opel               7
Isuzu              8
Suzuki             8
As Attached       11
Name: Subject_Car_Make, dtype: int64

In [228]:
train['Subject_Car_Make'].value_counts().sort_values()

Jincheng         1
Buik             1
Rols Royce       1
KA               1
ABG              1
              ... 
Hyundai        457
Mercedes       521
Lexus          604
Honda         1043
TOYOTA        4975
Name: Subject_Car_Make, Length: 74, dtype: int64

In [229]:
# First 44 values have count as 1. we are gonna replace that.  
rep = train['Subject_Car_Make'].value_counts().sort_values().index[:44]
wit = train['Subject_Car_Make'].value_counts().sort_values().index[-5:]

In [230]:
rep

Index(['Jincheng', 'Buik', 'Rols Royce', 'KA', 'ABG', 'Tata', 'Raston',
       'COMMANDER', 'CHANGAN', 'Yamaha', 'Ashok Leyland', 'ZOYTE',
       'Motorcycle', 'MG', 'Bajaj', 'BRILLIANCE', 'Geely', 'Howo', 'Caddillac',
       'REXTON', 'Lincoln', 'Black', 'Fiat', 'Astra', 'FOTON', 'Wrangler Jeep',
       'Chrysler', 'Hummer', 'Seat', 'Grand Cherokee', 'Innson', 'MINI COOPER',
       'LIBERTY', 'Jaguar', 'Subaru', 'Renault', 'GAC', 'GMC', 'Man',
       'Land Rover.', 'Dodge', 'Opel', 'Isuzu', 'Suzuki'],
      dtype='object')

In [231]:
#train['Subject_Car_Make'].replace(rep, np.random.choice(wit)).value_counts()
#train['Subject_Car_Make'].replace('.', np.random.choice(wit)).value_counts()

In [232]:
for i in range(len(train)):
    if train['Subject_Car_Make'][i] in rep:
        train['Subject_Car_Make'][i] = np.random.choice(wit)

for i in range(len(test)):
    if test['Subject_Car_Make'][i] in rep:
        test['Subject_Car_Make'][i] = np.random.choice(wit)

train['Subject_Car_Make'].fillna(method='ffill', inplace=True)
test['Subject_Car_Make'].fillna(method='ffill', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [233]:
train['Subject_Car_Make'].value_counts()

TOYOTA         6247
Honda          1383
Lexus           800
Mercedes        675
Hyundai         589
Kia             426
Ford            354
Nissan          331
Iveco           153
Volkswagen      151
Range Rover     140
Mitsubishi      114
Land Rover       84
BMW              78
ACURA            78
Mazda            60
Peugeot          54
.                51
Volvo            47
Mack             41
Audi             35
Infiniti         31
Pontiac          29
Chevrolet        25
DAF              23
Skoda            21
Jeep             19
Porsche          14
As Attached      13
Scania           13
Name: Subject_Car_Make, dtype: int64

In [234]:
train.isnull().sum()

Policy Start Date       0
Policy End Date         0
Gender                  0
Age                     0
No_Pol                  0
Car_Category            0
Subject_Car_Make        0
LGA_Name             6476
State                6488
ProductName             0
target                  0
dtype: int64

In [235]:
train['LGA_Name'].value_counts()

Victoria Island     1162
Ikeja                401
Surulere             288
Abuja Municipal      232
Lagos Mainland       222
                    ... 
Ijora                  1
Akoko North West       1
IBA                    1
Yorro                  1
Ughelli North          1
Name: LGA_Name, Length: 258, dtype: int64

In [236]:
np.where(train['LGA_Name'].value_counts().sort_values()==1)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71]),)

In [237]:
train['LGA_Name'].value_counts().sort_values()[:90]

Ughelli North           1
kumbotso                1
Ogba/Egbema/Ndoni       1
Ovia SouthWest          1
Isoko North             1
                       ..
Orile-Iganmu            2
ONDO                    2
Jibia                   2
Gombe                   2
Calabar Municipality    2
Name: LGA_Name, Length: 90, dtype: int64

In [238]:
rep = train['LGA_Name'].value_counts().sort_values().index[:90]
wit = train['LGA_Name'].value_counts().sort_values().index[-5:]

In [239]:
for i in range(len(train)):
    if train['LGA_Name'][i] in rep:
        train['LGA_Name'][i] = np.random.choice(wit)

for i in range(len(test)):
    if test['LGA_Name'][i] in rep:
        test['LGA_Name'][i] = np.random.choice(wit)

train['LGA_Name'].fillna(method='ffill', inplace=True)
test['LGA_Name'].fillna(method='ffill', inplace=True)

train['LGA_Name'].fillna('Lagos', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [240]:
train['LGA_Name'].value_counts()

Victoria Island    2569
Ikeja               888
Surulere            654
Abuja Municipal     592
Lagos Mainland      518
                   ... 
Aboh-Mbaise           2
 IFAKO                2
AJAO ESTATE           2
Ado-Ekiti             2
Umuahia South         2
Name: LGA_Name, Length: 168, dtype: int64

In [241]:
train.isnull().sum()

Policy Start Date       0
Policy End Date         0
Gender                  0
Age                     0
No_Pol                  0
Car_Category            0
Subject_Car_Make        0
LGA_Name                0
State                6488
ProductName             0
target                  0
dtype: int64

Let's just drop the state column for now.

In [242]:
train.drop(['State'], axis=1, inplace=True)
test.drop(['State'], axis=1, inplace=True)

In [243]:
train

Unnamed: 0,Policy Start Date,Policy End Date,Gender,Age,No_Pol,Car_Category,Subject_Car_Make,LGA_Name,ProductName,target
0,2010-05-14,2011-05-13,Male,30,1,Saloon,TOYOTA,Lagos,Car Classic,0
1,2010-11-29,2011-11-28,Female,79,1,JEEP,TOYOTA,Lagos,Car Classic,1
2,2010-03-21,2011-03-20,Male,43,1,Saloon,TOYOTA,Lagos,Car Classic,0
3,2010-08-21,2011-08-20,Male,41,1,Saloon,TOYOTA,Lagos,CarSafe,0
4,2010-08-29,2010-12-31,Not Stated,20,3,Saloon,TOYOTA,Lagos,Muuve,1
...,...,...,...,...,...,...,...,...,...,...
12074,2010-05-25,2011-05-24,Female,30,1,Saloon,Range Rover,Ibeju/Lekki,Car Classic,1
12075,2010-10-03,2011-10-02,Female,59,1,Saloon,Range Rover,Ibeju/Lekki,Car Classic,0
12076,2010-10-10,2011-10-08,Male,34,1,Saloon,Range Rover,Ibeju/Lekki,CarSafe,0
12077,2010-02-27,2011-02-26,Male,41,2,Saloon,TOYOTA,Victoria Island,CVTP,0


In [244]:
test.isnull().sum()

Policy Start Date      0
Policy End Date        0
Gender               156
Age                    0
No_Pol                 0
Car_Category           5
Subject_Car_Make       0
LGA_Name               0
ProductName            0
dtype: int64

In [245]:
test['Gender'].fillna(method='ffill', inplace=True)
test['Car_Category'].fillna(method='ffill', inplace=True)
test['Car_Category'].fillna('Saloon', inplace=True)

In [246]:
test.isnull().sum()

Policy Start Date    0
Policy End Date      0
Gender               0
Age                  0
No_Pol               0
Car_Category         0
Subject_Car_Make     0
LGA_Name             0
ProductName          0
dtype: int64

In [247]:
test

Unnamed: 0,Policy Start Date,Policy End Date,Gender,Age,No_Pol,Car_Category,Subject_Car_Make,LGA_Name,ProductName
0,2010-04-24,2011-03-27,Not Stated,25,1,Saloon,Iveco,Victoria Island,CVTP
1,2010-01-01,2010-12-31,Not Stated,41,4,Saloon,TOYOTA,Victoria Island,Muuve
2,2010-10-23,2011-10-22,Female,46,1,Saloon,Ford,Abuja Municipal,Car Classic
3,2010-10-14,2011-10-13,Male,32,1,Saloon,Ford,Kosofe,Car Classic
4,2010-09-16,2010-12-31,Male,41,4,Saloon,TOYOTA,Victoria Island,Muuve
...,...,...,...,...,...,...,...,...,...
5172,2010-07-18,2011-07-17,Male,48,1,Saloon,Honda,Lagos Island,CarSafe
5173,2010-12-04,2011-12-03,Male,50,1,Saloon,Honda,Victoria Island,Car Classic
5174,2010-09-24,2011-09-23,Male,41,1,Saloon,Honda,Victoria Island,CarSafe
5175,2010-07-17,2011-07-16,Female,52,1,Saloon,TOYOTA,Victoria Island,Car Classic


In [248]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12079 entries, 0 to 12078
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Policy Start Date  12079 non-null  object
 1   Policy End Date    12079 non-null  object
 2   Gender             12079 non-null  object
 3   Age                12079 non-null  int64 
 4   No_Pol             12079 non-null  int64 
 5   Car_Category       12079 non-null  object
 6   Subject_Car_Make   12079 non-null  object
 7   LGA_Name           12079 non-null  object
 8   ProductName        12079 non-null  object
 9   target             12079 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 943.8+ KB


In [249]:
train['Policy Start Date'] = pd.to_datetime(train['Policy Start Date'])
train['Policy End Date'] = pd.to_datetime(train['Policy End Date'])

test['Policy Start Date'] = pd.to_datetime(test['Policy Start Date'])
test['Policy End Date'] = pd.to_datetime(test['Policy End Date'])

In [250]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12079 entries, 0 to 12078
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Policy Start Date  12079 non-null  datetime64[ns]
 1   Policy End Date    12079 non-null  datetime64[ns]
 2   Gender             12079 non-null  object        
 3   Age                12079 non-null  int64         
 4   No_Pol             12079 non-null  int64         
 5   Car_Category       12079 non-null  object        
 6   Subject_Car_Make   12079 non-null  object        
 7   LGA_Name           12079 non-null  object        
 8   ProductName        12079 non-null  object        
 9   target             12079 non-null  int64         
dtypes: datetime64[ns](2), int64(3), object(5)
memory usage: 943.8+ KB


In [251]:
train

Unnamed: 0,Policy Start Date,Policy End Date,Gender,Age,No_Pol,Car_Category,Subject_Car_Make,LGA_Name,ProductName,target
0,2010-05-14,2011-05-13,Male,30,1,Saloon,TOYOTA,Lagos,Car Classic,0
1,2010-11-29,2011-11-28,Female,79,1,JEEP,TOYOTA,Lagos,Car Classic,1
2,2010-03-21,2011-03-20,Male,43,1,Saloon,TOYOTA,Lagos,Car Classic,0
3,2010-08-21,2011-08-20,Male,41,1,Saloon,TOYOTA,Lagos,CarSafe,0
4,2010-08-29,2010-12-31,Not Stated,20,3,Saloon,TOYOTA,Lagos,Muuve,1
...,...,...,...,...,...,...,...,...,...,...
12074,2010-05-25,2011-05-24,Female,30,1,Saloon,Range Rover,Ibeju/Lekki,Car Classic,1
12075,2010-10-03,2011-10-02,Female,59,1,Saloon,Range Rover,Ibeju/Lekki,Car Classic,0
12076,2010-10-10,2011-10-08,Male,34,1,Saloon,Range Rover,Ibeju/Lekki,CarSafe,0
12077,2010-02-27,2011-02-26,Male,41,2,Saloon,TOYOTA,Victoria Island,CVTP,0


In [252]:
(train['Policy End Date'] - train['Policy Start Date']).value_counts()

364 days     9542
363 days      566
181 days      160
365 days       83
182 days       63
             ... 
214 days        1
3651 days       1
157 days        1
100 days        1
158 days        1
Length: 301, dtype: int64

In [253]:
train['Policy Start Month'] = 0
test['Policy Start Month'] = 0
train['Policy Start Year'] = 0
test['Policy Start Year'] = 0


train['Policy End Month'] = 0
test['Policy End Month'] = 0
train['Policy End Year'] = 0
test['Policy End Year'] = 0


for i in tqdm(range(len(train))):
    sts = train['Policy Start Date'][i].month
    ets = train['Policy End Date'][i].month 
    sts1 = train['Policy Start Date'][i].year
    ets1 = train['Policy End Date'][i].year 

    train['Policy Start Month'][i] = sts
    train['Policy End Month'][i] = ets
    train['Policy Start Year'][i] = sts1
    train['Policy End Year'][i] = ets1

for i in tqdm(range(len(test))):
    sts = test['Policy Start Date'][i].month
    ets = test['Policy End Date'][i].month 
    sts1 = test['Policy Start Date'][i].year
    ets1 = test['Policy End Date'][i].year 
    
    test['Policy Start Month'][i] = sts
    test['Policy End Month'][i] = ets
    test['Policy Start Year'][i] = sts1
    test['Policy End Year'][i] = ets1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 12079/12079 [00:04<00:00, 2567.10it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.

In [254]:
train

Unnamed: 0,Policy Start Date,Policy End Date,Gender,Age,No_Pol,Car_Category,Subject_Car_Make,LGA_Name,ProductName,target,Policy Start Month,Policy Start Year,Policy End Month,Policy End Year
0,2010-05-14,2011-05-13,Male,30,1,Saloon,TOYOTA,Lagos,Car Classic,0,5,2010,5,2011
1,2010-11-29,2011-11-28,Female,79,1,JEEP,TOYOTA,Lagos,Car Classic,1,11,2010,11,2011
2,2010-03-21,2011-03-20,Male,43,1,Saloon,TOYOTA,Lagos,Car Classic,0,3,2010,3,2011
3,2010-08-21,2011-08-20,Male,41,1,Saloon,TOYOTA,Lagos,CarSafe,0,8,2010,8,2011
4,2010-08-29,2010-12-31,Not Stated,20,3,Saloon,TOYOTA,Lagos,Muuve,1,8,2010,12,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12074,2010-05-25,2011-05-24,Female,30,1,Saloon,Range Rover,Ibeju/Lekki,Car Classic,1,5,2010,5,2011
12075,2010-10-03,2011-10-02,Female,59,1,Saloon,Range Rover,Ibeju/Lekki,Car Classic,0,10,2010,10,2011
12076,2010-10-10,2011-10-08,Male,34,1,Saloon,Range Rover,Ibeju/Lekki,CarSafe,0,10,2010,10,2011
12077,2010-02-27,2011-02-26,Male,41,2,Saloon,TOYOTA,Victoria Island,CVTP,0,2,2010,2,2011


Now that we're done with the date time fields, as of now. therefore we can drop them for now.

In [255]:
train.drop(['Policy Start Date', 'Policy End Date'], axis=1, inplace=True)
test.drop(['Policy Start Date', 'Policy End Date'], axis=1, inplace=True)

In [256]:
train

Unnamed: 0,Gender,Age,No_Pol,Car_Category,Subject_Car_Make,LGA_Name,ProductName,target,Policy Start Month,Policy Start Year,Policy End Month,Policy End Year
0,Male,30,1,Saloon,TOYOTA,Lagos,Car Classic,0,5,2010,5,2011
1,Female,79,1,JEEP,TOYOTA,Lagos,Car Classic,1,11,2010,11,2011
2,Male,43,1,Saloon,TOYOTA,Lagos,Car Classic,0,3,2010,3,2011
3,Male,41,1,Saloon,TOYOTA,Lagos,CarSafe,0,8,2010,8,2011
4,Not Stated,20,3,Saloon,TOYOTA,Lagos,Muuve,1,8,2010,12,2010
...,...,...,...,...,...,...,...,...,...,...,...,...
12074,Female,30,1,Saloon,Range Rover,Ibeju/Lekki,Car Classic,1,5,2010,5,2011
12075,Female,59,1,Saloon,Range Rover,Ibeju/Lekki,Car Classic,0,10,2010,10,2011
12076,Male,34,1,Saloon,Range Rover,Ibeju/Lekki,CarSafe,0,10,2010,10,2011
12077,Male,41,2,Saloon,TOYOTA,Victoria Island,CVTP,0,2,2010,2,2011


In [257]:
train['target'].value_counts()

0    10624
1     1455
Name: target, dtype: int64

In [258]:
all_data = pd.concat((train, test), axis=0)
all_data

Unnamed: 0,Gender,Age,No_Pol,Car_Category,Subject_Car_Make,LGA_Name,ProductName,target,Policy Start Month,Policy Start Year,Policy End Month,Policy End Year
0,Male,30,1,Saloon,TOYOTA,Lagos,Car Classic,0.0,5,2010,5,2011
1,Female,79,1,JEEP,TOYOTA,Lagos,Car Classic,1.0,11,2010,11,2011
2,Male,43,1,Saloon,TOYOTA,Lagos,Car Classic,0.0,3,2010,3,2011
3,Male,41,1,Saloon,TOYOTA,Lagos,CarSafe,0.0,8,2010,8,2011
4,Not Stated,20,3,Saloon,TOYOTA,Lagos,Muuve,1.0,8,2010,12,2010
...,...,...,...,...,...,...,...,...,...,...,...,...
5172,Male,48,1,Saloon,Honda,Lagos Island,CarSafe,,7,2010,7,2011
5173,Male,50,1,Saloon,Honda,Victoria Island,Car Classic,,12,2010,12,2011
5174,Male,41,1,Saloon,Honda,Victoria Island,CarSafe,,9,2010,9,2011
5175,Female,52,1,Saloon,TOYOTA,Victoria Island,Car Classic,,7,2010,7,2011


In [259]:
'''from sklearn.model_selection import train_test_split
X = train.drop(['target'], axis=1)
y = train['target']
train_x, test_x, train_y, test_y = train_test_split(X, y, shuffle=True, random_state=42, test_size=0.25, stratify=y)'''

"from sklearn.model_selection import train_test_split\nX = train.drop(['target'], axis=1)\ny = train['target']\ntrain_x, test_x, train_y, test_y = train_test_split(X, y, shuffle=True, random_state=42, test_size=0.25, stratify=y)"

In [260]:
'''from sklearn.preprocessing import LabelEncoder
le_gender = LabelEncoder()
train_x['Gender'] = le_gender.fit_transform(train_x['Gender'])
text_x['Gender'] = le_gender.transform(train_x['Gender'])
test['Gender'] = le_gender.transform(test['Gender'])

le_car_category = LabelEncoder()
train_x['Car_Category'] = le_car_category.fit_transform(train_x['Car_Category'])
text_x['Car_Category'] = le_car_category.transform(train_x['Car_Category'])
test['Car_Category'] = le_car_category.transform(test['Car_Category'])

le_car_make = LabelEncoder()
train_x['Subject_Car_Make'] = le_car_make.fit_transform(train_x['Subject_Car_Make'])
text_x['Subject_Car_Make'] = le_car_make.transform(train_x['Subject_Car_Make'])
test['Subject_Car_Make'] = le_car_make.transform(test['Subject_Car_Make'])

le_lga = LabelEncoder()
train_x['LGA_Name'] = le_lga.fit_transform(train_x['LGA_Name'])
text_x['LGA_Name'] = le_lga.transform(train_x['LGA_Name'])
test['LGA_Name'] = le_lga.transform(test['LGA_Name'])

le_product_name = LabelEncoder()
train_x['ProductName'] = le_product_name.fit_transform(train_x['ProductName'])
text_x['ProductName'] = le_product_name.transform(train_x['ProductName'])
test['ProductName'] = le_product_name.transform(test['ProductName'])'''

"from sklearn.preprocessing import LabelEncoder\nle_gender = LabelEncoder()\ntrain_x['Gender'] = le_gender.fit_transform(train_x['Gender'])\ntext_x['Gender'] = le_gender.transform(train_x['Gender'])\ntest['Gender'] = le_gender.transform(test['Gender'])\n\nle_car_category = LabelEncoder()\ntrain_x['Car_Category'] = le_car_category.fit_transform(train_x['Car_Category'])\ntext_x['Car_Category'] = le_car_category.transform(train_x['Car_Category'])\ntest['Car_Category'] = le_car_category.transform(test['Car_Category'])\n\nle_car_make = LabelEncoder()\ntrain_x['Subject_Car_Make'] = le_car_make.fit_transform(train_x['Subject_Car_Make'])\ntext_x['Subject_Car_Make'] = le_car_make.transform(train_x['Subject_Car_Make'])\ntest['Subject_Car_Make'] = le_car_make.transform(test['Subject_Car_Make'])\n\nle_lga = LabelEncoder()\ntrain_x['LGA_Name'] = le_lga.fit_transform(train_x['LGA_Name'])\ntext_x['LGA_Name'] = le_lga.transform(train_x['LGA_Name'])\ntest['LGA_Name'] = le_lga.transform(test['LGA_Name']

In [261]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ['Gender', 'Car_Category', 'Subject_Car_Make', 'LGA_Name', 'ProductName']
for i in cat_cols:
    le = LabelEncoder()
    all_data[i] = le.fit_transform(all_data[i])


In [262]:
all_data

Unnamed: 0,Gender,Age,No_Pol,Car_Category,Subject_Car_Make,LGA_Name,ProductName,target,Policy Start Month,Policy Start Year,Policy End Month,Policy End Year
0,1,30,1,6,28,112,1,0.0,5,2010,5,2011
1,0,79,1,1,28,112,1,1.0,11,2010,11,2011
2,1,43,1,6,28,112,1,0.0,3,2010,3,2011
3,1,41,1,6,28,112,5,0.0,8,2010,8,2011
4,2,20,3,6,28,112,8,1.0,8,2010,12,2010
...,...,...,...,...,...,...,...,...,...,...,...,...
5172,1,48,1,6,9,113,5,,7,2010,7,2011
5173,1,50,1,6,9,172,1,,12,2010,12,2011
5174,1,41,1,6,9,172,5,,9,2010,9,2011
5175,0,52,1,6,28,172,1,,7,2010,7,2011


In [263]:
train = all_data.iloc[:len(train)]
test = all_data.iloc[len(train):]

In [264]:
test.drop(['target'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [265]:
train

Unnamed: 0,Gender,Age,No_Pol,Car_Category,Subject_Car_Make,LGA_Name,ProductName,target,Policy Start Month,Policy Start Year,Policy End Month,Policy End Year
0,1,30,1,6,28,112,1,0.0,5,2010,5,2011
1,0,79,1,1,28,112,1,1.0,11,2010,11,2011
2,1,43,1,6,28,112,1,0.0,3,2010,3,2011
3,1,41,1,6,28,112,5,0.0,8,2010,8,2011
4,2,20,3,6,28,112,8,1.0,8,2010,12,2010
...,...,...,...,...,...,...,...,...,...,...,...,...
12074,0,30,1,6,25,75,1,1.0,5,2010,5,2011
12075,0,59,1,6,25,75,1,0.0,10,2010,10,2011
12076,1,34,1,6,25,75,5,0.0,10,2010,10,2011
12077,1,41,2,6,28,172,0,0.0,2,2010,2,2011


In [266]:
from sklearn.model_selection import train_test_split
X = train.drop(['target'], axis=1)
y = train['target']
train_x_orig, test_x_orig, train_y_orig, test_y_orig = train_test_split(X, y, shuffle=True, random_state=42, test_size=0.25, stratify=y)

In [281]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(sampling_strategy=1)
train_x, train_y = ros.fit_resample(train_x_orig, train_y_orig)



In [282]:
Counter(train_y)

Counter({0.0: 7968, 1.0: 7968})

In [283]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [284]:
lr = LogisticRegression(max_iter=1500, C=1, class_weight={0: 0.5, 1: 1})
lr.fit(train_x, train_y)

pred_lr = lr.predict(test_x)
print(confusion_matrix(test_y, pred_lr))
print('Accuracy_Score: ', accuracy_score(test_y, pred_lr))
print('F1_score: ', f1_score(test_y, pred_lr))

[[1112 1544]
 [  21  343]]
Accuracy_Score:  0.4817880794701987
F1_score:  0.3047534429142603


In [269]:
dtc = DecisionTreeClassifier()
dtc.fit(train_x, train_y)

pred_dtc = dtc.predict(test_x)
print(confusion_matrix(test_y, pred_dtc))
print('Accuracy_Score: ', accuracy_score(test_y, pred_dtc))
print('F1_score: ', f1_score(test_y, pred_dtc))

[[2297  359]
 [ 244  120]]
Accuracy_Score:  0.8003311258278145
F1_score:  0.2846975088967972


In [270]:
nb = GaussianNB()
nb.fit(train_x, train_y)

pred_nb = nb.predict(test_x)
print(confusion_matrix(test_y, pred_nb))
print('Accuracy_Score: ', accuracy_score(test_y, pred_nb))
print('F1_score: ', f1_score(test_y, pred_nb))

[[ 183 2473]
 [  20  344]]
Accuracy_Score:  0.17450331125827814
F1_score:  0.21628418736246463


In [271]:
svc = SVC(kernel='sigmoid', degree=5, cache_size=500, tol=0.0001)
svc.fit(train_x, train_y)

pred_svc = svc.predict(test_x)
print(confusion_matrix(test_y, pred_svc))
print('Accuracy_Score: ', accuracy_score(test_y, pred_svc))
print('F1_score: ', f1_score(test_y, pred_svc))

[[2656    0]
 [ 364    0]]
Accuracy_Score:  0.8794701986754967
F1_score:  0.0


In [272]:
rfc = RandomForestClassifier(n_estimators=1000, criterion='gini')
rfc.fit(train_x, train_y)

pred_rfc = rfc.predict(test_x)
print(confusion_matrix(test_y, pred_rfc))
print('Accuracy_Score: ', accuracy_score(test_y, pred_rfc))
print('F1_score: ', f1_score(test_y, pred_rfc))

[[2459  197]
 [ 260  104]]
Accuracy_Score:  0.8486754966887418
F1_score:  0.31278195488721805


Hyperparameter Optimization

In [None]:
rfc1 = RandomForestClassifier()

search_space = dict()
search_space['n_estimators'] = [100, 300, 500, 600, 800, 1000, 1200]
search_space['criterion'] = ['gini', 'entropy']
search_space['max_depth'] = [5, 8, 15, 25, 30]
search_space['min_samples_split'] = [2, 5, 10, 15, 100]
search_space['min_samples_leaf'] = [1, 2, 5, 10]

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

search = RandomizedSearchCV(rfc1, search_space, n_iter=100, scoring='f1_micro', n_jobs=-1, cv=cv, random_state=1)

result = search.fit(train_x, train_y)



In [None]:
from sklearn.metrics import SCORERS
sorted(SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

In [273]:
xgb = GradientBoostingClassifier(n_estimators=1000)
xgb.fit(train_x, train_y)

pred_xgb = xgb.predict(test_x)
print(confusion_matrix(test_y, pred_xgb))
print('Accuracy_Score: ', accuracy_score(test_y, pred_xgb))
print('F1_score: ', f1_score(test_y, pred_xgb))

[[2228  428]
 [ 184  180]]
Accuracy_Score:  0.7973509933774835
F1_score:  0.37037037037037035


In [274]:
vc = VotingClassifier(estimators=[
                                  ('lr', lr), ('rfc', rfc)
], voting='hard')
vc.fit(train_x, train_y)

pred_vc = vc.predict(test_x)
print(confusion_matrix(test_y, pred_vc))
print('Accuracy_Score: ', accuracy_score(test_y, pred_vc))
print('F1_score: ', f1_score(test_y, pred_vc))

[[2519  137]
 [ 292   72]]
Accuracy_Score:  0.8579470198675496
F1_score:  0.25130890052356025


In [90]:
bc = BaggingClassifier(base_estimator=RandomForestClassifier(), n_estimators=100)
bc.fit(train_x, train_y)

pred_bc = bc.predict(test_x)
print(confusion_matrix(test_y, pred_bc))
print('Accuracy_Score: ', accuracy_score(test_y, pred_bc))
print('F1_score: ', f1_score(test_y, pred_bc))

[[2395  261]
 [ 233  131]]
Accuracy_Score:  0.8364238410596027
F1_score:  0.34656084656084657


In [275]:
test_prediction = lr.predict(test)

In [276]:
Counter(test_prediction)

Counter({0.0: 3160, 1.0: 2017})

In [277]:
sample_submission

Unnamed: 0,ID,target
0,ID_009D84L,0
1,ID_01DO2EQ,0
2,ID_01QM0NU,0
3,ID_024NJLZ,0
4,ID_02BYET3,0
...,...,...
5172,ID_ZYXX5AF,0
5173,ID_ZYYOZ5L,0
5174,ID_ZZ1GTKD,0
5175,ID_ZZDXQSI,0


In [278]:
sample_submission['target'] = test_prediction

In [279]:
sample_submission

Unnamed: 0,ID,target
0,ID_009D84L,1.0
1,ID_01DO2EQ,1.0
2,ID_01QM0NU,0.0
3,ID_024NJLZ,1.0
4,ID_02BYET3,0.0
...,...,...
5172,ID_ZYXX5AF,0.0
5173,ID_ZYYOZ5L,0.0
5174,ID_ZZ1GTKD,0.0
5175,ID_ZZDXQSI,0.0


In [280]:
sample_submission.to_csv('predictions.csv', index=False)