# Load datasets into pandas dataframe with the necessary modules

In [1]:
#import necessary modules
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier,ExtraTreesRegressor,RandomForestClassifier,GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [2]:
# Read the training data from the CSV file into a DataFrame
df = pd.read_csv("train_data.csv")

In [3]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0


In [4]:
# Remove the identifier, year, occupancy-date and geo-code columns before modelling
unused_columns = ["Customer Id", "YearOfObservation", "Date_of_Occupancy", "Geo_Code"]
df.drop(columns=unused_columns, inplace=True)

In [5]:
# Shape as (rows, columns) after the column drop above
df.shape

(7160, 10)

### DATASET INFORMATION

In [6]:
# Summarise each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Insured_Period      7160 non-null   float64
 1   Residential         7160 non-null   int64  
 2   Building_Painted    7160 non-null   object 
 3   Building_Fenced     7160 non-null   object 
 4   Garden              7153 non-null   object 
 5   Settlement          7160 non-null   object 
 6   Building Dimension  7054 non-null   float64
 7   Building_Type       7160 non-null   int64  
 8   NumberOfWindows     7160 non-null   object 
 9   Claim               7160 non-null   int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 559.5+ KB


In [7]:
# Preview the first rows again now that four columns have been removed
df.head()

Unnamed: 0,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,NumberOfWindows,Claim
0,1.0,0,N,V,V,U,290.0,1,.,0
1,1.0,0,V,N,O,R,490.0,1,4,0
2,1.0,0,N,V,V,U,595.0,1,.,0
3,1.0,0,V,V,V,U,2840.0,1,.,0
4,1.0,0,V,N,O,R,680.0,1,3,0


In [8]:
# Still 7160 rows and 10 columns: only columns have been dropped so far, no rows
df.shape

(7160, 10)

In [9]:
# Display the frame (10 columns); the last rows shown have a missing Building Dimension
df

Unnamed: 0,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,NumberOfWindows,Claim
0,1.000000,0,N,V,V,U,290.0,1,.,0
1,1.000000,0,V,N,O,R,490.0,1,4,0
2,1.000000,0,N,V,V,U,595.0,1,.,0
3,1.000000,0,V,V,V,U,2840.0,1,.,0
4,1.000000,0,V,N,O,R,680.0,1,3,0
...,...,...,...,...,...,...,...,...,...,...
7155,1.000000,1,V,V,V,U,,1,.,0
7156,1.000000,0,V,V,V,U,,2,.,1
7157,0.038251,0,V,V,V,U,,1,.,0
7158,1.000000,0,V,V,V,U,,1,.,0


In [10]:
# Discard every row that has at least one missing value
df.dropna(axis="index", how="any", inplace=True)

In [11]:
# List the distinct raw values in the NumberOfWindows column
df["NumberOfWindows"].unique()

array(['   .', '4', '3', '2', '5', '>=10', '6', '7', '9', '8', '1'],
      dtype=object)

### From the output above, the column should range from 0 to 10, so we will replace the dot (.) with 0 and ">=10" with 10.

In [12]:
# The raw column uses a space-padded "." as a placeholder; this notebook's stated
# choice is to treat it as 0. Match the dot with any amount of surrounding
# whitespace instead of hard-coding exactly three leading spaces, so a differently
# padded file does not slip through and break the later integer cast.
df["NumberOfWindows"] = df["NumberOfWindows"].replace(
    to_replace=r"^\s*\.\s*$", value="0", regex=True
)

In [13]:
# Collapse the open-ended ">=10" bucket to the string "10"
df["NumberOfWindows"] = df["NumberOfWindows"].replace(to_replace=">=10", value="10")

In [14]:
# Cast the column from strings to integers (requires every value to be a digit string)
df["NumberOfWindows"]=df["NumberOfWindows"].astype(int)

In [15]:
# Check the distinct values again after the integer cast
df["NumberOfWindows"].unique()

array([ 0,  4,  3,  2,  5, 10,  6,  7,  9,  8,  1])

In [16]:
# Preview the frame with NumberOfWindows now numeric
df.head()

Unnamed: 0,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,NumberOfWindows,Claim
0,1.0,0,N,V,V,U,290.0,1,0,0
1,1.0,0,V,N,O,R,490.0,1,4,0
2,1.0,0,N,V,V,U,595.0,1,0,0
3,1.0,0,V,V,V,U,2840.0,1,0,0
4,1.0,0,V,N,O,R,680.0,1,3,0


### Encoding refers to the process of converting data from one format to another. This can include converting data from one data type to another, such as converting text data to numerical data for use in a machine learning model.

### Since these columns each contain only two categories, we will perform the encoding with a simple mapping.

In [17]:
# Lookup tables that turn each single-letter category code into a 0/1 flag.
# NOTE(review): the meaning of the letters (N/V, U/R, V/O) is not stated in this
# notebook - confirm against the dataset's data dictionary.
build_fence_value = dict(N=1, V=0)
sett_value = dict(U=1, R=0)
Gard_value = dict(V=1, O=0)

In [18]:
# Encode each two-category column as 0/1 using its lookup table.
binary_encodings = {
    "Settlement": sett_value,
    "Garden": Gard_value,
    "Building_Fenced": build_fence_value,
    # Building_Painted uses the same N/V codes, so it shares the fence lookup
    "Building_Painted": build_fence_value,
}

# Series.map yields NaN for any value that is missing from the lookup: an
# unexpected category, or the 0/1 values left behind if this cell is run a second
# time. Validate every column first and only then write back, so a failure leaves
# df untouched instead of silently filling columns with NaN.
encoded_columns = {}
for column, lookup in binary_encodings.items():
    encoded = df[column].map(lookup)
    unmapped = df.loc[encoded.isna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Cannot encode column {column!r}: unexpected values {list(unmapped)}"
        )
    encoded_columns[column] = encoded

for column, encoded in encoded_columns.items():
    df[column] = encoded

In [19]:
# Preview the frame after encoding the four categorical columns as 0/1
df.head()

Unnamed: 0,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,NumberOfWindows,Claim
0,1.0,0,1,0,1,1,290.0,1,0,0
1,1.0,0,0,1,0,0,490.0,1,4,0
2,1.0,0,1,0,1,1,595.0,1,0,0
3,1.0,0,0,0,1,1,2840.0,1,0,0
4,1.0,0,0,1,0,0,680.0,1,3,0


## Descriptive Statistics

In [20]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,NumberOfWindows,Claim
count,7047.0,7047.0,7047.0,7047.0,7047.0,7047.0,7047.0,7047.0,7047.0,7047.0
mean,0.910667,0.304811,0.252306,0.510998,0.488861,0.488861,1882.219526,2.190436,2.214843,0.228892
std,0.238592,0.46036,0.434367,0.499915,0.499911,0.499911,2276.965895,0.942457,2.523878,0.420149
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,0.997268,0.0,0.0,0.0,0.0,0.0,528.0,2.0,0.0,0.0
50%,1.0,0.0,0.0,1.0,0.0,0.0,1083.0,2.0,2.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,2288.5,3.0,4.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,20940.0,4.0,10.0,1.0


# BASELINE

In [21]:
# Baseline accuracy: the share of the most frequent Claim value, i.e. the score
# obtained by always predicting the majority class
class_shares = df["Claim"].value_counts(normalize=True)
base = class_shares.max()
round(base, ndigits=3)

0.771

# SPLIT THE DATA

In [22]:
# Separate the target column from the predictor columns
y = df["Claim"]
X = df.drop(columns=["Claim"])

In [23]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
# NOTE(review): the split is not stratified although only about 23% of rows have
# Claim == 1 - consider stratify=y if class balance across the splits matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# BUILD MODEL

In [24]:
# Gradient boosting classifier with library-default hyperparameters and a fixed seed
model = GradientBoostingClassifier(random_state=42)

In [25]:
# Train on the training split only
model.fit(X=X_train, y=y_train)

GradientBoostingClassifier(random_state=42)

# EVALUATE THE MODEL

In [26]:
# Predicted Claim labels for the held-out rows
pred = model.predict(X=X_test)

In [27]:
from sklearn.metrics import accuracy_score

In [28]:
# accuracy_score is documented as (y_true, y_pred): ground-truth labels go first.
# Accuracy is symmetric, so the value is unchanged, but keeping the documented
# order avoids a silent mistake if this call is later swapped for an asymmetric
# metric such as precision or recall.
score = accuracy_score(y_test, pred)

In [29]:
# Show the test accuracy to three decimal places (compare with the majority-class baseline)
round(score, ndigits=3)

0.797