In [1]:
# Libraries

from pathlib import Path

import pandas as pd

In [2]:
# Loading data from files into DataFrames.
#
# Both CSV files are resolved against one configurable directory instead of a
# user-specific absolute path, so the notebook is not tied to a single machine.
# NOTE(review): assumes customer_details.csv sits in the same directory as
# customer_policy_details.csv -- point DATA_DIR elsewhere if it does not.

DATA_DIR = Path(".")

customer_details = pd.read_csv(DATA_DIR / "customer_details.csv")
policy_details = pd.read_csv(DATA_DIR / "customer_policy_details.csv")


# Assigning the column names used by the rest of the notebook.
# This overwrites whatever names read_csv took from the files; the lists must
# have exactly as many entries as the files have columns.

customer_details.columns = ["customer_id", "Gender", "age","driving licence present","region code",
                                         "previously insured", "vehicle age", "vehicle damage"]
policy_details.columns = ["customer_id", "annual premium (in Rs)", "sales channel code", "vintage", "response"]

In [3]:
# Report the number of missing values in every customer_details column.

customer_columns = [
    "customer_id",
    "Gender",
    "age",
    "driving licence present",
    "region code",
    "previously insured",
    "vehicle age",
    "vehicle damage",
]
missing_per_column = customer_details[customer_columns].isnull().sum()
for column_name, missing in missing_per_column.items():
    print(f"The count of null values in '{column_name}': ", missing)

# Rows without a 'customer_id' are dropped from both frames: an identifier has
# no meaningful mean or mode, so it cannot be imputed like the other columns.

id_subset = ["customer_id"]
customer_details = customer_details.dropna(subset=id_subset)
policy_details = policy_details.dropna(subset=id_subset)

The count of null values in 'customer_id':  386
The count of null values in 'Gender':  368
The count of null values in 'age':  368
The count of null values in 'driving licence present':  393
The count of null values in 'region code':  392
The count of null values in 'previously insured':  381
The count of null values in 'vehicle age':  381
The count of null values in 'vehicle damage':  407


In [4]:
# Replacing null values by mean or mode for customer_details.
#
# Mode is the most frequent value in a column and is used for every column
# that is treated as a category. Mean is used only for "age".

# NOTE(review): the mean of "age" is generally not a whole number, so imputed
# rows end up in their own group in any group-by on age -- consider the median
# or a rounded mean if ages must stay integral.
customer_details["age"] = customer_details["age"].fillna(customer_details["age"].mean())

# "driving licence present" is one-hot encoded with pd.get_dummies later in
# this notebook, i.e. it is used as a category just like "previously insured".
# Filling it with the mean would introduce a fractional value that is not one
# of the existing categories, so it is filled with the mode as well.
mode_filled_columns = [
    "Gender",
    "driving licence present",
    "region code",
    "previously insured",
    "vehicle age",
    "vehicle damage",
]
for column in mode_filled_columns:
    most_frequent = customer_details[column].mode()[0]
    customer_details[column] = customer_details[column].fillna(most_frequent)


In [5]:
# Fill the gaps in policy_details: mean for the two measured quantities,
# mode (most frequent value) for the two coded columns.

for numeric_column in ("annual premium (in Rs)", "vintage"):
    numeric_values = policy_details[numeric_column]
    policy_details[numeric_column] = numeric_values.fillna(numeric_values.mean())

for coded_column in ("sales channel code", "response"):
    coded_values = policy_details[coded_column]
    policy_details[coded_column] = coded_values.fillna(coded_values.mode()[0])

In [16]:
# Outlier check with the IQR (inter-quartile range) rule.
#
# An outlier is a data point that lies far outside the overall distribution of
# the column. With Q1 = 25th percentile, Q3 = 75th percentile and
# IQR = Q3 - Q1, a value counts as an outlier when it is below Q1 - 1.5 * IQR
# or above Q3 + 1.5 * IQR.
#
# For each column two numbers are printed: the count of low outliers, then the
# count of high outliers. Q1, Q3 and IQR are left holding the values of the
# last column checked ("vintage").
#
# The quartiles come from Series.quantile, which yields the same 25% / 75%
# figures that describe() reports, without recomputing the whole summary.
# The former `.isoutlier` attribute access was removed: pandas Series has no
# such attribute, so that line raised AttributeError.

columns_to_check = (
    (customer_details, "age"),
    (policy_details, "annual premium (in Rs)"),
    (policy_details, "vintage"),
)

for frame, column in columns_to_check:
    values = frame[column]

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    print((values < (Q1 - IQR * 1.5)).sum())
    print((values > (Q3 + IQR * 1.5)).sum())

0
0
0
10332
0
0


In [17]:
# Trim leading and trailing whitespace from the text columns
# (str.strip() removes whitespace by default).

for text_column in ("Gender", "vehicle age", "vehicle damage"):
    customer_details[text_column] = customer_details[text_column].str.strip()

In [18]:
# Normalise the text columns to upper case so that differently-cased spellings
# of the same value collapse into one category.

customer_details = customer_details.assign(
    **{
        text_column: customer_details[text_column].str.upper()
        for text_column in ("Gender", "vehicle age", "vehicle damage")
    }
)

In [19]:
# One-hot encode the categorical columns: pd.get_dummies returns one indicator
# column per distinct value of the input column.

gender_dummy, va_dummy, vd_dummy, dlp_dummy, pi_dummy = (
    pd.get_dummies(customer_details[categorical_column])
    for categorical_column in (
        "Gender",
        "vehicle age",
        "vehicle damage",
        "driving licence present",
        "previously insured",
    )
)

response_dummy = pd.get_dummies(policy_details["response"])

In [20]:
# Remove fully duplicated rows from both frames, keeping the last occurrence
# of each duplicated row.

customer_details, policy_details = (
    frame.drop_duplicates(keep="last")
    for frame in (customer_details, policy_details)
)

In [21]:
# Join customer and policy records on their shared 'customer_id' key.
# how="inner" is the pandas default: only ids present in both frames are kept.
merge_df = customer_details.merge(policy_details, how="inner", on="customer_id")

In [22]:
# Average annual premium broken down three ways:
# by gender, by customer age and by vehicle age.

premium_column = "annual premium (in Rs)"

mean_prem_by_gender, mean_prem_by_age, mean_prem_by_va = (
    merge_df.groupby([group_key])[premium_column].mean()
    for group_key in ("Gender", "age", "vehicle age")
)

for premium_summary in (mean_prem_by_gender, mean_prem_by_age, mean_prem_by_va):
    print(premium_summary)

Gender
FEMALE    30492.028478
MALE      30623.619273
Name: annual premium (in Rs), dtype: float64
age
20.0    26924.620173
21.0    30564.475810
22.0    30823.778102
23.0    30688.606298
24.0    31183.802890
            ...     
81.0    31201.571429
82.0    37705.379310
83.0    31012.727273
84.0    35440.818182
85.0    29792.363636
Name: annual premium (in Rs), Length: 67, dtype: float64
vehicle age
1-2 YEAR     30524.629840
< 1 YEAR     30115.716367
> 2 YEARS    35657.520845
Name: annual premium (in Rs), dtype: float64


In [23]:
# Relationship between customer age and annual premium, measured with the
# Pearson correlation coefficient (the default method of Series.corr).

age_values = merge_df["age"]
premium_values = merge_df["annual premium (in Rs)"]
Correlation_coefficient = age_values.corr(premium_values, method="pearson")

print(Correlation_coefficient)

# Interpretation: a coefficient close to 0 means there is essentially no
# *linear* relationship between age and annual premium. Judge it by its
# absolute value (values near -1 are as strong as values near +1), and note
# that Pearson's r does not rule out a non-linear dependence.

0.06771515986613928
