In [None]:
## EDA Notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Source of the customer dataset (CSV hosted on GitHub).
DATA_URL = "https://raw.githubusercontent.com/siglimumuni/Datasets/master/customer-data.csv"

# Load the raw data into the DataFrame used by every cell below.
df = pd.read_csv(DATA_URL)

In [None]:
# Dimensions of the dataset as (number of rows, number of columns).
df.shape

In [None]:
# Preview the first rows to get a feel for the columns and their values.
df.head()

In [None]:
# Column names, non-null counts and dtypes in one summary.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Mean credit score within each income group (used to decide how to impute credit_score).
df.groupby('income')['credit_score'].mean()

In [None]:
# The mean credit scores differ between income groups, so missing values in the
# "credit_score" column are imputed with the mean credit score of the row's income group.

# Function to impute missing credit scores per income group
def impute_creditscore(income_classes, data=None):
    """Fill missing ``credit_score`` values with the mean of their income group.

    Parameters
    ----------
    income_classes : iterable of str, or str
        Income group labels to process. A single label may be passed as a
        plain string; it is treated as a one-element list.
    data : pandas.DataFrame, optional
        Frame with ``income`` and ``credit_score`` columns. It is modified in
        place. When omitted, the notebook-level ``df`` is used, which keeps
        existing ``impute_creditscore(groups)`` calls working unchanged.

    Returns
    -------
    None
        The frame is updated in place.

    Notes
    -----
    Rows whose income group is not listed in ``income_classes`` are left
    untouched, so their missing credit scores remain missing.
    """
    if data is None:
        # Fall back to the notebook-level DataFrame.
        data = df

    # A bare string would otherwise be iterated character by character.
    if isinstance(income_classes, str):
        income_classes = [income_classes]

    for income_class in income_classes:
        mask = data["income"] == income_class
        # Series.mean() skips NaN, so this is the mean of the known scores only.
        group_mean = data.loc[mask, "credit_score"].mean()
        data.loc[mask, "credit_score"] = data.loc[mask, "credit_score"].fillna(group_mean)

In [None]:
# Income groups whose missing credit scores should be imputed.
income_groups = [
    'poverty',
    'middle class',
    'working class',
    'upper class',
]
impute_creditscore(income_groups)

# Re-check the per-column missing-value counts after imputation.
df.isna().sum()

In [None]:
# Mean annual mileage within each driving-experience group.
df.groupby('driving_experience')['annual_mileage'].mean()

In [None]:
# Impute missing annual mileage with the overall column mean.
mean_mileage = df['annual_mileage'].mean()

# Assign the filled column back explicitly. Calling fillna(..., inplace=True) on
# df['annual_mileage'] operates on an intermediate object: pandas 2.x warns about
# it and, with Copy-on-Write enabled, df itself is never updated.
df['annual_mileage'] = df['annual_mileage'].fillna(mean_mileage)

# Confirm the missing-value counts after imputation.
df.isnull().sum()

In [None]:
# Drop the 'id' and 'postal_code' columns; no later cell references them.
# drop() returns a new DataFrame, so the result is assigned back to df rather than
# mutating it with inplace=True. errors='ignore' keeps this cell re-runnable:
# running it again after the columns are gone no longer raises KeyError.
df = df.drop(columns=['id', 'postal_code'], errors='ignore')

In [None]:
# Univariate analysis: a single column or variable.
# Number of clients for each value of 'gender'.
df['gender'].value_counts()

In [None]:
# Univariate analysis: categorical, unordered data.
# Bar chart of the number of clients per gender.
ax = sns.countplot(x='gender', data=df)
ax.set_title("Number of clients per gender")
ax.set_ylabel("Number of clients")
plt.show()

In [None]:
# Pie chart of the share of clients in each income group.
plt.figure(figsize=(6, 6))
data = df['income'].value_counts(normalize=True)

# Take the labels from the counted data itself. value_counts() orders the groups
# by frequency, so a hand-written label list can attach the wrong name to a wedge.
labels = data.index
colors = sns.color_palette('pastel')
plt.pie(data, labels=labels, colors=colors, autopct='%.0f%%')
plt.title("Proportion of clients per income group")
plt.show()

In [None]:
# Number of clients for each education level.
df['education'].value_counts()

In [None]:
# Univariate analysis: categorical, ordered data.
# The bars are drawn in an explicit order instead of the order found in the data.
education_levels = ['university', 'high school', 'none']

fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='education', order=education_levels, color='skyblue', ax=ax)
ax.set_title("Number of clients per education level")
plt.show()

In [None]:
# Univariate analysis: numerical data.
# Numeric columns are usually summarised with statistics such as the mean, mode,
# max, min and standard deviation.
df['credit_score'].describe()

In [None]:
# Histogram (40 bins) of credit scores with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x='credit_score', bins=40, kde=True, ax=ax)
ax.set(title='Distribution of Credit Score', ylabel='Number of Clients')
plt.show()

In [None]:
# Histogram (20 bins) of annual mileage with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x='annual_mileage', bins=20, kde=True, ax=ax)
ax.set(title='Distribution of Annual Mileage', ylabel='Number of Clients')
plt.show()

In [None]:
# Bivariate analysis: two variables.
# Types: numerical vs numerical, numerical vs categorical, categorical vs categorical.
# Numerical vs numerical: scatter plot of annual mileage against speeding violations.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter('annual_mileage', 'speeding_violations', data=df)
ax.set_title('Annual Mileage vs Speeding Violations')
ax.set_xlabel('Annual Mileage')
ax.set_ylabel('Speeding Violations')
plt.show()

In [None]:
# Pairwise correlations between the three driving-record columns.
violation_columns = ['speeding_violations', 'DUIs', 'past_accidents']
corr_matrix = df[violation_columns].corr()
corr_matrix

In [None]:
# Annotated heatmap of the correlation matrix computed above.
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(corr_matrix, annot=True, cmap='Reds', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Mean annual mileage for each value of 'outcome'.
df.groupby('outcome')['annual_mileage'].mean()

In [None]:
# Bivariate analysis: numerical vs categorical.
# Box plot of annual mileage split by outcome.
ax = sns.boxplot(x='outcome', y='annual_mileage', data=df)
ax.set_title('Annual Mileage by Outcome')
plt.show()

In [None]:
# Density histograms of credit score, one step outline per outcome value.
ax = sns.histplot(
    data=df,
    x='credit_score',
    hue='outcome',
    element='step',
    stat='density',
)
ax.set_title('Distribution of Credit Score per Outcome')
plt.show()

In [None]:
# Bivariate analysis: categorical vs categorical.
# The relationship between two categorical variables can be examined with a
# contingency table or a stacked bar chart.
# Encode the outcome as 1/0 so that a group mean of 'claim_rate' is the share of
# rows whose outcome equals True.
has_claim = df['outcome'].eq(True)
df['claim_rate'] = np.where(has_claim, 1, 0)
df['claim_rate'].value_counts()

In [None]:
# Bar chart of the mean claim rate within each age group.
claim_rate_by_age = df.groupby('age')['claim_rate'].mean()

fig, ax = plt.subplots(figsize=(8, 5))
claim_rate_by_age.plot.bar(ax=ax)
ax.set_title('Claim Rate by Age Group')
plt.show()

In [None]:
# Bar chart of the mean claim rate for each vehicle-year category.
claim_rate_by_vehicle_year = df.groupby('vehicle_year')['claim_rate'].mean()

fig, ax = plt.subplots(figsize=(8, 5))
claim_rate_by_vehicle_year.plot.bar(ax=ax)
ax.set_title('Claim Rate by Vehicle Year')
plt.show()

In [None]:
# Side-by-side 100%-stacked bars: share of each outcome within every education
# level (left panel) and income group (right panel).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

for i, col in enumerate(['education', 'income']):
    # multiple='fill' normalises each bar to 1, so bar segments show proportions.
    sns.histplot(data=df, ax=axes[i], x=col, hue='outcome', stat='probability', multiple='fill', shrink=0.8, alpha=0.7)
    axes[i].set_title(f'Distribution of {col.capitalize()} by Outcome')

# Keep the two panels from overlapping and render the figure explicitly, like the
# other plotting cells, so it also appears when run outside an inline backend.
plt.tight_layout()
plt.show()

In [None]:
# Multivariate analysis: relationships between three or more variables.
# Mean claim rate for every combination of education level (rows) and income group (columns).
edu_income = df.pivot_table(
    index='education',
    columns='income',
    values='claim_rate',
    aggfunc='mean',
)
edu_income

In [None]:
# Annotated heatmap of the education x income claim-rate table.
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(edu_income, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Claim Rate by Education and Income')
plt.show()

In [None]:
# Mean claim rate for every combination of driving experience (rows) and marital status (columns).
driv_married = df.pivot_table(
    index='driving_experience',
    columns='married',
    values='claim_rate',
    aggfunc='mean',
)

fig, ax = plt.subplots(figsize=(8, 5))
# NOTE(review): center=0.117 is a hard-coded midpoint for the diverging colormap;
# its origin is not shown in this notebook - confirm it is the intended reference value.
sns.heatmap(driv_married, annot=True, cmap='coolwarm', center=0.117, ax=ax)
ax.set_title('Claim Rate by Driving Experience and Marital Status')
plt.show()

In [None]:
# Mean claim rate for every combination of gender (rows) and 'children' value (columns).
# aggfunc='mean' is pivot_table's default; it is spelled out to match the other pivots.
gender_children = df.pivot_table(
    index='gender',
    columns='children',
    values='claim_rate',
    aggfunc='mean',
)

fig, ax = plt.subplots(figsize=(8, 5))
# NOTE(review): center=0.117 is a hard-coded midpoint for the diverging colormap;
# its origin is not shown in this notebook - confirm it is the intended reference value.
sns.heatmap(gender_children, annot=True, cmap='coolwarm', center=0.117, ax=ax)
ax.set_title("Gender and Family Status")
plt.show()