In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the customer dataset CSV; it is downloaded each time this cell runs.
DATA_URL = "https://raw.githubusercontent.com/siglimumuni/Datasets/master/customer-data.csv"
df = pd.read_csv(DATA_URL)

In [None]:
# Quick size check: (number of rows, number of columns).
df.shape

In [None]:
# Preview the first five rows to see what the columns look like.
df.head()

In [None]:
# Column dtypes and non-null counts: a first hint at which columns have missing values.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Average credit score within each income group (missing scores are skipped by mean()).
df.groupby('income')['credit_score'].mean()

In [None]:
# The mean credit score differs widely between income groups, as we suspected.
# We can therefore impute the missing values in the "credit_score" column using the mean credit score of each income group.

#function to impute missing values
def impute_creditscore(income_classes, data=None):
    """Fill missing credit scores with the mean score of each income group.

    For every label in ``income_classes``, the rows whose ``income`` equals
    that label have their missing ``credit_score`` values replaced by the mean
    of the known scores in the same group. The frame is modified in place.

    Parameters
    ----------
    income_classes : iterable of str
        Income group labels to process. Labels that match no rows, or whose
        rows have no known score, are left untouched.
    data : pandas.DataFrame, optional
        Frame to impute. Defaults to the notebook-level ``df`` so existing
        calls of the form ``impute_creditscore(groups)`` keep working.

    Returns
    -------
    None
    """
    if data is None:
        # Backward-compatible default: operate on the notebook's global frame.
        data = df
    for income_class in income_classes:
        # Build the row selector once and reuse it for both the read and the write.
        mask = data["income"] == income_class
        # Series.mean() skips NaN, so this is the mean of the known scores only.
        group_mean = data.loc[mask, "credit_score"].mean()
        if pd.isna(group_mean):
            # Empty group or no known scores: nothing sensible to fill with.
            continue
        data.loc[mask, "credit_score"] = data.loc[mask, "credit_score"].fillna(group_mean)

In [None]:
# Income groups whose missing credit scores should be imputed.
income_groups = [
    'poverty',
    'middle class',
    'working class',
    'upper class',
]
impute_creditscore(income_groups)

# Re-check the number of missing values per column after imputation.
df.isna().sum()

In [None]:
# Average annual mileage per driving-experience group (missing mileages are skipped by mean()).
df.groupby('driving_experience')['annual_mileage'].mean()

In [None]:
# Replace missing annual mileage values with the overall column mean.
mean_mileage = df['annual_mileage'].mean()
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the selected column: the in-place form is chained assignment, which warns in
# recent pandas versions and leaves `df` unchanged when Copy-on-Write is enabled.
df['annual_mileage'] = df['annual_mileage'].fillna(mean_mileage)
# Confirm how many missing values remain in each column.
df.isnull().sum()

In [None]:
# Drop the 'id' and 'postal_code' columns.
# `columns=` names the axis explicitly (equivalent to axis=1).
# Reassigning the result instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-removed columns.
df = df.drop(columns=['id', 'postal_code'], errors='ignore')

In [None]:
# Univariate analysis: looking at a single column (variable) at a time.
# Number of clients in each gender category.
df['gender'].value_counts()

In [None]:
# Univariate analysis: unordered categorical data.
# Bar chart of the number of clients in each gender category.
ax = sns.countplot(x='gender', data=df)
ax.set_title("Number of clients per gender")
ax.set_ylabel("Number of clients")
plt.show()

In [None]:
# Share of clients in each income group, shown as a pie chart.
plt.figure(figsize=(6,6))
data = df['income'].value_counts(normalize=True)
# Take the labels from the Series index so every wedge is named after the group
# it actually represents. value_counts() orders groups by frequency, so a
# hand-written label list silently mislabels wedges whenever that order differs.
labels = data.index.tolist()
colors = sns.color_palette('pastel')
plt.pie(data, labels=labels, colors=colors, autopct='%.0f%%')
plt.title("Proportion of clients per income group")
plt.show()

In [None]:
# Number of clients at each education level.
df['education'].value_counts()

In [None]:
# Univariate analysis: ordered categorical data.
# Bars follow the explicit `order` list rather than frequency order.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    x='education',
    data=df,
    order=['university', 'high school', 'none'],
    color='skyblue',
    ax=ax,
)
ax.set_title("Number of clients per education level")
plt.show()

In [None]:
# Univariate analysis: numerical data (the third type of univariate analysis).
# Numeric columns are usually summarised with statistics such as the mean, mode,
# max, min and standard deviation. describe() reports the count, mean, standard
# deviation, min, quartiles and max for the column.
df['credit_score'].describe()