# Milestone 1 - EDA and Preprocessing data


- Load the Dataset


> Make sure to include markdown-based text commenting and explaining each step you perform.


# 1 - Extraction


Required Libraries for EDA


In [1]:
# importing libraries

# Data manipulation and preprocessing
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

# Cool plotting style
plt.style.use('ggplot')
rcParams['figure.figsize'] = 12, 6

Set an option to display all columns


In [2]:
pd.set_option('display.max_columns', None)

Function to load the dataset


In [3]:
def load_data(data_path):
    """Read the CSV file at ``data_path`` and return it as a DataFrame.

    Args:
        data_path: Path (or any other source accepted by ``pd.read_csv``)
            of the CSV file to load.

    Returns:
        pd.DataFrame: The parsed contents of the file.
    """
    return pd.read_csv(data_path)

In [4]:
data_path = 'data/fintech_data_22_52_14669.csv'
df = load_data(data_path)

# 2- EDA


Showing the first 5 rows of the dataset


In [None]:
df.head()

Showing the last 5 rows of the dataset


In [None]:
df.tail()

Getting the size of the dataset (rows, columns)


In [None]:
df.shape

Getting short summary of the dataset


In [None]:
df.info()

Getting statistical summary of the dataset


In [None]:
df.describe()

Counting the missing values in each column, then showing the correlation between the numeric columns


In [None]:
df.isnull().sum()

In [None]:
# Correlation is only defined for numeric data. The frame also holds text
# columns, which make DataFrame.corr() raise on pandas >= 2.0, so restrict
# the computation to the numeric/boolean columns first.
df.select_dtypes(include=['number', 'bool']).corr()

#### Functions to plot the data for visualization


In [12]:
def plot_correlation_matrix(df: pd.DataFrame, figsize: tuple):
    """Draw an annotated heatmap of the pairwise correlations in ``df``.

    Only numeric and boolean columns take part in the computation:
    correlation is undefined for text columns, and pandas >= 2.0 raises on
    them instead of silently dropping them.

    Args:
        df: Frame whose numeric columns are correlated.
        figsize: (width, height) of the figure in inches.
    """
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    plt.figure(figsize=figsize)
    sns.heatmap(numeric_df.corr(),
                cbar=True,
                annot=True,
                square=True,
                cmap='Spectral_r',
                fmt='.2f',
                linewidths=2,
                annot_kws={'size': 15})
    plt.show()

def plot_histogram(df: pd.DataFrame, column_name: str, kde, figsize: tuple):
    """Plot the distribution of one column as a histogram.

    Args:
        df: Frame holding the data.
        column_name: Column whose values are binned.
        kde: Forwarded to ``sns.histplot`` (whether to overlay a density curve).
        figsize: (width, height) of the figure in inches.
    """
    _, ax = plt.subplots(figsize=figsize)
    sns.histplot(df[column_name], kde=kde, ax=ax)
    ax.set_title(f'Distribution of {column_name}')
    plt.show()

def plot_boxplot_single_column(df: pd.DataFrame, column_name: str, figsize: tuple):
    """Draw a horizontal boxplot of a single column.

    Args:
        df: Frame holding the data.
        column_name: Column placed on the x axis.
        figsize: (width, height) of the figure in inches.
    """
    _, ax = plt.subplots(figsize=figsize)
    sns.boxplot(x=column_name, data=df, ax=ax)
    ax.set_title(f'Boxplot of {column_name}')
    plt.show()

def plot_boxplot_multiple_columns(df: pd.DataFrame,
                                  column_name1:str,
                                  column_name2: str,
                                  title: str,
                                  x_label: str,
                                  y_label: str,
                                  figsize: tuple):
    """Draw one boxplot of ``column_name2`` per value of ``column_name1``.

    Args:
        df: Frame holding the data.
        column_name1: Grouping column, placed on the x axis.
        column_name2: Column whose distribution is drawn on the y axis.
        title: Figure title.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        figsize: (width, height) of the figure in inches.
    """
    _, ax = plt.subplots(figsize=figsize)
    sns.boxplot(x=column_name1, y=column_name2, data=df, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    plt.show()

def plot_vertical_countplot(df: pd.DataFrame, column_name: str, figsize: tuple):
    """Draw vertical bars counting the occurrences of each value of a column.

    Args:
        df: Frame holding the data.
        column_name: Column whose values are counted (x axis).
        figsize: (width, height) of the figure in inches.
    """
    _, ax = plt.subplots(figsize=figsize)
    sns.countplot(x=column_name, data=df, ax=ax)
    ax.set_title(f'Countplot of {column_name}')
    plt.show()

def plot_horizontal_countplot(df: pd.DataFrame, column_name: str, figsize: tuple):
    """Draw horizontal count bars for a column, most frequent value first.

    Args:
        df: Frame holding the data.
        column_name: Column whose values are counted (y axis).
        figsize: (width, height) of the figure in inches.
    """
    # value_counts() is sorted by descending frequency, which fixes the bar order.
    bar_order = df[column_name].value_counts().index
    _, ax = plt.subplots(figsize=figsize)
    sns.countplot(y=column_name, data=df, order=bar_order, ax=ax)
    ax.set_title(f'Countplot of {column_name}')
    plt.show()

def plot_countplot_multiple_columns(df: pd.DataFrame,
                                  column_name1:str,
                                  column_name2: str,
                                  title: str,
                                  x_label: str,
                                  y_label: str,
                                  figsize: tuple):
    """Draw grouped count bars of ``column_name1`` split by ``column_name2``.

    Args:
        df: Frame holding the data.
        column_name1: Column whose values are counted (x axis).
        column_name2: Column used as hue; also used as the legend title.
        title: Figure title.
        x_label: Label of the x axis.
        y_label: Label of the y axis.
        figsize: (width, height) of the figure in inches.
    """
    plt.figure(figsize=figsize)
    sns.countplot(data=df, x=column_name1, hue=column_name2, palette='Set2')
    plt.title(title)
    # Honour the labels supplied by the caller instead of hard-coded ones.
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend(title=column_name2)
    plt.xticks(rotation=45)
    plt.show()


def plot_top_n_barplot(df: pd.Series, n: int, title: str, figsize: tuple):
    """Draw a horizontal bar chart of the first ``n`` entries of a Series.

    Despite its name, ``df`` must be a Series (for example the result of
    ``value_counts()``) that is already in display order: its values become
    the bar lengths and its index labels the categories.

    Args:
        df: Series of counts indexed by category.
        n: Number of leading entries to show.
        title: Text appended to ``'Top {n} '`` in the figure title.
        figsize: (width, height) of the figure in inches.
    """
    top_entries = df.head(n)
    plt.figure(figsize=figsize)
    sns.barplot(x=top_entries.values, y=top_entries.index)
    plt.title(f'Top {n} {title}')
    plt.xlabel('Count')
    plt.ylabel('Category')
    plt.show()

def plot_scatterplot(df: pd.DataFrame, x:str, y:str, figsize: tuple):
    """Scatter one column against another.

    Args:
        df: Frame holding the data.
        x: Column placed on the x axis.
        y: Column placed on the y axis.
        figsize: (width, height) of the figure in inches.
    """
    _, ax = plt.subplots(figsize=figsize)
    sns.scatterplot(x=x, y=y, data=df, ax=ax)
    ax.set_title(f'{x} vs {y}')
    plt.show()

def plot_scatter_average(df: pd.DataFrame, columnName1: str, columnName2: str, figsize: tuple):
    """Scatter the mean of ``columnName2`` for every value of ``columnName1``.

    Args:
        df: Frame holding the data.
        columnName1: Grouping column (x axis).
        columnName2: Column averaged within each group (y axis).
        figsize: (width, height) of the figure in inches.
    """
    group_means = df.groupby(columnName1)[columnName2].mean()
    _, ax = plt.subplots(figsize=figsize)
    ax.set_xlabel(columnName1)
    ax.set_ylabel(columnName2)
    ax.set_title(columnName2+' VS. '+columnName1)
    ax.scatter(group_means.index, group_means)
    plt.show()

Showing Relationship between features


In [None]:
sns.pairplot(df)

Studying correlation between features


In [None]:
plot_correlation_matrix(df, (12, 8))

- Q1: What is the distribution of the loan amount among customers?

Answer: The distribution of loan amount among customers is right-skewed, with most customers having a loan amount of around 10000. We can observe that most loans fall within a specific range indicating that customers tend to borrow moderate amounts.


In [None]:
plot_histogram(df, column_name='Loan Amount', kde=True, figsize=(12, 6))

In [None]:
plot_boxplot_single_column(df, column_name='Loan Amount', figsize=(12, 6))

- Q2: What are the top 10 employment titles among the customers who take out a loan?

Answer: We can observe that the top 10 employee titles that issue loans are Teacher, Manager, Owner, Registered Nurse, Driver, Supervisor, Sales, Project Manager and Office Manager.


In [None]:
plot_top_n_barplot(df['Emp Title'].value_counts(), 10, 'Employment Title', figsize=(12, 6))

- Q3: What is the distribution of the number of years of employment among customers?

Answer: We can observe that customers with 10 years of employment are the most common group among the customers who take out a loan.


In [None]:
plot_vertical_countplot(df, 'Emp Length', figsize=(12, 6))

- Q4: What is the distribution of the loan amount among customers with different employment lengths?

Answer: Loan amounts do not vary significantly by employment length, but there is a slight increase in loan amounts for customers with longer employment. Customers with very short employment lengths also have smaller loan amounts on average.


In [None]:
plot_boxplot_multiple_columns(df, 'Emp Length', 'Loan Amount', 'Loan Amount Distribution by Employment Length', 'Employment Length', 'Loan Amount', (12, 6))

- Q5: What is the relationship between the annual income and loan amount?

Answer: We can observe that there seems to be a positive trend, higher-income individuals generally take larger loans, yet the correlation is not strictly linear, suggesting that other factors may also influence loan amount.


In [None]:
# relationship between annual income and loan amount
plot_scatterplot(df, 'Annual Inc', 'Loan Amount', figsize=(12, 6))

- Q6: Are most of the customer's income and employment verified?

Answer: We can observe that there is a high percentage of customers whose income and employment are not verified. This could be a potential risk that these customers may not be able to repay the loan.


In [None]:
plot_vertical_countplot(df, 'Verification Status', figsize=(12, 6))

- Q7: What is the geographical distribution of customers based on state?

Answer: We can observe that the top 5 states with the highest number of borrowers are California, Texas, New York, Florida, and Illinois.


In [None]:
plot_horizontal_countplot(df, 'Addr State', figsize=(12, 12))


- Q8: What is the distribution of the loan status

Answer: We can observe that most loans are current, followed by fully paid. This indicates that most customers are able to repay their loans(i.e most of the loans are either actively being repaid or have been fully repaid)


In [None]:
plot_vertical_countplot(df, 'Loan Status', figsize=(12, 6))

- Q9: What is the distribution of the loan status based on the loan grade?

Answer: We can observe that most loans are grade B, followed by grade C. Most of the loans are current, followed by fully paid. This indicates that most customers are able to repay their loans(i.e most of the loans are either actively being repaid or have been fully repaid)


In [None]:
plot_countplot_multiple_columns(df, 'Grade', 'Loan Status', 'Loan Status by Grade', 'Grade', 'Count', (12, 6))


- Q10: Is the loan amount affected by the loan grade?

Answer: We can observe that classes E,F,G have higher loan amounts compared to other classes. This indicates that customers with lower loan grades tend to borrow more money.


In [None]:
plot_scatter_average(df, 'Grade','Loan Amount', (12, 6))

- Q11: Is the loan amount the same as the funded amount?

Answer: We can observe that the loan amount is almost the same as the funded amount. We can infer that the funded amount is the amount that the customer actually receives.


In [None]:
plot_scatterplot(df, 'Loan Amount', 'Funded Amount', figsize=(12, 6))

- Q12: Is there a relationship between the loan grade and the interest rate?

Answer: We can observe that higher grade such as (A, B, C) have lower interest rates compared to lower grades such as (D, E, F, G). This indicates that customers with higher grades are less risky and are charged lower interest rates.


In [None]:
plot_boxplot_multiple_columns(df, 'Grade', 'Int Rate', 'Interest Rate Distribution by Grade', 'Grade', 'Interest Rate', (12, 6))

In [None]:
plot_scatterplot(df, 'Grade', 'Int Rate', (12, 6))

- Q13: What is the most common purpose for taking a loan?

Answer: We can observe that the most common purpose for taking a loan is debt consolidation, followed by credit card. This indicates that most customers take loans to consolidate their debts.


In [None]:
plot_horizontal_countplot(df, 'Purpose', (12, 8))

### Observations:

- The distribution of loan amount among customers is right-skewed, with most customers having a loan amount of around 10000. We can observe that most loans fall within a specific range indicating that customers tend to borrow moderate amounts.

- The top 10 employee titles that issue loans are Teacher, Manager, Owner, Registered Nurse, Driver, Supervisor, Sales, Project Manager, and Office Manager.

- Employees with 10 years of employment are the most common among customers.

- Loan amounts do not vary significantly by employment length, but there is a slight increase in loan amounts for customers with longer employment. Customers with very short employment lengths also have smaller loan amounts on average.

- There seems to be a positive trend between annual income and loan amount, higher-income individuals generally take larger loans, yet the correlation is not strictly linear, suggesting that other factors may also influence loan amount.

- There is a high percentage of customers whose income and employment are not verified. This could be a potential risk that these customers may not be able to repay the loan.

- The top 5 states with the highest number of borrowers are California, Texas, New York, Florida, and Illinois.

- Most loans are current, followed by fully paid. This indicates that most customers are able to repay their loans(i.e most of the loans are either actively being repaid or have been fully repaid)

- Most loans are grade B, followed by grade C. Most of the loans are current, followed by fully paid.

- Classes E,F,G have higher loan amounts compared to other classes. This indicates that customers with lower loan grades tend to borrow more money.

- The loan amount is almost the same as the funded amount. We can infer that the funded amount is the amount that the customer actually receives.

- Higher grade such as (A, B, C) have lower interest rates compared to lower grades such as (D, E, F, G). This indicates that customers with higher grades are less risky and are charged lower interest rates.

- The most common purpose for taking a loan is debt consolidation, followed by credit card. This indicates that most customers take loans to consolidate their debts.


# 3 - Cleaning Data


In [None]:
df.columns

## Tidying up column names


I have renamed the columns to make them more readable and consistent, by converting them to lowercase and replacing spaces with underscores and removing special characters.


In [31]:
def clean_column_name(column_name: str):
    """Normalise a single column name to a snake_case-like identifier.

    The name is lower-cased, stripped of surrounding whitespace, its spaces
    are turned into underscores, and every character that is neither
    alphanumeric nor an underscore is dropped.

    Args:
        column_name: Raw column name.

    Returns:
        str: The cleaned column name.
    """
    snake_cased = column_name.lower().strip().replace(' ', '_')
    kept_chars = [char for char in snake_cased if char == '_' or char.isalnum()]
    return ''.join(kept_chars)

def clean_column_names(df: pd.DataFrame):
    """Clean every column name of ``df`` with ``clean_column_name``.

    The frame is modified in place (its ``columns`` attribute is replaced)
    and the same object is returned for convenience.

    Args:
        df: Frame whose columns are renamed.

    Returns:
        pd.DataFrame: The same frame, with cleaned column names.
    """
    df.columns = list(map(clean_column_name, df.columns))
    return df

df = clean_column_names(df)

In [None]:
df.columns

## Choose a suitable column index


I have created a function that returns the candidate columns for the index based on the uniqueness of the values of the columns. I have selected the column 'loan_id' as the index since it is unique and can be used to identify each row.


In [None]:
def index_feature_candidates(df: pd.DataFrame):
    """Report, per column, how many rows exceed its number of distinct values.

    A result of 0 means every row has a different (non-null) value, which
    makes the column a candidate for the index.

    Args:
        df: Frame to inspect.

    Returns:
        pd.Series: Row count minus ``nunique()`` for each column.
    """
    total_rows = len(df)
    distinct_per_column = df.nunique()
    return total_rows - distinct_per_column

index_feature_candidates(df)

We can see that we have two features that can be used as index so I will use the loan_id as it is more readable than customer id


In [None]:
# NOTE(review): set_index returns a new DataFrame and the result is not
# assigned, so this cell only displays the re-indexed view; df itself keeps
# loan_id as a regular column (later cells still reference it by name).
df.set_index('loan_id')

## Observe inconsistent data


Check whether there are any duplicate rows in the dataset


In [35]:
def summarize_column_values(df: pd.DataFrame, columns: list):
    """
    Summarizes the values of specified columns in a DataFrame.

    Column names that do not exist in ``df`` are skipped silently.

    Args:
    df (pd.DataFrame): The DataFrame to summarize.
    columns (list): List of column names to summarize.

    Returns:
    list: One dict per existing column with the keys 'Column',
          'Total Values', 'Unique Values' and 'Value Counts' (a mapping of
          each distinct value, missing values included, to its count).
    """
    def _describe(column):
        # dropna=False so that missing values show up as their own entry
        counts = df[column].value_counts(dropna=False)
        return {
            'Column': column,
            'Total Values': df[column].size,
            'Unique Values': counts.size,
            'Value Counts': counts.to_dict()
        }

    return [_describe(column) for column in columns if column in df.columns]

In [None]:
def check_duplicate_rows(df: pd.DataFrame):
    """Count the rows that repeat an earlier row across all columns.

    Args:
        df: Frame to inspect.

    Returns:
        The number of rows flagged by ``DataFrame.duplicated()``.
    """
    duplicate_mask = df.duplicated()
    return duplicate_mask.sum()

check_duplicate_rows(df)

Check if there exists any duplicate rows without considering the loan_id and customer_id as both uniquely identify each row


In [None]:
def check_duplicate_rows_without_unique_columns(df: pd.DataFrame, columns: list):
    """Return the rows that are duplicated once ``columns`` are ignored.

    Args:
        df: Frame to inspect; it is not modified.
        columns: Columns (typically unique identifiers) left out of the
            comparison.

    Returns:
        pd.DataFrame: Every row of ``df`` (all columns included) that has at
        least one twin on the remaining columns.
    """
    remaining = df.drop(columns=columns)
    # keep=False flags every member of a duplicate group, not just the repeats
    duplicate_mask = remaining.duplicated(keep=False)
    return df.loc[duplicate_mask]

check_duplicate_rows_without_unique_columns(df, ['loan_id', 'customer_id']).head()

There are no duplicate rows in the dataset


In [None]:
df.dtypes

In [None]:
# Expected dtype name for every cleaned column, used as the default reference
# by check_column_data_types.
ExpectedDataTypes = {
    "customer_id": "object",
    "emp_title": "object",
    "emp_length": "object",
    "home_ownership": "object",
    "annual_inc": "float64",
    "annual_inc_joint": "float64",
    "verification_status": "object",
    "zip_code": "object",
    "addr_state": "object",
    "avg_cur_bal": "float64",
    "tot_cur_bal": "float64",
    "loan_id": "int64",
    "loan_status": "object",
    "loan_amount": "float64",
    "state": "object",
    "funded_amount": "float64",
    "term": "object",
    "int_rate": "float64",
    "grade": "int64",
    "issue_date": "object",
    "pymnt_plan": "bool",
    "type": "object",
    "purpose": "object",
    "description": "object"
}

def check_column_data_types(df: pd.DataFrame, expected_data_types=ExpectedDataTypes):
    """Print, for every column of ``df``, whether its dtype is the expected one.

    Args:
        df: Frame whose column dtypes are checked.
        expected_data_types: Mapping of column name to expected dtype name.
            Columns missing from the mapping are reported as failures.
    """
    for column in df.columns:
        actual_type = df[column].dtype
        expected_type = expected_data_types.get(column)

        if expected_type is None:
            print(f'Failure: No expected data type for column {column}')
            continue

        if actual_type != expected_type:
            print(f'Failure :Column {column} has data type {actual_type} but expected {expected_type}')
            continue

        print(f'Column {column} has expected data type {expected_type}')


check_column_data_types(df)

Check for negative values in the dataset as columns that are numeric such as annual_inc, annual_inc_joint, avg_cur_bal, tot_cur_bal, loan_id, loan_amount, funded_amount, int_rate, and grade should not have negative values.


In [40]:
def check_negative_numbers_in_numeric_columns(df: pd.DataFrame):
    """Print the number of negative values of each int64/float64 column.

    Columns of any other dtype are ignored, and columns without negative
    values produce no output.

    Args:
        df: Frame to inspect.
    """
    numeric_dtypes = ['int64', 'float64']
    for column in df.columns:
        series = df[column]
        if series.dtype not in numeric_dtypes:
            continue
        negative_count = series.lt(0).sum()
        if negative_count > 0:
            print(f'Column {column} has {negative_count} negative values.')

check_negative_numbers_in_numeric_columns(df)

There are no negative values in the dataset


Check if the columns of type object hold any numeric values


In [None]:
def check_numeric_in_object_columns(df: pd.DataFrame):
    """Find object-dtype columns that contain purely numeric strings.

    For every such column the number of numeric strings and the column name
    are printed.

    Args:
        df: Frame to inspect.

    Returns:
        list: Names of the object columns holding at least one numeric string.
    """
    flagged_columns = []
    for column in df.columns:
        is_object_column = df[column].dtype == 'object'
        if not is_object_column:
            continue
        numeric_count = df[column].str.isnumeric().sum()
        if numeric_count > 0:
            print(f'Column {column} has {numeric_count} numeric values.')
            print(column)
            flagged_columns.append(column)

    return flagged_columns

check_numeric_in_object_columns(df)

In [None]:
"Ask shown below, the emp_length column has numeric values"
# Display the rows whose emp_title consists only of digits. Rows with a
# missing title are dropped first so the boolean mask below contains no NaN.
test = df.dropna(subset=['emp_title'])
test[test['emp_title'].str.isnumeric()]

Getting the value counts of the columns of type object to detect any irrelevant or incorrect data, different spelling with the same meaning


In [43]:
def get_categorical_columns(df: pd.DataFrame):
    """Collect the value counts of every object or bool column.

    The identifier columns 'loan_id' and 'customer_id' are skipped.

    Args:
        df: Frame to inspect.

    Returns:
        dict: Column name mapped to the Series returned by ``value_counts()``.
    """
    identifier_columns = ['loan_id', 'customer_id']

    def _is_categorical(column):
        column_dtype = df[column].dtype
        return column_dtype == 'object' or column_dtype == 'bool'

    return {
        column: df[column].value_counts()
        for column in df.columns
        if column not in identifier_columns and _is_categorical(column)
    }

In [44]:
pd.set_option('display.max_rows', None)

In [None]:
columns_to_standardize = ['emp_title', 'home_ownership', 'verification_status','type']

summarize_column_values(df, columns_to_standardize)

Function that makes the values of each column consistent by lowercasing them and capitalizing the first letter of each word


In [46]:
def standardize_values_proper_case(df: pd.DataFrame, columns: list):
    """Rewrite the values of ``columns`` in proper case ('Like This').

    Every non-null value is converted to a string, split on whitespace, each
    word is capitalized, and the words are re-joined with single spaces.
    Null values are left untouched. The frame is modified in place.

    Args:
        df: Frame to update.
        columns: Names of the columns to standardize.

    Returns:
        pd.DataFrame: The same frame, after the update.
    """
    def _to_proper_case(value):
        if pd.notnull(value):
            return ' '.join(token.capitalize() for token in str(value).split())
        return value

    for column in columns:
        df[column] = df[column].apply(_to_proper_case)

    return df

columns_to_standardize = ['emp_title', 'home_ownership', 'verification_status', 'type']

df = standardize_values_proper_case(df, columns_to_standardize)

In [None]:
summarize_column_values(df, columns_to_standardize)

As we can see, all the columns now have consistent values, where the first letter of each word is capitalized and the rest are lowercase


In [None]:
df.home_ownership.value_counts()

In [None]:
df.verification_status.value_counts()

Observe if we can merge both source verified and verified into one class


Observing the 3 classes among the grades of the customer


In [None]:
# Count the verification statuses within each grade; the x column must be
# 'grade' to match the title and axis label of this plot.
plot_countplot_multiple_columns(df, 'grade', 'verification_status', 'Relationship Between Verification Status and Grade', 'Grade', 'Count', (12, 6))

In [None]:
plot_boxplot_multiple_columns(df, 'verification_status', 'int_rate', '', 'verification_status', 'int_rate', (12, 6))

In [None]:
plot_countplot_multiple_columns(df, 'loan_status', 'verification_status', 'Relationship Between Verification Status and loan_status', 'loan_status', 'Count', (12, 6))

In [None]:
df.loan_status.value_counts()

In [None]:
df.grade.value_counts().sort_index()

In [None]:
summarize_column_values(df, ['type'])

As we can see in the column 'type' Joint App and Joint are the same so I will replace Joint App with Joint and Direct_pay with Direct Pay


In [56]:
def change_column_values_to_mapped_values(df: pd.DataFrame, column: str, mapping_dict: dict):
    """Replace the values of ``column`` according to ``mapping_dict``.

    Values that are not keys of the mapping (missing values included) are
    kept unchanged rather than being turned into NaN, so an incomplete
    mapping cannot silently erase data. The frame is modified in place.

    Args:
        df: Frame to update.
        column: Name of the column to remap.
        mapping_dict: Mapping of current value to replacement value.

    Returns:
        pd.DataFrame: The same frame, after the update.
    """
    # Series.map(dict) would yield NaN for every key it does not know, so
    # fall back to the original value explicitly.
    df[column] = df[column].map(lambda value: mapping_dict.get(value, value))
    return df

type_column_map = {
    "Individual": "Individual",
    "Joint": "Joint",
    "Joint App": "Joint",
    "Direct_pay": "Direct Pay"
}

df = change_column_values_to_mapped_values(df, 'type', type_column_map)

In [None]:
summarize_column_values(df, ['type'])

In [None]:
summarize_column_values(df, ['issue_date'])

For the issue date, we need to check that every value is a valid calendar date (e.g. a valid number of days for its month and year)


In [None]:
def validate_date_values(df: pd.DataFrame, column: str):
    """List the values of ``column`` that cannot be parsed as a date.

    Each value is passed to ``pd.to_datetime`` on its own; values for which
    the conversion raises are collected. Values that convert without raising
    are not reported.

    Args:
        df: Frame holding the column.
        column: Name of the column to validate.

    Returns:
        list: ``(position, value)`` tuples, where ``position`` is the
        0-based row position (not the index label) of the offending value.
    """
    invalid_dates = []
    for idx, date in enumerate(df[column]):
        try:
            pd.to_datetime(date)
        except (ValueError, TypeError, OverflowError):
            # Only parsing failures count as invalid dates; a bare except
            # would also swallow KeyboardInterrupt and SystemExit.
            invalid_dates.append((idx, date))

    return invalid_dates

validate_date_values(df, 'issue_date')

In [None]:
df.nunique()

As we can see, all values of the issue date are valid. We looped over the values of the issue date and tried to cast each one to datetime; if the cast fails, the position and the value are appended to the invalid_dates list.


## Findings and conclusions


## Observing Missing Data


In [None]:
def get_null_count_and_percentage(df: pd.DataFrame, percentage=False):
    """
    Summarizes the missing values of every column that has at least one.

    Args:
    df (pd.DataFrame): The DataFrame to analyze.
    percentage (bool): If True, the counts are expressed as a percentage of
                       the number of rows. Defaults to False.

    Returns:
    pd.Series: Missing-value count (or percentage) per column; columns
               without missing values are left out.
    """
    null_counts = df.isnull().sum()
    null_counts = null_counts.loc[null_counts > 0]

    if not percentage:
        return null_counts
    return null_counts / len(df) * 100

get_null_count_and_percentage(df), get_null_count_and_percentage(df, percentage=True)    

In [None]:
def check_for_non_standard_missing_values(df: pd.DataFrame, columns: list):
    """
    Checks for non-standard missing values in the specified columns of a dataframe.

    A value counts as a non-standard missing value when its string form,
    compared case-insensitively, equals one of the placeholder tokens below.
    Genuine missing values (NaN/None) are excluded: they are already
    standard, and would otherwise match 'nan'/'none' once cast to string.

    Args:
    df (pd.DataFrame): The dataframe to check for non-standard missing values.
    columns (list): List of column names to check for non-standard missing values.

    Returns:
    dict: A dictionary where each key is a column name, and each value is a dictionary containing:
          - 'values': A list of unique non-standard missing values found.
          - 'count': The count of occurrences for non-standard missing values in that column.
          Columns without any match are left out.
    """
    # Define the non-standard missing values to search for
    non_standard_missing_values = ["na", "n/a", "missing", "none", "nan", "null", "nil"]

    # Store found values and counts in a dictionary
    missing_values_dict = {}

    for column in columns:
        series = df[column]
        # Lower-case once per column, then test all tokens in a single pass
        lowered = series.astype(str).str.lower()
        placeholder_mask = series.notna() & lowered.isin(non_standard_missing_values)

        if placeholder_mask.any():
            missing_values_dict[column] = {
                'values': series[placeholder_mask].unique().tolist(),
                'count': int(placeholder_mask.sum()),
            }

    return missing_values_dict

check_for_non_standard_missing_values(df, df.columns)


In [None]:
get_null_count_and_percentage(df).sort_values(ascending=False)

We have 5 columns with missing data which are annual_inc_joint, emp_title, emp_length, int_rate, and description. We will handle the missing data in the next step.


In [239]:
# Function to plot missing data heatmap
def plot_missing_data_heatmap(df: pd.DataFrame):
    """
    Plots a heatmap of the missing-value mask of the dataframe.

    Every cell of ``df.isnull()`` is drawn, so missing entries stand out as
    a different colour from present ones.

    Parameters:
    df (pd.DataFrame): Dataframe to analyze.
    """
    missing_mask = df.isnull()
    _, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(missing_mask, cbar=False, cmap='viridis', yticklabels=False, ax=ax)
    ax.set_title('Missing Data Heatmap')
    ax.set_xlabel('Columns')
    ax.set_ylabel('Rows')
    plt.show()

def draw_correlation_map(df: pd.DataFrame, values_interest: list):
    """
    Plots an annotated heatmap of the correlations between selected columns.

    Parameters:
    df (pd.DataFrame): Dataframe to analyze.
    values_interest (list): Names of the columns to correlate with each other.
    """
    correlation_matrix = df[values_interest].corr()

    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)

    # Axis labels: name the x axis, blank out the y axis
    ax.set_xlabel('Variables')
    ax.set_ylabel('')

    plt.show()

# Function to analyze correlation of missing data with other attributes
def analyze_missing_correlation(df: pd.DataFrame, target_column: str):
    """
    Analyzes the correlation between the missing status of the target column and the numeric columns of the dataframe.

    Only numeric and boolean columns are correlated: correlation is
    undefined for text columns, and pandas >= 2.0 raises on them instead of
    silently dropping them. The target column itself may be of any dtype.

    Parameters:
    df (pd.DataFrame): Dataframe to analyze; it is not modified.
    target_column (str): Column name to analyze the missing data correlation.

    Returns:
    pd.Series: Correlation of each numeric column with the missing indicator
               of the target column, sorted in descending order.
    """
    indicator_name = target_column + '_missing'

    # Work on a copy of the numeric part and add the 0/1 missing indicator
    df_missing_indicator = df.select_dtypes(include=['number', 'bool']).copy()
    df_missing_indicator[indicator_name] = df[target_column].isnull().astype(int)

    # Calculate correlations with the missing indicator
    missing_correlation = df_missing_indicator.corr()[indicator_name].sort_values(ascending=False)

    # Plot the correlation heatmap with missing indicator
    plt.figure(figsize=(10, 6))
    sns.heatmap(missing_correlation.to_frame(), annot=True, cmap='coolwarm', cbar=True)
    plt.title(f'Correlation of {target_column} Missing Indicator with Other Features')
    plt.show()

    return missing_correlation

def plot_missing_vs_categoricals(df: pd.DataFrame, target_column: str, cat_columns: list):
    """
    Plots, one row per categorical column, the share of missing values of the target column within each category.

    Parameters:
    df (pd.DataFrame): DataFrame to analyze; it is not modified.
    target_column (str): Column with missing values to analyze.
    cat_columns (list): List of categorical columns to compare the missing values with.
    """
    # 0/1 indicator of missingness; its mean per category is the missing share
    indicator_name = target_column + '_missing'
    df_missing = df.assign(**{indicator_name: df[target_column].isnull().astype(int)})

    num_cols = len(cat_columns)

    # One stacked row per categorical column, 5 inches tall each
    plt.figure(figsize=(10, 5 * num_cols))

    for position, cat_column in enumerate(cat_columns, start=1):
        plt.subplot(num_cols, 1, position)
        # NOTE(review): newer seaborn releases replace `ci` with `errorbar` -
        # confirm against the installed version.
        sns.barplot(x=cat_column, y=indicator_name, data=df_missing, ci=None)
        plt.title(f'Relationship between {target_column} Missing Indicator and {cat_column}')
        plt.ylabel(f'Proportion of Missing in {target_column}')
        plt.xlabel(cat_column)
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

def plot_categorical_vs_categorical(df: pd.DataFrame, cat_column1: str, cat_column2: str):
    """
    Plots the relationship between two categorical attributes.

    Two panels are drawn side by side: a grouped count plot on the left and
    a stacked bar plot of the same counts on the right.

    Parameters:
    df (pd.DataFrame): DataFrame to analyze.
    cat_column1 (str): First categorical column to analyze.
    cat_column2 (str): Second categorical column to compare with.
    """
    plt.figure(figsize=(14, 6))

    # Count plot
    # Left panel; the pyplot calls below act on this current axes
    plt.subplot(1, 2, 1)
    sns.countplot(x=cat_column1, hue=cat_column2, data=df, palette="viridis")
    plt.title(f'Count of {cat_column1} by {cat_column2}')
    plt.ylabel('Count')
    plt.xlabel(cat_column1)
    plt.xticks(rotation=45)
    plt.legend(title=cat_column2)

    # Stacked Bar Plot
    # Rows are cat_column1 values, columns are cat_column2 values; absent pairs count as 0
    counts = df.groupby([cat_column1, cat_column2]).size().unstack(fill_value=0)
    # Creating the right panel makes it the current axes for the calls that follow
    counts.plot(kind='bar', stacked=True, ax=plt.subplot(1, 2, 2), colormap='viridis')
    plt.title(f'Stacked Bar Plot of {cat_column1} by {cat_column2}')
    plt.ylabel('Count')
    plt.xlabel(cat_column1)
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

def plot_correlations(df, target_column, reference_columns):
    """
    Plots the relationship between a target column and multiple reference columns.

    One panel is drawn per reference column: a scatter plot when the
    reference column is numeric, a box plot when it is categorical or of
    object dtype. Columns of any other dtype are reported and skipped,
    which leaves their panel empty.

    Parameters:
        df (pd.DataFrame): The data frame to analyze.
        target_column (str): The column plotted on the y axis of every panel.
        reference_columns (list): A list of columns to compare with the target column.

    Returns:
        None: Shows the plots.
    """
    # Set up the matplotlib figure: 5 inches of height per panel
    plt.figure(figsize=(15, 5 * len(reference_columns)))

    # Iterate over each reference column to plot its panel
    for i, ref_col in enumerate(reference_columns, start=1):
        plt.subplot(len(reference_columns), 1, i)
        ref_dtype = df[ref_col].dtype

        if pd.api.types.is_numeric_dtype(ref_dtype):
            # Scatter plot for numeric reference columns
            sns.scatterplot(data=df, x=ref_col, y=target_column)
            plt.title(f'Scatter Plot: {target_column} vs {ref_col}')
            plt.xlabel(ref_col)
            plt.ylabel(target_column)

        # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
        elif isinstance(ref_dtype, pd.CategoricalDtype) or ref_dtype == 'object':
            # Box plot for categorical reference columns
            sns.boxplot(data=df, x=ref_col, y=target_column)
            plt.title(f'Box Plot: {target_column} vs {ref_col}')
            plt.xlabel(ref_col)
            plt.ylabel(target_column)
            plt.xticks(rotation=45)

        else:
            print(f"Column {ref_col} has unsupported data type for plotting. Skipping...")

    plt.tight_layout()
    plt.show()

In [213]:
def visualize_cols_values_cnt(dataset, col_name, percentage=False, title=None):
    """
    Draws a bar chart of how often each value occurs in a column.

    Missing values are kept and counted as their own category.

    Parameters:
        dataset (pd.DataFrame): The data frame holding the column.
        col_name (str): The column whose values are counted.
        percentage (bool): When True, each bar shows the value's share of all
            rows (0-100) instead of its raw count.
        title (str, optional): Title of the plot.

    Returns:
        None: Draws the bar chart.
    """
    frequencies = dataset[col_name].value_counts(normalize=percentage, dropna=False)
    if percentage:
        # normalize=True yields fractions; scale them to percentages.
        frequencies = frequencies * 100

    # Label the y-axis according to what is actually plotted.
    y_label = 'percentage' if percentage else 'count'
    frequencies.plot(kind='bar', xlabel=col_name, ylabel=y_label, title=title)

In [None]:
plot_missing_data_heatmap(df)


#### Let's start by investigating the missing data in the annual_inc_joint column


In [None]:
# Number of missing annual_inc_joint values within each loan type:
# flag the missing rows first, then total the flags per type.
df['annual_inc_joint'].isna().groupby(df['type']).sum()

There are no missing values in the annual_inc_joint column for loans whose type is joint.


In [None]:
# Number of loans of each type that has no joint applicant
individual_count = df.loc[df['type'] == 'Individual', 'type'].count()
direct_pay_count = df.loc[df['type'] == 'Direct Pay', 'type'].count()

# Number of loans whose annual_inc_joint is missing
missing_joint_income_count = df.loc[df['annual_inc_joint'].isna(), 'type'].count()

# True when the missing rows are as many as the Individual + Direct Pay loans
individual_count + direct_pay_count == missing_joint_income_count


In [None]:
df['annual_inc_joint'].describe()

Let's talk about the loan types in the dataset. We have 3 types of loans which are:

#### 1. Individual Loan

**Definition:**  
An individual loan is taken out by a single borrower, who is solely responsible for repaying the loan.

---

#### 2. Joint Loan

**Definition:**  
A joint loan is applied for and signed by two or more borrowers, such as spouses or business partners. All individuals on the loan are responsible for repaying it.

---

#### 3. Direct Pay Loan

**Definition:**  
In a direct pay loan, the lender pays the funds directly to the institution or organization on behalf of the borrower, rather than disbursing the funds to the borrower.

---

#### We have observed that the annual_inc_joint column is missing for all individual and direct_pay loans. This is because these types of loans do not have a joint applicant.


### **Annual Inc Joint Column**: This data is missing not at random (MNAR) as the missing values are dependent on the loan type.

- We can fill the missing values in the annual_inc_joint column with 0 for individual and direct_pay loans.
- Here 0 acts as a sentinel value indicating that the loan type is individual or direct_pay, i.e. the loan has no joint applicant.


#### Let's investigate the missing data in the emp_title column


In [None]:
df['emp_title'].isna().sum(), df['emp_title'].isna().mean()*100

- The number of missing values in the emp_title column is 2376.
- The percentage of missing values in the emp_title column is 8.8%.


In [None]:
df[df['emp_title'].isna()].head(10)

In [None]:
df[df['emp_title'].isna()].tail(10)

In [None]:
df['emp_title'].isna().groupby(df['emp_length']).mean()*100

In [None]:
df[df['emp_title'].isna()].groupby('emp_length')['emp_length'].count()

In [None]:
def _most_frequent_or_none(values):
    """Return the first mode of `values`, or None when `values` has no mode."""
    modes = values.mode()
    return None if modes.empty else modes.iloc[0]


# Most common job title for each employment length
mode_emp_title_by_length = df.groupby('emp_length')['emp_title'].agg(_most_frequent_or_none)
mode_emp_title_by_length

In [None]:
df[df['emp_length'] == '< 1 year'].shape

In [None]:
plot_missing_vs_categoricals(df, 'emp_title', ['emp_length', 'home_ownership', 'verification_status', 'addr_state', 'loan_status', 'state', 'type', 'grade'])

#### Let's investigate the missing data in the emp_length column


In [None]:
df[df['emp_length'].isna()].head()

In [None]:
df[df['emp_length'].isna()].tail(10)

#### Let's investigate the missing data in the int_rate column


In [None]:
df['int_rate'].isna().sum(), df['int_rate'].isna().mean()*100

- The number of missing values in the int_rate column is 1211.
- The percentage of missing values in the int_rate column is 4.48%.


In [None]:
df[df['int_rate'].isna()].head(10)

In [None]:
df[df['int_rate'].isna()].tail(10)

In [None]:
df.int_rate.isna().groupby(df['grade']).mean()*100

In [None]:
df.int_rate.isna().groupby(df['loan_status']).mean()*100

In [None]:
df.int_rate.isna().groupby(df['term']).mean()*100

In [None]:
analyze_missing_correlation(df, 'int_rate')

In [None]:
plot_correlations(df, 'int_rate', ['grade', 'loan_amount', 'annual_inc', 'loan_status', 'verification_status', 'term'])

In [None]:
plot_missing_vs_multiple_categoricals(df, 'int_rate', ['loan_status', 'grade', 'type', 'verification_status', 'term', 'purpose'])

### **Int rate Column**: This data is missing completely at random (MCAR), as there is no relationship between the missingness and any other values, observed or missing, within the dataset. I believe it is missing because it was not recorded. Loans must be associated with an interest rate, so it is important to fill in the missing values.

- We can fill the missing values in the int_rate column with the mean value of the column grouped by the loan grade. As there is a strong relationship between the loan grade and the interest rate, we can use the mean interest rate for each loan grade to fill in the missing values.


#### Let's investigate the missing data in the description column


In [None]:
df['description'].isna().sum(), df['description'].isna().mean()*100

- The number of missing values in the description column is 218.
- The percentage of missing values in the description column is 0.8%.


In [None]:
df[df['description'].isna()].head(10)

In [None]:
df[df['description'].isna()].tail(10)

In [None]:
df.description.isna().groupby(df['purpose']).mean()*100

In [None]:
df.description.isna().groupby(df['home_ownership']).mean()*100

In [None]:
plot_missing_vs_categoricals(df, 'description', ['purpose'])

### **Description Column**: This data is missing completely at random (MCAR), as there is no relationship between the missingness and any other values, observed or missing, within the dataset. The description may also have been an optional field, so some customers may not have filled it in. The percentage of missing values is very low (0.8%).

- We can fill the missing values in the description by grouping the data by the purpose of the loan and filling the missing values with the most common description for that purpose.


## Handling Missing data


## Findings and conclusions


## Observing outliers


## Handling outliers


## Findings and conclusions


# 4 - Data transformation and feature eng.


## 4.1 - Adding Columns


## 4.2 - Encoding


## 4.22 - Findings and conclusions


## 4.3 - Normalization


## 4.31 - Findings and conclusions


# 5 - Lookup Table(s)


# 6 - Bonus ( Data Integration )


## 5- Exporting the dataframe to a csv file or parquet
