In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the loan payments dataset from a local CSV file into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; the file must
# exist at exactly this location for the read to succeed.
custloan_df = pd.read_csv(r'C:\Users\tde_v\CourseAI\loan_payments.csv')

class DataFrameInfo:
    """Print exploratory summaries of a pandas DataFrame to standard output.

    Every method writes a report with ``print`` and returns ``None``.
    """

    def __init__(self, df):
        # DataFrame inspected by every reporting method (kept by reference).
        self.df = df

    def describe_columns(self):
        """Print the dtype of each column under a heading."""
        # A single call with newline separators emits exactly the same text
        # as printing the heading, the dtypes and a blank spacer in turn.
        print("Column Descriptions:", self.df.dtypes, "\n", sep="\n")

    def statistical_summary(self):
        """Print mean, median and standard deviation of the numeric columns."""
        print("Statistical Summary:")
        reports = (
            ("Mean:\n", self.df.mean),
            ("\nMedian:\n", self.df.median),
            ("\nStandard Deviation:\n", self.df.std),
        )
        # Each statistic is computed right before it is printed.
        for heading, statistic in reports:
            print(heading, statistic(numeric_only=True))
        print("\n")

    def count_distinct_values(self):
        """Print the number of distinct values per object/category column."""
        categorical = self.df.select_dtypes(include=['object', 'category'])
        print("Distinct Values in Categorical Columns:")
        for name in categorical.columns:
            distinct = self.df[name].nunique()
            print(f"{name}: {distinct} distinct values")
        print("\n")

    def print_shape(self):
        """Print the number of rows and columns of the DataFrame."""
        n_rows, n_cols = self.df.shape
        print(f"The DataFrame has {n_rows} rows and {n_cols} columns.\n")

    def null_value_summary(self):
        """Print the count and percentage of NULL values for each column."""
        print("NULL Value Summary:")
        missing = self.df.isnull().sum()
        row_total = len(self.df)
        report = pd.DataFrame({
            'Null Count': missing,
            'Null Percentage': missing / row_total * 100,
        })
        print(report)
        print("\n")

class Plotter:
    """Draw diagnostic plots for a pandas DataFrame."""

    def __init__(self, df):
        # Only used as a fallback source of tick labels when the NULL
        # summaries passed to plot_nulls carry no labels of their own.
        self.df = df

    def plot_nulls(self, null_summary_before, null_summary_after):
        """Visualise removal of NULL values with a grouped bar plot.

        Args:
            null_summary_before: Per-column NULL counts taken before cleaning.
                When this is a pandas Series its index supplies the tick
                labels; otherwise ``self.df.columns`` is used.
            null_summary_after: Per-column NULL counts taken after cleaning.
                When both summaries are Series, this one is aligned to the
                "before" index, so columns that no longer exist are drawn
                with a count of 0.

        Returns:
            None. The figure is displayed with ``plt.show()``.
        """
        before = null_summary_before
        after = null_summary_after

        if isinstance(before, pd.Series):
            # Label bars from the summary itself: the frame may have lost
            # columns since "before" was computed, so its current columns
            # need not match the bars being drawn.
            labels = before.index
            if isinstance(after, pd.Series):
                # Dropped columns have no NULLs left to report; aligning also
                # keeps both bar groups the same length.
                after = after.reindex(labels, fill_value=0)
        else:
            # Unlabelled input: keep the positional behaviour.
            labels = self.df.columns

        plt.figure(figsize=(12, 6))
        width = 0.20
        indices = np.arange(len(before))

        plt.bar(indices, np.asarray(before), width,
                label='Before NULL Removal', color='skyblue')
        plt.bar(indices + width, np.asarray(after), width,
                label='After NULL Removal', color='salmon')

        # Centre each tick between the two bars of its group.
        plt.xticks(indices + width / 2, labels, rotation=90)
        plt.ylabel('Number of NULLs')
        plt.title('Comparison of NULL Counts Before and After Removal')
        plt.legend()
        plt.tight_layout()
        plt.show()

class DataFrameTransform:
    """Apply NULL-handling transformations to a DataFrame in place."""

    def __init__(self, df):
        # Kept by reference: every method below mutates this same object.
        self.df = df

    def drop_high_null_columns(self, threshold=50):
        """Drop columns with a NULL percentage above the threshold.

        Args:
            threshold: Percentage (0-100). Columns whose share of NULL values
                is strictly greater than this are removed from ``self.df``.

        Returns:
            None. ``self.df`` is modified in place and the dropped column
            names are printed.
        """
        null_percentage = (self.df.isnull().sum() / len(self.df)) * 100
        columns_to_drop = null_percentage[null_percentage > threshold].index
        self.df.drop(columns=columns_to_drop, inplace=True)
        print(f"Dropped columns with >{threshold}% NULLs: {list(columns_to_drop)}\n")

    def impute_nulls(self):
        """Impute NULL values in numeric columns with the mean or median.

        Columns whose absolute skewness exceeds 1 are filled with the median,
        all other numeric columns with the mean. Columns that contain no
        non-NULL value are skipped because there is nothing to impute from.

        Returns:
            None. ``self.df`` is modified in place and one line is printed
            per processed column.
        """
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            column = self.df[col]
            if not column.isnull().any():
                continue
            if column.isnull().all():
                # Mean and median are both NaN here, so filling is a no-op.
                print(f"Skipped column {col}: no non-NULL values to impute from.")
                continue

            # Use median for skewed columns and mean for others
            if abs(column.skew()) > 1:
                fill_value, method = column.median(), "median"
            else:
                fill_value, method = column.mean(), "mean"

            # Assign the filled column back instead of calling
            # self.df[col].fillna(..., inplace=True): that chained form works
            # on a temporary object and does not update the frame when
            # pandas Copy-on-Write is active.
            self.df[col] = column.fillna(fill_value)
            print(f"Imputed column {col} with {method}.")
        print("\n")