In [None]:
"""Import necessary libraries and the 'DataTransform' class:"""
import pandas as pd
from data_transform import DataTransform
from dataframe_info import DataFrameInfo
from missing_values import Plotter, DataFrameTransform
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
"""Load the dataset"""
# Read the raw loan payments CSV into a DataFrame. The path is relative, so the
# file must sit in the notebook's working directory.
loan_data = pd.read_csv('loan_payments.csv')

In [None]:
"""Initialise the 'DataTransform' class."""
# Wrap the loaded frame; the dtype conversions in the following cells go through this object.
transformer = DataTransform(loan_data)

In [None]:
"""Transform the required columns to have a category datatype."""
# Columns handed to convert_to_category, listed one per line so additions and
# removals show up as single-line diffs.
category_columns = [
    'grade',
    'sub_grade',
    'home_ownership',
    'verification_status',
    'loan_status',
    'purpose',
    'term',
    'employment_length',
]
transformer.convert_to_category(category_columns)

In [None]:
"""Transform the required columns to have a datetime datatype."""
# Columns handed to convert_to_datetime.
datetime_columns = [
    'issue_date',
    'earliest_credit_line',
    'last_payment_date',
    'next_payment_date',
    'last_credit_pull_date',
]
transformer.convert_to_datetime(datetime_columns)

In [None]:
# Print every column's dtype to check the conversions requested in the previous cells.
# NOTE(review): the convert_* return values are never assigned, so this presumably
# relies on DataTransform modifying loan_data in place — confirm.
print(loan_data.dtypes)

In [None]:
# List the distinct issue_date values, e.g. to eyeball the outcome of the datetime conversion.
print(loan_data['issue_date'].unique())

In [None]:
# Wrap loan_data in the DataFrameInfo helper that produces the summaries in the next cell.
df_info = DataFrameInfo(loan_data)

In [None]:
# describe_columns() is called for its own output; its return value is not used.
df_info.describe_columns()
print(df_info.statistical_summary())

# Each remaining summary is printed as "<heading>\n <value>". The getters are
# called one at a time, in this order, immediately before their result is printed.
summaries = (
    ("Median:\n", df_info.get_median),
    ("Standard Deviation:\n", df_info.get_standard_deviation),
    ("Mean:\n", df_info.get_mean),
    ("Distinct Values Count:\n", df_info.count_distinct_values),
    ("DataFrame Shape:\n", df_info.get_shape),
    ("Null Values Count:\n", df_info.count_null_values),
    ("Null Values Percentage:\n", df_info.count_null_values_percentage),
)
for heading, compute in summaries:
    print(heading, compute())

In [None]:
# Build the plotting helper over the loaded frame.
plotter = Plotter(loan_data)
# NOTE: this rebinds `transformer`, which previously held the DataTransform instance;
# from here on it is a DataFrameTransform.
transformer = DataFrameTransform(loan_data)

In [None]:
# Capture the null report before any cleaning so it can be compared with the later checks.
null_counts = transformer.check_nulls()
print(null_counts)

In [None]:
# drop_columns() is called without arguments, so the choice of columns is made
# inside DataFrameTransform.
transformer.drop_columns()
# Re-run the null check to see the effect of the drop.
null_counts_after_dropping = transformer.check_nulls()
print(null_counts_after_dropping)

In [None]:
# impute_nulls() is called without arguments, so the imputation strategy is decided
# inside DataFrameTransform.
transformer.impute_nulls()
# Re-run the null check to see what remains after imputation.
null_counts_after_imputation = transformer.check_nulls()
print(null_counts_after_imputation)

In [None]:
# Threshold passed to calculate_skewness; presumably columns whose skew exceeds it
# are the ones returned — confirm against DataFrameTransform.
SKEW_THRESHOLD = 1
skewed_columns = transformer.calculate_skewness(threshold=SKEW_THRESHOLD)


In [None]:
# The result is consumed by the next cell as a mapping of column -> transformation method.
best_transformations = transformer.transform_skewed_columns(skewed_columns)
# Heading and mapping on separate lines, as two consecutive prints would produce.
print("Best transformations for each column:", best_transformations, sep="\n")

In [None]:
# Overwrite each skewed column in transformer.df with its transformed values.
# NOTE(review): _transform_column is a private (underscore-prefixed) method of
# DataFrameTransform; a public entry point would be preferable if one exists.
for column, method in best_transformations.items():
    transformer.df[column] = transformer._transform_column(transformer.df[column], method)


# Visualize skewness after transformation
# NOTE(review): `plotter` was built from loan_data, so these plots only reflect the
# transformation if transformer.df is the same object as loan_data — confirm.
for column in best_transformations:
    plotter.plot_skewness(column)

In [None]:
# Save transformed DataFrame
# transformer.df is the frame whose skewed columns were overwritten in the previous cell.
transformed_df = transformer.df
# index=False keeps the row index out of the CSV.
transformed_df.to_csv('transformed_data.csv', index=False)

In [None]:
# Create fresh Plotter / DataFrameTransform instances over loan_data (the same
# constructor calls as earlier in the notebook).
# NOTE(review): whether the changes made through the previous transformer.df carry
# over depends on whether DataFrameTransform keeps a reference to loan_data or a
# copy of it — confirm.
plotter = Plotter(loan_data)
transformer = DataFrameTransform(loan_data)

In [None]:
# Decide whether to remove or transform outliers based on skewness and summary statistics
# The frame returned by decide_outlier_handling is kept under its own name, df_cleaned.
df_cleaned = transformer.decide_outlier_handling(skewed_columns)

# Build a Plotter over the cleaned DataFrame. Only the Plotter is rebuilt here;
# `transformer` and `plotter` are left as they were.
plotter_cleaned = Plotter(df_cleaned)

In [None]:
# Visualize outliers after removal or transformation.
# NOTE: the call below is currently disabled; uncomment it to render the outlier plots.
# plotter_cleaned.plot_outliers(skewed_columns)


In [None]:
# Persist the outlier-handled frame without the row index; this file is read back
# with pd.read_csv later in the notebook.
df_cleaned.to_csv('cleaned_dataset.csv', index=False)


In [None]:
# Correlation matrix before any highly correlated columns are removed.
# `plotter` here is the instance built from loan_data, not plotter_cleaned.
plotter.plot_correlation_matrix()

In [None]:
# Step 2: Identify and remove highly correlated columns (pairs above the 0.9 threshold).
highly_correlated_columns = transformer.remove_highly_correlated_columns(threshold=0.9)

# Step 3: Create a new Plotter instance from the transformer's current DataFrame.
# The previous `plotter` was built before the removal, so plotting with it here
# would either show stale data or duplicate the figure produced in Step 4.
plotter = Plotter(transformer.df)

# Step 4: Visualize the correlation matrix after removing highly correlated columns.
plotter.plot_correlation_matrix()

In [None]:
# Every other call site in this notebook builds DataFrameTransform and Plotter from
# a DataFrame, so load the cleaned CSV first instead of passing the file name string.
cleaned_data = pd.read_csv('cleaned_dataset.csv')
transformer = DataFrameTransform(cleaned_data)
plotter = Plotter(cleaned_data)

In [None]:
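# Unguarded version of the recovery-rate calculation, kept for reference only;
# the next cell runs the same calls inside try/except.
# recovery_rate, total_funded_amount, total_recovery_amount = transformer.calculate_recovery_rate()
# print(f"Percentage of loans recovered against investor funding: {recovery_rate:.2f}%")
# plotter.plot_recovery_rate(total_recovery_amount, total_funded_amount)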

In [None]:
# Current recovery rate. KeyError/TypeError are reported rather than raised so the
# projection below is still attempted; any other exception propagates.
try:
    recovery_rate, total_funded_amount, total_recovery_amount = transformer.calculate_recovery_rate()
    print(f"Percentage of loans recovered against investor funding: {recovery_rate:.2f}%")
    plotter.plot_recovery_rate(total_recovery_amount, total_funded_amount)
except (KeyError, TypeError) as e:
    # repr() keeps the exception type; str() of a KeyError is only the quoted key,
    # which on its own does not say what went wrong or where.
    print(f"Could not calculate or plot the current recovery rate: {e!r}")

# Calculate and visualize projected recovery up to 6 months
try:
    projected_recovery_percentage_6_months, total_recoverable_amount, projected_recovery_6_months = transformer.calculate_projected_recovery_6_months()
    print(f"Projected percentage of total amount recovered up to 6 months in the future: {projected_recovery_percentage_6_months:.2f}%")
    plotter.plot_projected_recovery_6_months(projected_recovery_6_months, total_recoverable_amount)
except (KeyError, TypeError) as e:
    print(f"Could not calculate or plot the 6-month projected recovery: {e!r}")

In [None]:
# Re-read the cleaned dataset written earlier in the notebook.
df = pd.read_csv('cleaned_dataset.csv')

# Verify the column names in the DataFrame
print(df.columns)

# Verify the DataFrame structure and data types.
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.head())

# Initialize the transformer and plotter
transformer = DataFrameTransform(df)
plotter = Plotter(df)


In [None]:
# Check the dtype of `term` in the frame re-read from cleaned_dataset.csv —
# presumably to see whether the earlier category conversion survived the CSV
# round-trip (read_csv infers dtypes afresh).
print(df['term'].dtype)

In [None]:
# Helper over the frame loaded from cleaned_dataset.csv.
transformer = DataFrameTransform(df)

# Current recovery figures.
(
    recovery_rate,
    total_funded_amount,
    total_recovery_amount,
) = transformer.calculate_recovery_rate()
print(total_recovery_amount)

# Recovered amount as a percentage of the funded amount.
percentage_recovered = (total_recovery_amount / total_funded_amount) * 100
print(f"Percentage of loans recovered against investor funding: {percentage_recovered:.2f}%")

# Projection six months ahead.
(
    projected_recovery_percentage_6_months,
    total_recoverable_amount,
    projected_recovery_6_months,
) = transformer.calculate_projected_recovery_6_months()

# Plot the current recovered-vs-funded breakdown.
plotter = Plotter(df)
plotter.plot_recovery_breakdown(total_recovery_amount, total_funded_amount)

# Six month-start dates; the rate is held at the current value for the first five
# and set to the projected percentage for the sixth. These are placeholder points,
# not observed monthly figures.
month_starts = ['2024-07-01', '2024-08-01', '2024-09-01', '2024-10-01', '2024-11-01', '2024-12-01']
rate_series = [recovery_rate] * 5 + [projected_recovery_percentage_6_months]
monthly_data = pd.DataFrame({
    'date': month_starts,
    'recovery_rate': rate_series,
})

# Recovery rate over time, then the same six values as a predicted-recovery plot.
plotter.plot_recovery_rate_over_time(monthly_data)
plotter.plot_predicted_recovery(rate_series)


In [None]:
# Calculate recovery rates
# Same call as in the previous cell; it rebinds recovery_rate, total_funded_amount
# and total_recovery_amount.
recovery_rate, total_funded_amount, total_recovery_amount = transformer.calculate_recovery_rate()

# Plotting the recovery rate as a pie chart
plotter.plot_recovery_rate(total_recovery_amount, total_funded_amount)


In [None]:
# Show the value and Python type of both totals, one per line.
for label, value in (
    ("total_recovery_amount", total_recovery_amount),
    ("total_funded_amount", total_funded_amount),
):
    print(f"{label}: {value}, type: {type(value)}")


In [None]:
# Pie chart of recovered vs. not-yet-recovered funding, drawn directly with matplotlib.
remaining_amount = total_funded_amount - total_recovery_amount
sizes = [total_recovery_amount, remaining_amount]
labels = ['Recovered Amount', 'Remaining Funded Amount']
colors = ['#ff9999', '#66b3ff']

fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)
# Equal aspect ratio so the pie is drawn as a circle.
ax1.axis('equal')
# ax1 is the only (and current) Axes, so setting the title on it directly is the
# same as going through pyplot.
ax1.set_title('Recovery Rate: Percentage of loans recovered against investor funding')
plt.show()
