In [1]:
import pandas as pd
import pickle
import datetime

In [2]:
# Put the project-local ./utility/ directory at the front of the module search
# path so that data_preprocessing can be imported. The path is relative, so it
# resolves against the kernel's current working directory.
import sys
sys.path.insert(0, './utility/')
import data_preprocessing as dp

In [3]:
# Re-execute the data_preprocessing module so that edits made to it on disk are
# picked up without restarting the kernel.
import importlib
importlib.reload(dp)

<module 'data_preprocessing' from './utility\\data_preprocessing.py'>

In [4]:
# Path to the raw loan CSV, relative to the kernel's working directory.
data_path = './data/loan.csv'

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns and the
# DtypeWarning that chunked inference emits for them.
# NOTE(review): the original run of this cell printed a warning; it looks like
# that DtypeWarning -- confirm. This trades higher peak memory for consistent dtypes.
df = pd.read_csv(data_path, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Sanity check: size of the raw frame as (rows, columns).
df.shape

(887379, 74)

In [6]:
# Preview the first rows of the raw data (head() shows five by default).
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,1077501,1296599,5000.0,5000.0,4975.0,36 months,10.65,162.87,B,B2,...,,,,,,,,,,
1,1077430,1314167,2500.0,2500.0,2500.0,60 months,15.27,59.83,C,C4,...,,,,,,,,,,
2,1077175,1313524,2400.0,2400.0,2400.0,36 months,15.96,84.33,C,C5,...,,,,,,,,,,
3,1076863,1277178,10000.0,10000.0,10000.0,36 months,13.49,339.31,C,C1,...,,,,,,,,,,
4,1075358,1311748,3000.0,3000.0,3000.0,60 months,12.69,67.79,B,B5,...,,,,,,,,,,


In [7]:
# Assign labels based on loan status

# Statuses that are intentionally left out of the labelled data. This set is
# informational: assign_label() maps *every* status that is neither positive
# nor negative to "excluded", so it does not need to consult it.
excluded = {'Issued', 'Does not meet the credit policy. Status:Charged Off',
            'Does not meet the credit policy. Status:Fully Paid'}
# Statuses treated as a good outcome.
positive = {'Current', 'Fully Paid'}
# Statuses treated as a bad outcome.
negative = {'Charged Off', 'Default', 'In Grace Period',
            'Late (16-30 days)', 'Late (31-120 days)'}


def assign_label(status):
    """
    Classify a loan status based on its description.

    Parameters
    ----------
    status : object
        Raw loan status value. Strings are matched after stripping leading and
        trailing whitespace; any non-string value (e.g. NaN or None for a
        missing status) is treated as having no usable status.

    Returns
    -------
    str
        "positive" if the stripped status is in ``positive``,
        "negative" if it is in ``negative``,
        "excluded" for every other value, including non-strings.
    """
    # Handle missing / non-text values explicitly instead of relying on a bare
    # ``except`` that would also hide genuine errors and KeyboardInterrupt.
    if not isinstance(status, str):
        return "excluded"

    status = status.strip()
    if status in positive:
        return "positive"
    if status in negative:
        return "negative"
    return "excluded"

In [8]:
# Derive the modelling label for every loan from its raw status.
df['label'] = df['loan_status'].apply(assign_label)

In [9]:
# List the column names of the raw frame, which now also carries 'label'.
df.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
       'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
       'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'policy_code', 'application_type',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint',
    

In [10]:
# Remove the identifier columns and the raw loan status. The label was derived
# from loan_status, so keeping it would leak the target into the features.
# NOTE(review): other columns (e.g. recoveries, total_rec_*, last_pymnt_d,
# next_pymnt_d) look like they are only known after the outcome and may leak
# the target as well -- confirm before modelling.
cols_to_drop = [
    'id',
    'member_id',
    'loan_status',
]
df_id_removal = df.drop(labels=cols_to_drop, axis='columns')

In [11]:
# Discard rows labelled 'excluded' so that only positive / negative loans
# remain, then display the surviving label values as a check.
df_label = df_id_removal[df_id_removal['label'].ne('excluded')]
df_label['label'].unique()

array(['positive', 'negative'], dtype=object)

In [12]:
# Run the project-local preprocessing pipeline on the labelled loans. The step
# names it logs are: drop high-missing columns, process dates, fill numeric
# missing values, handle extremes, create dummies, standardize, and handle
# highly correlated columns.
# NOTE(review): label='label' presumably names the target column so the
# pipeline leaves it untransformed -- confirm in dp.preprocess_data.
df_processed = dp.preprocess_data(df_label, label = 'label')

INFO:root:Step 1: Dropping columns with missing values above the threshold
INFO:root:Step 2: Processing date columns and renaming them
INFO:root:Step 3: Handling missing values for numerical columns
INFO:root:Step 4: Handling extreme values for numerical columns
INFO:root:Step 5: Creating dummy variables for categorical columns
INFO:root:Step 6: Standardizing numerical columns
INFO:root:Step 7: Handling high correlation among numerical columns


In [13]:
# Inspect the columns produced by the preprocessing step.
df_processed.columns

Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc',
       ...
       'home_ownership_nan', 'verification_status_Source Verified',
       'verification_status_Verified', 'verification_status_nan',
       'term_ 60 months', 'term_nan', 'next_pymnt_d_Jan-2016',
       'next_pymnt_d_Mar-2016', 'next_pymnt_d_nan', 'label'],
      dtype='object', length=117)

In [14]:
# Build an output path that embeds the current local time (YYYYMMDD_HHMMSS),
# so each run gets its own file name, and display it.
current_time = format(datetime.datetime.now(), "%Y%m%d_%H%M%S")
output_filename = f"./data/preprocessed_data_{current_time}.pickle"
output_filename

'./data/preprocessed_data_20230420_125002.pickle'

In [15]:
# Persist the preprocessed frame as a pickle under the timestamped name and
# report where it was written.
df_processed.to_pickle(path=output_filename)
print(f"Processed data saved to {output_filename}")

Processed data saved to ./data/preprocessed_data_20230420_125002.pickle
