## Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

## Import Data

In [None]:
df = pd.read_csv('../input/d/ashishgupta0698/projectdata/loan.csv')

In [None]:
df.head()

In [None]:
# `null_counts` was deprecated in pandas 1.2 and removed in 2.0;
# `show_counts` is the replacement and prints the same non-null counts.
df.info(verbose=True, show_counts=True)

## Drop all null columns

In [None]:
df.dropna(axis=1, how='all', inplace=True)

## The columns listed below are already clean

In [None]:
df['last_pymnt_amnt'].value_counts()

In [None]:
df['total_pymnt_inv'].value_counts()

In [None]:
df['total_rec_prncp'].value_counts()

In [None]:
df['total_rec_int'].value_counts()

In [None]:
df['total_rec_late_fee'].value_counts()

In [None]:
df['recoveries'].value_counts()

In [None]:
df['collection_recovery_fee'].value_counts()

In [None]:
df['purpose'].value_counts()

In [None]:
df['zip_code'].value_counts()

In [None]:
df['addr_state'].value_counts()

In [None]:
df['dti'].value_counts()

In [None]:
df['delinq_2yrs'].value_counts()

In [None]:
df['inq_last_6mths'].value_counts()

In [None]:
df['open_acc'].value_counts()

In [None]:
df['pub_rec'].value_counts()

In [None]:
df['revol_bal'].value_counts()

In [None]:
df['total_acc'].value_counts()

In [None]:
df['out_prncp'].value_counts()

In [None]:
df['out_prncp_inv'].value_counts()

In [None]:
df['installment'].value_counts()

In [None]:
df['verification_status'].value_counts()

In [None]:
df['loan_status'].value_counts()

## These columns are redundant and are dropped

In [None]:
df['policy_code'].value_counts()

In [None]:
df['application_type'].value_counts()

In [None]:
df['acc_now_delinq'].value_counts()

In [None]:
df['delinq_amnt'].value_counts()

In [None]:
df['initial_list_status'].value_counts()

In [None]:
df['pymnt_plan'].value_counts()

In [None]:
df['collections_12_mths_ex_med'].value_counts(dropna=False)

In [None]:
df['chargeoff_within_12_mths'].value_counts(dropna=False)

In [None]:
df['tax_liens'].value_counts(dropna=False)

In [None]:
df['next_pymnt_d'].value_counts(dropna=False)

In [None]:
# Drop every column judged redundant in the cells above, together with
# `id` and `member_id`, in a single pass. One call means a missing column
# raises before anything is removed, so the frame is never left half-dropped.
redundant_columns = [
    'collections_12_mths_ex_med',
    'chargeoff_within_12_mths',
    'tax_liens',
    'id',
    'member_id',
    'next_pymnt_d',
    'pymnt_plan',
    'initial_list_status',
    'policy_code',
    'application_type',
    'acc_now_delinq',
    'delinq_amnt',
]
df.drop(columns=redundant_columns, inplace=True)

## Cleaning percentages

In [None]:
df['int_rate']

In [None]:
def clean_percentages(i):
    """Convert a Series of percentage strings (e.g. '10.65%') to fractions.

    Parameters
    ----------
    i : pandas.Series
        String values containing a '%' sign, such as '10.65%'.

    Returns
    -------
    pandas.Series
        float64 values equal to the numeric part divided by 100
        (e.g. '10.65%' -> 0.1065).
    """
    # '%' is a literal character, not a pattern: say so explicitly instead of
    # relying on the version-dependent default of `regex`.
    without_sign = i.str.replace('%', '', regex=False)
    return without_sign.astype('float64') / 100

In [None]:
df['int_rate'] = clean_percentages(df['int_rate'])

## Figure out what to do with the below columns

In [None]:
df['pub_rec_bankruptcies'].value_counts(dropna=False)
#figure out what to do with the null values

## Ashish - We can use 0.0, since 0.0 occurs 37339 times. By convention, we replace NaN with the mode of the column.

In [None]:
df['emp_length'].value_counts(dropna=False)
# Figure out what to do with the null values, maybe nan values refers to 0 years of work experience?

## Ashish - I think yes, they refer to 0 years. So let's replace NaN with '< 1 year'.

In [None]:
df['mths_since_last_delinq'].value_counts(dropna=False)
#Can drop this since most are missing values? Investigate further

# Ashish - Yes, we should drop this. 

In [None]:
df['mths_since_last_record'].value_counts(dropna=False)
# can drop this column since most of the values are missing? Investigate further

# Ashish- Column not required, we should drop this

In [None]:
df['revol_util'].value_counts(dropna=False)

# figure out what to do with the missing values.
# How is this dtype int64 even though percentage symbol is present in this column. Clean the percentage symbol

# Ashish -- From the output, I can see no NaN values. 0% will be treated as 0.
    #    -- Also, int64 is the dtype of the second column, which holds the counts of the values in the first column
    #    -- E.g.: 0% -- 977; the count 977 is int64

In [None]:
df['emp_title'].value_counts(dropna=False)
# can we replace nan values with 'NA'?

# Ashish- Replace with NA or Other

In [None]:
df['title'].value_counts(dropna=False)
# can we replace nan values with 'NA'?

# Ashish - There are only 11 NaN values, so instead of creating another category, we can assign the mode of `title` to the NaN entries. It will not affect the data much.

In [None]:
df['term'].unique()
# Is it necessary to convert this categorical column into numeric format for analysis?

# Ashish - Yes, we should convert it to numeric

In [None]:
df.url
# we can drop this column? Are there any insights that can be derived from this column?
# df.drop('url', axis=1, inplace=True)

# Ashish -- Yes, this needs to be dropped. We won't get any insights

In [None]:
df.desc.value_counts(dropna=False)
# Since we are not doing NLP, we can drop this column?
# df.drop('desc', axis=1, inplace=True)

# Ashish - I think we should drop this, because we cannot use a long free-text description in any column

In [None]:
df.total_pymnt.value_counts()
# Why does the same total payment value occur 26 times?

# Ashish - The 26 occurrences may correspond to 26 installments of the loan EMI; usually loan EMIs are of the same amount

## Working with datetime values

In [None]:
def clean_date(i, century='20'):
    """Parse 'Mon-YY' strings (e.g. 'Dec-11') into datetimes.

    Parameters
    ----------
    i : pandas.Series
        String values made of a three-letter month abbreviation, a hyphen
        and a two-digit year, e.g. 'Dec-11'.
    century : str, default '20'
        Two digits placed in front of the two-digit year. The default
        keeps the original behaviour ('Dec-11' -> December 2011); pass
        '19' for columns whose years belong to the 1900s.

    Returns
    -------
    pandas.Series
        datetime64 values set to the first day of the parsed month.
    """
    spaced = i.str.replace('-', ' ', regex=False)
    # The first four characters are the month abbreviation plus the space,
    # so the century is spliced in right before the two-digit year:
    # 'Dec 11' -> 'Dec 2011'.
    with_century = spaced.str[:4] + century + spaced.str[4:]
    # An explicit format avoids per-call format inference and makes the
    # parse fail loudly on values that are not 'Mon YYYY'.
    return pd.to_datetime(with_century, format='%b %Y')

In [None]:
df['issue_d'] = clean_date(df['issue_d'])

In [None]:
df['last_credit_pull_d'] = clean_date(df['last_credit_pull_d'])

In [None]:
df['last_credit_pull_d'].value_counts()


In [None]:
df[df['last_credit_pull_d'].isna()]
# Remove these two rows?

# Ashish - We should not drop these; let's keep these rows

In [None]:
df['last_pymnt_d'] = clean_date(df['last_pymnt_d'])

In [None]:
df[df['last_pymnt_d'].isna()]
# Remove these 71 rows?

# Ashish - Let's drop them for now but in case required we'll add them in the final version