In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

1. Import the dataset and understand it. 

In [None]:
# Read the raw loan records into the notebook-wide DataFrame `df`.
# Every helper function defined below reads this global, and several modify it in place.
df = pd.read_csv('loan_data.csv')

In [None]:
def get_rows_and_columns():
    """Print the ``(rows, columns)`` shape tuple of the global ``df``."""
    shape = df.shape
    print("Rows, Columns:", shape)

In [None]:
def convert_int_rate():
    """Convert the ``int_rate`` column of the global ``df`` to float, in place.

    String values have surrounding whitespace and a trailing ``%`` removed
    before conversion. Non-string values (missing values, or numbers left by
    an earlier run of this cell) are passed through unchanged, so the function
    tolerates NaN and can safely be executed more than once.
    """
    df['int_rate'] = df['int_rate'].apply(
        # Only strings carry a '%' suffix; calling .rstrip on a float would raise.
        lambda x: float(x.strip().rstrip('%')) if isinstance(x, str) else x
    ).astype(float)  # normalise the result (including None/NaN) to a float column

In [None]:
def check_dtypes():
    """Print the dtype of every column in the global ``df``."""
    column_types = df.dtypes
    print(column_types)

In [None]:
def clean_data():
    """Drop, in place, every column of the global ``df`` whose values are all NaN.

    Columns containing at least one non-missing value are kept. A confirmation
    message is printed afterwards.
    """
    drop_options = {'axis': 1, 'how': 'all', 'inplace': True}
    df.dropna(**drop_options)
    print("Successfully Removed columns having NaN")

In [None]:
def filter_loan_status():
    """Print the row counts of the 'Fully Paid' and 'Charged Off' loan statuses.

    Counts come from ``value_counts()`` on the ``loan_status`` column of the
    global ``df``. A status that does not occur in the data is reported with a
    count of 0 instead of raising ``KeyError``.
    """
    wanted_statuses = ['Fully Paid', 'Charged Off']
    loan_status_counts = df['loan_status'].value_counts()
    # reindex (unlike .loc with a list) tolerates labels absent from the index.
    filtered_loan_status = loan_status_counts.reindex(wanted_statuses, fill_value=0)
    print(filtered_loan_status)

In [None]:
def extract_emp_length():
    """Replace ``emp_length`` in the global ``df`` with its numeric part, as float.

    For string values, the first run of digits in each entry is extracted;
    entries with no digits, and missing entries, become NaN. If the column is
    already numeric (for example because this cell was run before), the
    extraction step is skipped so the call does not fail.
    """
    emp_length = df['emp_length']
    if not pd.api.types.is_numeric_dtype(emp_length):
        # Raw string avoids the invalid-escape warning for "\d";
        # expand=False yields a Series rather than a one-column DataFrame.
        emp_length = emp_length.str.extract(r'(\d+)', expand=False)
    df['emp_length'] = emp_length.astype(float)
    print("Sucessfuly Extracted numerical value from emp_length")

In [None]:
def clean_term():
    """Reduce the ``term`` column of the global ``df`` to its leading number.

    String values such as ``' 36 months'`` are stripped, split on whitespace,
    and their first token converted to ``int``. Non-string values (missing
    values, or integers left by an earlier run of this cell) are passed through
    unchanged, so the function tolerates NaN and can be executed repeatedly.
    """
    df['term'] = df['term'].apply(
        # Only strings have a unit suffix to remove; .strip() on a number would raise.
        lambda x: int(x.strip().split()[0]) if isinstance(x, str) else x
    )

In [None]:
def create_risky_loan_applicant():
    """Add a ``risky_loan_applicant`` flag column to the global ``df``.

    The flag is 0 where ``loan_amnt`` is less than or equal to ``funded_amnt``
    and 1 in every other row.
    """
    requested = df['loan_amnt']
    funded = df['funded_amnt']
    within_funding = requested <= funded
    df['risky_loan_applicant'] = np.where(within_funding, 0, 1)

In [None]:
def plot_loan_status():
    """Draw three side-by-side count plots of ``loan_status`` from the global ``df``.

    One panel each for ``grade``, ``term`` and ``verification_status`` on the
    x-axis, with bars split by ``loan_status`` via ``hue``.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    panel_columns = ('grade', 'term', 'verification_status')
    # One categorical column per panel, in left-to-right order.
    for axis, column in zip(axes, panel_columns):
        sns.countplot(x=column, hue='loan_status', data=df, ax=axis)
    plt.show()

In [None]:
def categorize_experience(emp_len):
    """Map a numeric employment length to an experience category label.

    Parameters
    ----------
    emp_len : number or missing value
        Employment length; the comparisons below treat it as a plain number.

    Returns
    -------
    str or float
        ``'Fresher'`` if ``emp_len <= 1``, ``'Junior'`` if ``1 < emp_len <= 3``,
        ``'Senior'`` if ``3 < emp_len <= 7``, otherwise ``'Expert'``.
        A missing input (None/NaN) returns NaN rather than a label: NaN fails
        every comparison, so without this guard it would fall through to
        ``'Expert'``.
    """
    if pd.isna(emp_len):
        return np.nan
    if emp_len <= 1:
        return 'Fresher'
    # Each later branch is reached only when the previous upper bound was exceeded,
    # so the lower bounds do not need to be re-checked.
    if emp_len <= 3:
        return 'Junior'
    if emp_len <= 7:
        return 'Senior'
    return 'Expert'

In [None]:
def create_emp_category():
    """Add an ``emp_category`` column to the global ``df``.

    Each value of ``emp_length`` is passed through ``categorize_experience``
    and the resulting labels are stored in the new column.
    """
    tenure = df['emp_length']
    labels = tenure.apply(categorize_experience)
    df['emp_category'] = labels

In [None]:
def plot_loan_distribution():
    """Show a pie chart of total ``loan_amnt`` per ``grade`` from the global ``df``.

    Loan amounts are summed within each grade; each slice is labelled with its
    grade and its percentage share of the overall total.
    """
    totals_by_grade = df.groupby('grade')['loan_amnt'].sum()
    plt.figure(figsize=(8, 8))
    plt.pie(
        totals_by_grade,
        labels=totals_by_grade.index,
        autopct='%1.1f%%',
        startangle=140,
    )
    plt.title('Loan Amount Distribution by Grade')
    plt.show()

In [None]:
def save_cleaned_data(filename):
    """Write the global ``df`` to ``filename`` as CSV, omitting the row index."""
    csv_options = {'index': False}
    df.to_csv(filename, **csv_options)
#2. List down the number of rows and columns. 


2. List down the number of rows and columns. 

In [None]:
get_rows_and_columns()

3. The ‘int_rate’ column is stored as a string with a trailing ‘%’. Using a lambda function, convert it to float type. 

In [None]:
convert_int_rate()

4. Check the datatype of each column. 

In [None]:
check_dtypes()

5. Cleaning the dataset- Remove the columns having complete NaN value in the entire dataset. 

In [None]:
clean_data()

6. Write the code to find the value counts of the ‘loan_status’ category column and filter only the ‘fully paid’ and ‘charged off’ categories. 

In [None]:
filter_loan_status()

7. Filter the ‘emp_length’ column to extract the numerical value from the string. 

In [None]:
extract_emp_length()

8. Using the Lambda function, remove the month from the ‘term’ column such that ‘36 months’, ‘60 months’ appear as 36 and 60 respectively. 

In [None]:
clean_term()

9. Create a new column, risky_loan_applicant, by comparing loan_amnt and funded_amnt with the following criteria: if loan_amnt is less than or equal to funded_amnt, set it to ‘0’; otherwise set it to ‘1’. 

In [None]:
create_risky_loan_applicant()

10. Using the bar plot visualize the loan_status column against categorical column grade, term, verification_status. Write the observation from each graph. 

In [None]:
plot_loan_status()

11. Using a user-defined function, convert the ‘emp_length’ column into a categorical column as follows: if emp_length is less than or equal to 1, recode as ‘fresher’; if emp_length is greater than 1 and less than 3, recode as ‘junior’; if emp_length is greater than 3 and less than 7, recode as ‘senior’; if emp_length is greater than 7, recode as ‘expert’. (The stated ranges leave exactly 3 and exactly 7 unassigned; the implementation treats 3 as ‘junior’ and 7 as ‘senior’.) 

In [None]:
create_emp_category()

12. Find the sum of ‘loan_amnt’ for each grade and display the distribution of ‘loan_amnt’ using a pie plot.

In [None]:
plot_loan_distribution()

In [None]:
# Write the processed DataFrame to a new CSV file, then print a completion message.
save_cleaned_data('cleaned_loan_data.csv')
print("visualization completed and successfully created new csvfile")