```{contents}
```

## Data Cleaning

In [1]:


import pandas as pd

url = "https://github.com/svgoudar/datasets/blob/main/loan_applications.csv?raw=true"
df = pd.read_csv(url)

# Quick overview
df.head()


Unnamed: 0,application_id,customer_id,application_date,loan_type,loan_amount_requested,loan_tenure_months,interest_rate_offered,purpose_of_loan,employment_status,monthly_income,...,existing_emis_monthly,debt_to_income_ratio,property_ownership_status,residential_address,applicant_age,gender,number_of_dependents,loan_status,fraud_flag,fraud_type
0,c8bf0bea-70e6-4870-9125-41b8210c527f,CUST109427,2023-04-09,Business Loan,604000.0,12,11.66,Medical Emergency,Retired,34700.0,...,1100.0,3.17,Rented,"94/31, Sehgal Zila, Vadodara-380521, Anantapur...",28,Female,3,Approved,0,
1,91224cec-3544-4bc7-ac15-a9792da54c02,CUST106146,2023-09-23,Car Loan,100000.0,240,13.62,Education,Unemployed,51600.0,...,0.0,0.0,Owned,"H.No. 00, Sheth Chowk, Ichalkaranji 006728, Im...",44,Other,3,Approved,0,
2,4efcd02d-4a03-4ab7-9bd1-0ff430493d0c,CUST100674,2023-05-22,Education Loan,431000.0,60,11.4,Medical Emergency,Self-Employed,14800.0,...,4600.0,31.08,Rented,"H.No. 81, Dutta Path, Kozhikode-340301, Tadepa...",56,Other,4,Approved,0,
3,a61337d4-ba04-4a68-b492-2cb8266e6ed7,CUST106466,2024-07-09,Car Loan,324000.0,120,10.36,Debt Consolidation,Self-Employed,28800.0,...,4000.0,13.89,Rented,"H.No. 022, Rege Road, Tiruvottiyur-927857, Aur...",27,Other,4,Declined,0,
4,a8d1639e-170b-41b2-826a-55c7dae38d16,CUST112319,2023-11-20,Personal Loan,100000.0,36,14.14,Business Expansion,Salaried,43900.0,...,1100.0,2.51,Rented,"85/24, Bali Zila, Sambalpur 922071, Tumkur, Ke...",50,Other,0,Declined,0,




---

  ### Handling Missing Values


In [5]:


# Check missing values
print(df.isnull().sum())

# Fill missing loan_amount with mean
df['loan_amount_requested'].fillna(df['loan_amount_requested'].mean(), inplace=True)

# Drop rows with missing status
df.dropna(subset=['loan_status'], inplace=True)

# Interpolate missing numeric values (if any)
df['loan_amount_requested'] = df['loan_amount_requested'].interpolate() 


application_id                   0
customer_id                      0
application_date                 0
loan_type                        0
loan_amount_requested            0
loan_tenure_months               0
interest_rate_offered            0
purpose_of_loan                  0
employment_status                0
monthly_income                   0
cibil_score                      0
existing_emis_monthly            0
debt_to_income_ratio             0
property_ownership_status        0
residential_address              0
applicant_age                    0
gender                           0
number_of_dependents             0
loan_status                      0
fraud_flag                       0
fraud_type                   48974
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['loan_amount_requested'].fillna(df['loan_amount_requested'].mean(), inplace=True)




---

### Handling Duplicates


In [None]:
# Check duplicates
print(df.duplicated().sum())

# Drop duplicate rows based on customer_id and loan_amount
df.drop_duplicates(subset=['customer_id', 'loan_amount'], inplace=True)




---

### Data Type Conversion


In [None]:


# Convert loan_amount to numeric
df['loan_amount'] = pd.to_numeric(df['loan_amount'], errors='coerce')

# Convert application_date to datetime
df['application_date'] = pd.to_datetime(df['application_date'], errors='coerce')




---

### String Cleaning / Formatting



In [None]:

# Remove leading/trailing spaces and standardize case
df['customer_name'] = df['customer_name'].str.strip().str.title()

# Filter rows containing 'Approved' in status
df = df[df['status'].str.contains('Approved')]




---

* ## Renaming / Reordering Columns


In [11]:
# Rename column
df.rename(columns={'loan_amount_requested': 'amount'}, inplace=True)

# Reorder columns
df = df[['customer_id', 'amount', 'loan_status', 'application_date']]

df.loc[:, df.columns.str.contains("loan")]

Unnamed: 0,loan_status
0,Approved
1,Approved
2,Approved
3,Declined
4,Declined
...,...
49995,Approved
49996,Approved
49997,Approved
49998,Approved




---

### Handling Outliers / Filtering Invalid Data


In [None]:

# Remove loans with negative or extremely high amounts
df = df[(df['amount'] > 0) & (df['amount'] < 100000)]

# Cap loan amount at 50000
df['amount'] = df['amount'].clip(0, 50000)




---

### Replacing Values



In [None]:
# Standardize status values
df['status'].replace({'approved': 'Approved', 'denied': 'Rejected'}, inplace=True)




---

### Reset / Set Index



In [None]:

# Set customer_id as index
df.set_index('customer_id', inplace=True)

# Reset index if needed
df.reset_index(inplace=True)




---

**Summary of Cleaning Steps Applied**

1. Filled missing numeric values (`fillna`)
2. Dropped rows with missing critical columns (`dropna`)
3. Removed duplicates (`drop_duplicates`)
4. Converted data types (`astype`, `to_numeric`, `to_datetime`)
5. Cleaned strings (`.str.strip()`, `.str.title()`)
6. Handled outliers (`clip`, boolean filtering)
7. Replaced inconsistent values (`replace`)
8. Managed index (`set_index`, `reset_index`)

