In [None]:
# 'os' module provides functions for interacting with the operating system
import os

# 'Numpy' is used for mathematical operations on large, multi-dimensional arrays and matrices
import numpy as np

# 'Pandas' is used for data manipulation and analysis
import pandas as pd

# 'Matplotlib' is a data visualization library for 2D and 3D plots, built on numpy
from matplotlib import pyplot as plt
%matplotlib inline

# 'Seaborn' is based on matplotlib; used for plotting statistical graphics
import seaborn as sns

# Silence every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides any deprecation warnings that
# library calls later in the notebook may emit — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read loans.csv into a DataFrame, using the client_id column as the row index.
# NOTE(review): the path is relative to the notebook's working directory.
df = pd.read_csv(
    'loans.csv',
    index_col='client_id',
)

In [None]:
# Preview the first 5 rows of the dataset.
df.head(n=5)

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple:
df.shape

In [None]:
# Concise overview of the frame: index, column names, non-null counts, dtypes and memory usage:
df.info()

### 3. Checking the datatypes of the columns

In [None]:
# Per-column dtypes as loaded, before any conversion:
df.dtypes

### 4. Converting the data types of columns

- `loan_id` to object
- `repaid` to category dtype
- `loan_start` and `loan_end` to datetime dtype
    

In [None]:
# Cast both columns in a single astype call:
#   loan_id -> object
#   repaid  -> category
df = df.astype({'loan_id': 'object', 'repaid': 'category'})

In [None]:
# Parse loan_start and loan_end into datetime values using the '%Y-%m-%d' format.
for date_column in ('loan_start', 'loan_end'):
    df[date_column] = pd.to_datetime(df[date_column], format='%Y-%m-%d')

#### Checking the datatypes again:

In [None]:
# Confirm the conversions: loan_id should now be object, repaid category,
# and loan_start / loan_end datetime columns.
df.dtypes

### 5. Summary Statistics of the data

In [None]:
# Summary statistics for the numerical columns only.
# The selection is made explicit because describe()'s default column choice has
# differed between pandas releases with respect to datetime columns (such as the
# loan_start / loan_end columns converted earlier in this notebook).
df.describe(include=[np.number])

In [None]:
# Summary statistics for the categorical (object / category) columns.
# Excluding only np.number would also pull in loan_start and loan_end, which were
# converted to datetime earlier in this notebook, so datetimes are excluded as well.
df.describe(exclude=[np.number, 'datetime'])

### 6. Missing Values

In [None]:
# Count the missing values in each column (isna is the same method as isnull).
df.isna().sum()

### 7. Outliers Treatment

To check for the presence of outliers, we draw a box plot of each numeric variable of interest (`loan_amount` and `rate`).

In [None]:
# Box plot of loan_amount, used to look for outliers.
df['loan_amount'].plot.box()
plt.show()

In [None]:
# Box plot of rate, used to look for outliers.
df['rate'].plot.box()
plt.show()

### 8. Transformation

### 8a. SQRT transformation

In [None]:
# Square-root transform of rate, written as a power of 0.5.
# NOTE(review): this holds the same values as the 'sqrt_rate' column created in
# the next cell — consider keeping only one of the two.
df['SQRT_RATE'] = df['rate'].pow(0.5)

In [None]:
# Square-root transform of rate via NumPy's sqrt ufunc.
df['sqrt_rate'] = df['rate'].pipe(np.sqrt)

In [None]:
# Preview the first 5 rows, now including the square-root columns.
df.head(n=5)

In [None]:
# Compare skewness and kurtosis of the original rate with its square-root transform.
# The transformed values are read from 'sqrt_rate' — the same column the
# distribution plot below uses — so the statistics and the plot describe the same data.
print("The skewness of the original data is {}".format(df['rate'].skew()))
print('The skewness of the SQRT transformed data is {}'.format(df['sqrt_rate'].skew()))

print('')

print("The kurtosis of the original data is {}".format(df['rate'].kurt()))
print("The kurtosis of the SQRT transformed data is {}".format(df['sqrt_rate'].kurt()))

In [None]:
# Plot the distribution of the original rate (left) next to its square-root
# transform (right).
# sns.distplot is deprecated; histplot with a density-scaled histogram and a KDE
# overlay is the replacement. cut=3 lets the KDE curve extend past the data range,
# as distplot's curve did.

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

sns.histplot(df['rate'], kde=True, stat='density', kde_kws=dict(cut=3), ax=axes[0])
axes[0].set_title('Original rate')

sns.histplot(df['sqrt_rate'], kde=True, stat='density', kde_kws=dict(cut=3), ax=axes[1])
axes[1].set_title('Square-root transformed rate')

plt.show()


### 8b. Log Transformation

In [None]:
# Natural-log transform of rate.
# NOTE(review): np.log yields -inf for 0 and NaN for negative inputs — assumes
# every rate is strictly positive; confirm against the data.
df['Log Rate'] = df['rate'].pipe(np.log)

In [None]:
# Preview the first 5 rows, now including the 'Log Rate' column.
df.head(n=5)