# EDA Template
## Import packages & Data

In [None]:
import io

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import sweetviz as sv
from scipy import stats

In [None]:
# Read the raw dataset. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, which avoids mixed-dtype inference per column.
df = pd.read_csv(filepath_or_buffer='something.csv', low_memory=False)

## Data Processing

In [None]:
# Keep only the first 50 columns.
sdf = df.iloc[:, :50]

# Lowercase the column names for easy referencing.
sdf.columns = sdf.columns.str.lower()

# Drop duplicated column names, keeping the first occurrence. This runs before
# the row filter because lowercasing can create duplicates, and selecting a
# duplicated label returns a DataFrame instead of a Series, which would break
# the boolean mask below.
sdf = sdf.loc[:, ~sdf.columns.duplicated()]

# Keep only rows with sales strictly above 0 (rows with missing sales are
# removed too, since NaN > 0 is False). The explicit copy makes `sdf` an
# independent frame so the column assignments below do not raise
# SettingWithCopyWarning.
sdf = sdf.loc[sdf['sales'] > 0].copy()

# Add a few example features ('variable' is presumably a placeholder name).
sdf['variable'] = sdf['variable'].notnull()      # flag: value is present
sdf['is variable'] = sdf['variable'] >= 1        # flag: value is at least 1
sdf['variable'] = np.log(sdf['variable'] + 1)    # log(x + 1) transform
sdf['variable'] = sdf['variable'].clip(upper=15)  # cap the value at 15

## Sweetviz

In [None]:
# Build the Sweetviz report for sdf, using "variable" as the target feature,
# then render the report inline in the notebook.
my_report = sv.analyze(sdf, target_feat="variable")
my_report.show_notebook()

## Data Cleanup / Imputation

In [None]:
# Encode the category labels as sequential integers.
# Labels that are not keys of the mapping become NaN.
codes = {
    "startup": 0,
    "six_month": 1,
    "one_year": 2,
    "three_year": 3,
    "five_year": 4,
    "more": 5,
}
sdf['blah'] = sdf['blah'].map(codes)

# Impute missing values with 0.
sdf['v1'] = sdf['v1'].fillna(0)

# Impute missing values with the column's own median: the same column is read,
# filled and written back, so no other column's data is overwritten.
sdf['variable'] = sdf['variable'].fillna(sdf['variable'].median())


## Prep for modeling

In [None]:
# Drop unwanted features and the target variable, so the target cannot leak
# into the design matrix or into cont_features.
X = sdf.drop(columns=['blah', 'dependent_variable'])

# Define categorical (incl. boolean) and continuous features.
cat_features = ['color']
cont_features = X.drop(columns=cat_features).columns.tolist()

# Add the intercept column and pick out the target.
X = sm.add_constant(X)
y = sdf['dependent_variable']

# One-hot encode the categorical features, dropping the first level of each to
# avoid perfect collinearity with the intercept. dtype=float keeps the dummies
# numeric: pandas >= 2.0 emits bool dummies by default, which statsmodels
# cannot combine with the float columns.
X = pd.get_dummies(X, columns=cat_features, drop_first=True, dtype=float)

## Correlation

In [None]:
# Five features most strongly correlated (by absolute value) with 'variable'.
# - Only numeric/boolean columns are correlated; pandas >= 2.0 raises on
#   non-numeric columns instead of silently skipping them.
# - The self-correlation is removed by label rather than by assuming it sorts
#   first, which is not guaranteed when another feature also has |r| == 1.
matrix = (
    X.select_dtypes(include=['number', 'bool'])
    .corr()['variable']
    .drop('variable')
    .sort_values(key=abs, ascending=False)
    .head(5)
    .to_frame()
)
cmap = sns.diverging_palette(220, 20, as_cmap=True)
f, ax = plt.subplots(figsize=(1, 1.5))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(matrix, vmax=.5, vmin=-.5, annot=True, cmap=cmap, ax=ax)

In [None]:
# Pairwise correlations across the numeric and boolean columns of sdf.
# Non-numeric columns are excluded explicitly because pandas >= 2.0 raises on
# them instead of skipping them.
matrix = sdf.select_dtypes(include=['number', 'bool']).corr()

# Trailing semicolon suppresses the Axes repr in the notebook output.
sns.heatmap(matrix, vmax=.8, square=True, cmap='BuPu', annot=True);

## Univariate Regressions

In [None]:
# Column names used by the univariate regression:
# independent variable (ind) and dependent variable (dep).
ind, dep = 'days to submit application', 'is loan repaid'

In [None]:
# Design matrix: the chosen independent variable plus an intercept column.
X = sm.add_constant(sdf[ind])
# Alternative with several independent variables:
#X = sm.add_constant(sdf[['loan amount (usd, final)', 'required pfp lenders']])

# One-hot encode the independent variable if it is categorical (numeric
# columns pass through unchanged). dtype=float keeps the dummies numeric:
# pandas >= 2.0 emits bool dummies by default, and mixing those with the float
# constant column makes statsmodels reject the design matrix as object dtype.
X = pd.get_dummies(X, drop_first=True, dtype=float)
y = sdf[dep]
model = sm.OLS(y, X).fit()
model.summary()

# Optional diagnostics:
#f"Feature {ind} has a p-value of {model.pvalues[ind]} and coefficient of {model.params[ind]}."
# print(model.params[ind])
# print(model.rsquared)
#print(model.pvalues)
# print(sdf[dep].corr(X[ind]))

## Define custom continuous variable plotting function

In [None]:
def cont_plot(dep, ind, df):
    """Plot overlaid KDEs of a continuous variable, split by a binary outcome.

    Draws one filled kernel density estimate of ``df[ind]`` for the rows where
    ``df[dep] == 1`` (labelled 'loan repaid') and one for the rows where
    ``df[dep] == 0`` (labelled 'not repaid') on shared axes, then shows the
    figure.

    Args:
        dep: Name of the dependent-variable column; compared against 1 and 0.
        ind: Name of the column plotted on the x axis.
        df: DataFrame containing both columns.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Note:
        Calls ``sns.set_theme``, which also changes the plotting style for
        figures drawn afterwards.
    """
    custom_params = {"axes.spines.right": False, "axes.spines.top": False}
    sns.set_theme(style="white", rc=custom_params, font_scale=2)

    # (value of dep, fill colour, legend label) for each density curve.
    groups = (
        (1, 'royalblue', 'loan repaid'),
        (0, 'orange', 'not repaid'),
    )

    fig, ax = plt.subplots(figsize=(12, 6))
    for outcome, color, label in groups:
        sns.kdeplot(data=df[df[dep] == outcome], x=ind, color=color,
                    label=label, fill=True, ax=ax)

    # Hide the y tick labels; only the shape of the curves is shown.
    ax.set(yticklabels=[])
    # x axis runs from 0 to just past the largest observed value.
    ax.set_xlim(0, df[ind].max() + 1)
    ax.legend()
    fig.tight_layout()
    plt.show()

In [None]:
# Plot every continuous feature against the dependent variable. The loop
# variable is deliberately not named `ind`, so the notebook-level `ind` chosen
# for the univariate regression is not overwritten by this cell.
for feature in cont_features:
    if feature == dep:
        # Plotting the dependent variable against itself carries no information.
        continue
    cont_plot(dep, feature, sdf)


## Custom Categorical Plotting function

In [None]:
def cat_plot(dep, ind, df, max_recs=50):
    """Plot per-category relative frequencies, split by a binary outcome.

    For the rows where ``df[dep] == 1`` (labelled 'loan repaid') and the rows
    where ``df[dep] == 0`` (labelled 'not repaid'), computes the share of rows
    falling in each category of ``df[ind]`` and draws the two sets of shares as
    grouped bars with a percentage y axis, then shows the figure.

    Args:
        dep: Name of the dependent-variable column; compared against 1 and 0.
        ind: Name of the categorical column plotted on the x axis.
        df: DataFrame containing both columns.
        max_recs: Accepted for interface compatibility but not used by the
            body. NOTE(review): presumably meant to cap the number of
            categories plotted -- confirm before implementing.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Note:
        Calls ``sns.set_theme``, which also changes the plotting style for
        figures drawn afterwards.
    """
    custom_params = {"axes.spines.right": False, "axes.spines.top": False}
    sns.set_theme(style="white", rc=custom_params, font_scale=2)

    # Build one long-format frame with a row per (category, outcome group).
    parts = []
    for outcome, label in ((1, 'loan repaid'), (0, 'not repaid')):
        # Use the frame passed in, not a notebook-level global.
        group = df[df[dep] == outcome]
        # Share of the group's rows in each category. The denominator is the
        # full group size, so rows with a missing category still count.
        share = group[ind].value_counts() / len(group)
        parts.append(pd.DataFrame({
            ind: share.index,
            'pct': share.to_numpy(),
            dep: label,
        }))
    result = pd.concat(parts, ignore_index=True)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(data=result, x=ind, y='pct', hue=dep, ax=ax)

    ax.legend()
    # 'pct' holds fractions of 1, so xmax=1.0 renders 0.25 as "25%".
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    fig.tight_layout()
    plt.show()