## Project Guide

1. **[Import packages](#import_packages)**
2. **[Preprocessing](#Preprocessing)**
    1. **[Null Values Treatment](#null_values)**
    2. **[Upper Lower Capping](#capping)**

3. **[EDA](#EDA)**
4. **[Model Building](#Model_Building)**
5. **[Model Evaluation](#Model_Evaluation)**
6. **[Conclusion](#Conclusion)**

<a id='import_packages'></a>
## Data Loading And Packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read BankCreditCard.csv from the working directory as a comma-separated
# file; no column is promoted to the index.
data = pd.read_csv("BankCreditCard.csv", sep=",", index_col=None)
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Descriptive statistics for the columns. pandas exposes these through
# `describe()`; a DataFrame has no `summary()` method, so the previous call
# raised AttributeError.
data.describe()

<a id='Preprocessing'></a>
## Preprocessing

## Checking For NAs <a id='null_values'></a>

In [None]:
# Check which variables contain NA values.

# Starts out empty; nothing is appended to it in this cell.
drop_columns = []

def get_na(data):
    """Plot the number of missing values per column of ``data``.

    Only columns that contain at least one NA are shown, ordered from the
    fewest to the most missing values. When no column has missing values a
    message is printed instead and nothing is plotted.

    Args:
        data: Object providing ``isnull().sum()`` per column (the notebook
            passes its pandas DataFrame).

    Returns:
        None.
    """
    na_counts = data.isnull().sum()
    na_counts = na_counts[na_counts > 0]

    # Guard clause: nothing to plot when every column is complete.
    if na_counts.empty:
        print("No column have NA values")
        return

    na_counts.sort_values().plot.bar(figsize=(15, 4))
# Run the NA check on the loaded dataset.
get_na(data)

## Removing Variables That Have a Significant Number of NA Values

In [None]:
# Intended to hold the names of columns with too many NAs; left empty here,
# so nothing is dropped by this cell.
drop_columns = []

## Impute Values for NAs

In [None]:
# Impute missing values by propagating neighbouring observations within each
# column. The previous call passed `method=None` without a fill value, which
# pandas rejects ("Must specify a fill 'value' or 'method'"), and it filled
# along axis=1, i.e. across unrelated variables of the same row.
# The dedicated ffill/bfill methods are used because `fillna(method=...)` is
# deprecated in recent pandas releases.
data.ffill(inplace=True)  # forward fill: carry the last valid value downwards
data.bfill(inplace=True)  # backfill: cover NAs left at the top of a column

## Categorical and Numerical Variables Splitting

In [None]:
# Split the columns of `data` by dtype: object ("O") columns are treated as
# categorical, every other column as numeric.
# The previous code called a non-existent `select_type` on `x` and dropped
# from `y`, neither of which is defined yet at this point of the notebook.
data_categorical = data.select_dtypes(include="object")
data_numeric = data.drop(columns=data_categorical.columns)

<a id='capping'></a>
## Outlier Treatment

In [None]:
# Upper and lower capping for numerical attributes

def set_caping(column, capping):
    """Cap the outliers of one numeric column of the global ``data`` frame.

    The limits are the column mean plus/minus three standard deviations.
    Values beyond the selected limit(s) are replaced by the limit itself and
    the capped column is written back to ``data`` under ``column.name``.

    Args:
        column: A numeric column taken from ``data`` (e.g. ``data["age"]``).
        capping: Which side to cap: ``"both"``, ``"upper"`` or ``"lower"``.

    Returns:
        None. For any other ``capping`` value a usage hint is printed and
        ``data`` is left untouched.
    """
    mean = column.mean()  # Mean of the column
    std = column.std()  # Standard deviation of the column

    UCL = mean + 3 * std  # upper control limit
    LCL = mean - 3 * std  # lower control limit

    if capping == "both":
        capped = column.clip(lower=LCL, upper=UCL)
    elif capping == "upper":
        capped = column.clip(upper=UCL)
    elif capping == "lower":
        capped = column.clip(lower=LCL)
    else:
        print("Please enter proper value of capping parameter. \n Possible values:\tboth\tupper\tlower")
        return

    # Replace only this column. Assigning through a row mask on the whole
    # frame (data[mask] = limit) would overwrite every column of those rows.
    data[column.name] = capped

# set_caping(data["age"], capping = "both")
# data.boxplot(column ="age")

## Splitting x (Independent Variables) and y (Dependent Variable)

In [None]:
# The target (dependent variable) is the column named "y"; every remaining
# column is a feature. The column is dropped by its label: passing the Series
# `y` itself to `drop` would use the target's values as column labels.
y = data["y"]
x = data.drop(columns="y")

## One Hot Encoding for Categorical Variables

<a id='EDA'></a>
## EDA - Exploratory Data Analysis

## Correlation Plot For Numerical Data

In [None]:
# Remove columns that are strongly correlated with each other to avoid multicollinearity issues


## Checking for Normal Distribution

In [None]:
# NOTE(review): `original_data` and its `SalePrice` column are not defined in
# the visible part of this notebook; confirm the intended frame and column
# before running this cell.
# Original price
sale_price = original_data.SalePrice
# With square-root transformation
sqrt_price = np.sqrt(sale_price)
# With log transformation
log_price = np.log(sale_price)

# Compare the candidate transformations side by side to pick a suitable one.
# Only `matplotlib.pyplot` is imported (as `plt`), so the figure size is set
# through `plt.rcParams`, which is the same settings object as
# `matplotlib.rcParams`.
plt.rcParams['figure.figsize'] = (12.0, 6.0)
prices = pd.DataFrame({"Sale Price":sale_price,"Log Sale Price ":log_price, "SQRT Price":sqrt_price })
prices.hist()
plt.tight_layout()

<a id='Feature_Engineering'></a>
## Feature Engineering

<a id='Model_Building'></a>
## Model Building Process

In [None]:
# Divide the dataset into train and test parts
from sklearn.model_selection import train_test_split

# 30% of the rows go to the test part; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=0,
)

# Report the shape of each part.
print(
    "\nx_train ", x_train.shape,
    "\nx_test ", x_test.shape,
    "\ny_train ", y_train.shape,
    "\ny_test ", y_test.shape,
)

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training part, then predict the
# held-out rows. `fit` returns the estimator itself, so `model` is the fitted
# estimator either way.
model = LinearRegression()
model.fit(x_train, y_train)
predicted = model.predict(x_test)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings on the training
# part, then predict the held-out rows.
model = LogisticRegression()
model.fit(x_train, y_train)
# NOTE(review): the other model cells store predictions in `predicted`;
# this cell uses `predict`. Confirm which name later cells should read.
predict = model.predict(x_test)

In [None]:
# Decision Tree

# For Classification
from sklearn.tree import DecisionTreeClassifier

# Shallow tree (depth 3) that splits on information gain (entropy).
tree = DecisionTreeClassifier(criterion="entropy", max_depth=3, min_samples_split=2, min_samples_leaf=1)
# The split cell creates `x_train` (lower-case); `X_train` is never defined
# in this notebook and raised NameError.
model = tree.fit(x_train, y_train)
predicted = model.predict(x_test)

# For Regression

In [None]:
# Random Forest
# n_estimators is the number of trees in the forest (default: 100 since scikit-learn 0.22, 10 in earlier versions).

# For Classification
from sklearn.ensemble import RandomForestClassifier

# Forest of 50 trees using Gini impurity as the split criterion.
model = RandomForestClassifier(n_estimators=50, criterion='gini')
# The split cell creates `x_train` (lower-case); `X_train` is never defined
# in this notebook and raised NameError.
model.fit(x_train, y_train)

# For Regression

In [None]:
# SVM - Support Vector Machine

# For Classification
from sklearn import svm

# Support-vector classifier with a radial-basis-function kernel.
model = svm.SVC(kernel='rbf')
# The split cell creates `x_train` (lower-case); `X_train` is never defined
# in this notebook and raised NameError.
model = model.fit(x_train, y_train)
predicted = model.predict(x_test)

# For Regression

In [None]:
# ADABOOST - Adaptive Boosting


In [None]:
# Gradient Descent


In [None]:
# XGBoost - Extreme Gradient Boosting


In [None]:
# Time Series

<a id='Model_Evaluation'></a>
## Evaluating Model

<a id='Conclusion'></a>
## Conclusion