---
## Random Forest<br><font size=3>Dataset: fraud_check.csv</font><br><font size=2>Problem Statement:<br>Use random forest and ensemble methods to build a model on the fraud data, treating those who have taxable_income <= 30000 as "Risky" and all others as "Good".</font>
---

In [71]:
import pandas as pd
import numpy as np
import warnings
# Silences every warning for the rest of the session, including library
# deprecation notices. NOTE(review): consider narrowing this filter.
warnings.filterwarnings("ignore")

In [72]:
# Load the raw fraud-check records.
df = pd.read_csv('fraud_check.csv')

# Build the categorical target required by the problem statement:
# Taxable.Income <= 30000 -> 'Risky', anything above -> 'Good'.
income_edges = [0, 30000, np.inf]
income_labels = ['Risky', 'Good']
df['Category'] = pd.cut(
    df['Taxable.Income'],
    bins=income_edges,
    labels=income_labels,
    include_lowest=True,
)
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Category
0,NO,Single,68833,50047,10,YES,Good
1,YES,Divorced,33700,134075,18,YES,Good
2,NO,Married,36925,160205,30,YES,Good
3,YES,Single,50190,193264,15,YES,Good
4,NO,Married,81002,27533,28,NO,Good
...,...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES,Good
596,YES,Divorced,69967,55369,2,YES,Good
597,NO,Divorced,47334,154058,0,YES,Good
598,YES,Married,98592,180083,17,NO,Good


In [73]:
# Count missing values per column (isna is the pandas alias of isnull).
df.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
Category           0
dtype: int64

In [74]:
# One-hot encode the target and the categorical predictors. drop_first=True keeps
# k-1 indicator columns per variable, so the target becomes `Category_Good`.
# dtype=int is explicit because pandas >= 2.0 emits boolean dummies by default and
# the min-max scaling applied later subtracts column minima, which is not defined
# for boolean data.
df = pd.get_dummies(
    df,
    columns=["Category", "Undergrad", "Marital.Status", "Urban"],
    drop_first=True,
    dtype=int,
)

In [75]:
# Display the frame after one-hot encoding to check the new indicator columns.
df

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Category_Good,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES
0,68833,50047,10,1,0,0,1,1
1,33700,134075,18,1,1,0,0,1
2,36925,160205,30,1,0,1,0,1
3,50190,193264,15,1,1,0,1,1
4,81002,27533,28,1,0,1,0,0
...,...,...,...,...,...,...,...,...
595,76340,39492,7,1,1,0,0,1
596,69967,55369,2,1,1,0,0,1
597,47334,154058,0,1,0,0,0,1
598,98592,180083,17,1,1,1,0,0


In [76]:
# Normalization function
def norm_func(i):
    """Min-max scale `i` to the [0, 1] range.

    Parameters
    ----------
    i : pandas.DataFrame or pandas.Series
        Numeric data. A DataFrame is scaled column by column.

    Returns
    -------
    Same kind of object as `i`
        ``(i - min) / (max - min)``. Constant input (max == min) is mapped
        to 0 instead of NaN.
    """
    lowest = i.min()
    value_range = i.max() - lowest
    # A constant column has a zero range and 0/0 would produce NaN; dividing
    # by 1 instead keeps such columns at 0.
    if isinstance(value_range, pd.Series):
        value_range = value_range.where(value_range != 0, 1)
    elif value_range == 0:
        value_range = 1
    x = (i - lowest) / value_range
    return x

In [77]:
# Scale every column except Taxable.Income to [0, 1]. Taxable.Income is removed by
# name because `Category` was derived from it, so keeping it would leak the target;
# a positional slice would silently pick the wrong column if the order changed.
df_norm = norm_func(df.drop(columns=['Taxable.Income']))
df_norm.tail(10)

Unnamed: 0,City.Population,Work.Experience,Category_Good,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES
590,0.341473,0.466667,1.0,0.0,1.0,0.0,1.0
591,0.615406,0.6,0.0,1.0,0.0,1.0,1.0
592,0.283703,0.533333,1.0,1.0,0.0,1.0,0.0
593,0.610256,0.333333,1.0,0.0,0.0,0.0,1.0
594,0.412341,0.3,1.0,0.0,1.0,0.0,1.0
595,0.078811,0.233333,1.0,1.0,0.0,0.0,1.0
596,0.170058,0.066667,1.0,1.0,0.0,0.0,1.0
597,0.73724,0.0,1.0,0.0,0.0,0.0,1.0
598,0.88681,0.566667,1.0,1.0,1.0,0.0,0.0
599,0.760683,0.533333,1.0,0.0,0.0,0.0,0.0


In [78]:
# Random Forest Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [79]:
# Separate the predictors from the binary target column.
X = df_norm.drop(columns=['Category_Good'])
Y = df_norm.loc[:, 'Category_Good']

### Let's see whether we can increase the CV score using ensemble techniques<br><br>Bagging

In [81]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

seed = 7

# `num_trees` and `kfold` are used by every model in this notebook, but no cell
# defines them (the execution counter skips In [80]), so a fresh
# "Restart & Run All" stops here with a NameError. Define them explicitly.
# NOTE(review): the values from the missing cell are unknown. 100 estimators and
# shuffled 10-fold CV are assumptions to confirm; saved scores may shift slightly.
num_trees = 100
# shuffle=True is required by current scikit-learn whenever random_state is set.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

cart = DecisionTreeClassifier()
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# while the first positional slot works on both sides of the rename.
model1 = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)
results1 = cross_val_score(model1, X, Y, cv=kfold)
print(results1.mean()*100)

74.33333333333333


### Boosting

In [82]:
# AdaBoost Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

# Boosted ensemble scored with the same cross-validation splitter as the other models.
model2 = AdaBoostClassifier(random_state=seed, n_estimators=num_trees)
results2 = cross_val_score(model2, X, y=Y, cv=kfold)
print(100 * results2.mean())

77.5


### Stacking (implemented here as a voting ensemble)

In [83]:
# Stacking Ensemble for Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

### Iteration-1

In [84]:
# create the sub models
estimators = []
model3 = LogisticRegression(max_iter=500)
estimators.append(('logistic', model3))
# Seed the standalone tree so repeated runs give the same CV score.
model4 = DecisionTreeClassifier(random_state=seed)
estimators.append(('cart', model4))
model5 = SVC()
estimators.append(('svm', model5))
# Base learner passed positionally: the keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 and the old name was removed in 1.4.
model6 = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)
estimators.append(('bagging', model6))
model7 = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
estimators.append(('boosting', model7))

# create the ensemble model (five voters, default voting settings)
ensemble = VotingClassifier(estimators)
results3 = cross_val_score(ensemble, X, Y, cv=kfold)
print(results3.mean()*100)

78.99999999999999


### Iteration-2

In [85]:
# create the sub models
estimators = []
model8 = LogisticRegression(max_iter=500)
estimators.append(('logistic', model8))
# Seed the standalone tree so repeated runs give the same CV score.
model9 = DecisionTreeClassifier(random_state=seed)
estimators.append(('cart', model9))
# Base learner passed positionally: the keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 and the old name was removed in 1.4.
model10 = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)
estimators.append(('bagging', model10))
model11 = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
estimators.append(('boosting', model11))

# create the ensemble model (same as iteration 1 without the SVM)
# NOTE(review): an even number of voters can tie under majority voting. Check how
# VotingClassifier resolves ties before comparing this score with the others.
ensemble = VotingClassifier(estimators)
results4 = cross_val_score(ensemble, X, Y, cv=kfold)
print(results4.mean()*100)

75.33333333333333


### Iteration-3

In [86]:
# create the sub models
estimators = []
model12 = LogisticRegression(max_iter=500)
estimators.append(('logistic', model12))
# Seed the standalone tree so repeated runs give the same CV score.
model13 = DecisionTreeClassifier(random_state=seed)
estimators.append(('cart', model13))
model14 = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
estimators.append(('boosting', model14))

# create the ensemble model (logistic regression + tree + boosting)
ensemble = VotingClassifier(estimators)
results5 = cross_val_score(ensemble, X, Y, cv=kfold)
print(results5.mean()*100)

78.33333333333333


### Iteration-4

In [87]:
# create the sub models
estimators = []
# Seed the standalone tree so repeated runs give the same CV score.
model15 = DecisionTreeClassifier(random_state=seed)
estimators.append(('cart', model15))
model16 = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
estimators.append(('boosting', model16))

# create the ensemble model (tree + boosting only)
# NOTE(review): with only two voters every disagreement is a tie under majority
# voting. Check how VotingClassifier resolves ties, as this likely explains the
# much lower score of this iteration.
ensemble = VotingClassifier(estimators)
results6 = cross_val_score(ensemble, X, Y, cv=kfold)
print(results6.mean()*100)

63.000000000000014


### Iteration-5

In [88]:
# create the sub models
estimators = []
model17 = LogisticRegression(max_iter=500)
estimators.append(('logistic', model17))
model18 = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
estimators.append(('boosting', model18))

# create the ensemble model (logistic regression + boosting only)
ensemble = VotingClassifier(estimators)
# Stored under its own name: this cell previously assigned to `results6`, which
# overwrote the scores of iteration 4.
results_iter5 = cross_val_score(ensemble, X, Y, cv=kfold)
print(results_iter5.mean()*100)

77.5


### Iteration-6

In [89]:
# create the sub models
estimators = []
# Base learner passed positionally: the keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 and the old name was removed in 1.4.
model19 = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)
estimators.append(('bagging', model19))
model20 = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
estimators.append(('boosting', model20))

# create the ensemble model (bagging + boosting only)
ensemble = VotingClassifier(estimators)
results7 = cross_val_score(ensemble, X, Y, cv=kfold)
print(results7.mean()*100)

73.16666666666667


### Since the CV score for iteration 1 was the highest, we can consider it our final model