Maximize Fraud Detection Accuracy: The primary objective is to accurately identify fraudulent cases within the dataset. By building robust decision tree and random forest models, the aim is to maximize the accuracy of fraud detection, thereby reducing financial losses and maintaining trust with stakeholders.



Minimize False Positives: While detecting fraud is essential, minimizing false positives is equally important to avoid unnecessary investigations and customer inconvenience. By optimizing the models to reduce false positive rates, the company can allocate resources more efficiently and enhance customer satisfaction.

In [30]:
"""
Constraints :
    'Undergrad', 
    'Marital.Status', 
    'Taxable.Income', 
    'City.Population',
    'Work.Experience', 
    'Urban'

"""

"\nConstraints :\n    'Undergrad', \n    'Marital.Status', \n    'Taxable.Income', \n    'City.Population',\n    'Work.Experience', \n    'Urban'\n\n"

In [2]:
import pandas as pd 
import numpy as np

In [3]:
df= pd.read_csv("D:\\1-Data Science\\6 - Machine Learning\\Decision Tree\\Data\\Fraud_check.csv")

In [4]:
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [5]:
df.columns

Index(['Undergrad', 'Marital.Status', 'Taxable.Income', 'City.Population',
       'Work.Experience', 'Urban'],
      dtype='object')

In [6]:
df['Taxable.Income'] = df['Taxable.Income'].replace({lambda x:x<30000:'Risky', lambda y: y>30000:'Good'})
                                                    

In [7]:
bin_edge = [float('-inf'), 30000, float('inf')]
bin_label = ['Risky','Good']

In [8]:
df['Taxable.Income'] = pd.cut(df['Taxable.Income'], bins=bin_edge, labels=bin_label, right = False)

In [9]:
df['Taxable.Income']

0      Good
1      Good
2      Good
3      Good
4      Good
       ... 
595    Good
596    Good
597    Good
598    Good
599    Good
Name: Taxable.Income, Length: 600, dtype: category
Categories (2, object): ['Risky' < 'Good']

In [10]:
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,Good,50047,10,YES
1,YES,Divorced,Good,134075,18,YES
2,NO,Married,Good,160205,30,YES
3,YES,Single,Good,193264,15,YES
4,NO,Married,Good,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,Good,39492,7,YES
596,YES,Divorced,Good,55369,2,YES
597,NO,Divorced,Good,154058,0,YES
598,YES,Married,Good,180083,17,NO


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Undergrad        600 non-null    object  
 1   Marital.Status   600 non-null    object  
 2   Taxable.Income   600 non-null    category
 3   City.Population  600 non-null    int64   
 4   Work.Experience  600 non-null    int64   
 5   Urban            600 non-null    object  
dtypes: category(1), int64(2), object(3)
memory usage: 24.3+ KB


In [12]:
df.describe()

Unnamed: 0,City.Population,Work.Experience
count,600.0,600.0
mean,108747.368333,15.558333
std,49850.075134,8.842147
min,25779.0,0.0
25%,66966.75,8.0
50%,106493.5,15.0
75%,150114.25,24.0
max,199778.0,30.0


In [13]:
input = df.drop('Taxable.Income', axis = 'columns')
target = df['Taxable.Income']

In [14]:
input


Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban
0,NO,Single,50047,10,YES
1,YES,Divorced,134075,18,YES
2,NO,Married,160205,30,YES
3,YES,Single,193264,15,YES
4,NO,Married,27533,28,NO
...,...,...,...,...,...
595,YES,Divorced,39492,7,YES
596,YES,Divorced,55369,2,YES
597,NO,Divorced,154058,0,YES
598,YES,Married,180083,17,NO


In [15]:
target

0      Good
1      Good
2      Good
3      Good
4      Good
       ... 
595    Good
596    Good
597    Good
598    Good
599    Good
Name: Taxable.Income, Length: 600, dtype: category
Categories (2, object): ['Risky' < 'Good']

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
input.columns

Index(['Undergrad', 'Marital.Status', 'City.Population', 'Work.Experience',
       'Urban'],
      dtype='object')

In [18]:
underground_n = LabelEncoder()
maritial_n = LabelEncoder()
urban = LabelEncoder()
population = LabelEncoder()
experience = LabelEncoder()

In [19]:
input['Undergrad_n'] = underground_n.fit_transform(input['Undergrad'])
input['Marital_Status_n'] = maritial_n.fit_transform(input['Marital.Status'])
input['Urban_n'] = urban.fit_transform(input['Urban'])
input['City_Population_n'] = population.fit_transform(input['City.Population'])
input['Work_Experience_n'] = experience.fit_transform(input['Work.Experience'])

In [20]:
input

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Undergrad_n,Marital_Status_n,Urban_n,City_Population_n,Work_Experience_n
0,NO,Single,50047,10,YES,0,2,1,84,10
1,YES,Divorced,134075,18,YES,1,0,1,398,18
2,NO,Married,160205,30,YES,0,1,1,481,30
3,YES,Single,193264,15,YES,1,2,1,574,15
4,NO,Married,27533,28,NO,0,1,0,4,28
...,...,...,...,...,...,...,...,...,...,...
595,YES,Divorced,39492,7,YES,1,0,1,55,7
596,YES,Divorced,55369,2,YES,1,0,1,107,2
597,NO,Divorced,154058,0,YES,0,0,1,459,0
598,YES,Married,180083,17,NO,1,1,0,533,17


In [21]:
input_n = input.drop(['Undergrad', 'Marital.Status', 'Urban', 'City.Population',
       'Work.Experience'], axis = 'columns')
input_n

Unnamed: 0,Undergrad_n,Marital_Status_n,Urban_n,City_Population_n,Work_Experience_n
0,0,2,1,84,10
1,1,0,1,398,18
2,0,1,1,481,30
3,1,2,1,574,15
4,0,1,0,4,28
...,...,...,...,...,...
595,1,0,1,55,7
596,1,0,1,107,2
597,0,0,1,459,0
598,1,1,0,533,17


In [22]:
target

0      Good
1      Good
2      Good
3      Good
4      Good
       ... 
595    Good
596    Good
597    Good
598    Good
599    Good
Name: Taxable.Income, Length: 600, dtype: category
Categories (2, object): ['Risky' < 'Good']

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train , X_test, y_train, y_test = train_test_split(input_n, target, test_size = 0.2)

In [25]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
model = DecisionTreeClassifier()

In [27]:
model.fit(X_train, y_train)

In [28]:
model.score(X_test, y_test)

0.7583333333333333

In [29]:
pred = model.predict(X_test)
pred

array(['Risky', 'Good', 'Good', 'Risky', 'Good', 'Risky', 'Good', 'Risky',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Risky', 'Good', 'Risky', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Good', 'Good', 'Good', 'Risky',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Good', 'Good', 'Good', 'Risky',
       'Risky', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good',
       'Risky', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Good', 'Good', 'Good', 'Go