In [30]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from pickle import dump

## 1. Data loading

In [3]:
data = pd.read_csv('Datasets/Fraud_check.csv')
data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


## 2.Data Analysis

In [4]:
data.shape

(600, 6)

In [5]:
data.dtypes

Undergrad          object
Marital.Status     object
Taxable.Income      int64
City.Population     int64
Work.Experience     int64
Urban              object
dtype: object

In [6]:
data.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

## 3.Data Preprocessing

In [7]:
data['Status'] = data['Taxable.Income'].apply(lambda x : 'Risky' if x<=30000  else 'Good')
data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Status
0,NO,Single,68833,50047,10,YES,Good
1,YES,Divorced,33700,134075,18,YES,Good
2,NO,Married,36925,160205,30,YES,Good
3,YES,Single,50190,193264,15,YES,Good
4,NO,Married,81002,27533,28,NO,Good


In [8]:
data.rename({'Undergrad':'graduation','Marital.Status':'Marital_Status','Taxable.Income':'Taxable_Income'
            ,'City.Population':'City_Population','Work.Experience':'Work_Experience'},axis=1,inplace=True)

In [9]:
data = pd.get_dummies(data=data,columns=['graduation','Marital_Status','Urban'])

In [12]:
lbl_encode = LabelEncoder()
data['Status'] = lbl_encode.fit_transform(data['Status'])
data.head()

Unnamed: 0,Taxable_Income,City_Population,Work_Experience,Status,graduation_NO,graduation_YES,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Urban_NO,Urban_YES
0,68833,50047,10,0,1,0,0,0,1,0,1
1,33700,134075,18,0,0,1,1,0,0,0,1
2,36925,160205,30,0,1,0,0,1,0,0,1
3,50190,193264,15,0,0,1,0,0,1,0,1
4,81002,27533,28,0,1,0,0,1,0,1,0


In [13]:
data.isna().sum()

Taxable_Income             0
City_Population            0
Work_Experience            0
Status                     0
graduation_NO              0
graduation_YES             0
Marital_Status_Divorced    0
Marital_Status_Married     0
Marital_Status_Single      0
Urban_NO                   0
Urban_YES                  0
dtype: int64

## Spliting X and y

In [14]:
X = data.drop(labels=('Status'),axis=1)
y = data[['Status']]

In [15]:
X.head()

Unnamed: 0,Taxable_Income,City_Population,Work_Experience,graduation_NO,graduation_YES,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Urban_NO,Urban_YES
0,68833,50047,10,1,0,0,0,1,0,1
1,33700,134075,18,0,1,1,0,0,0,1
2,36925,160205,30,1,0,0,1,0,0,1
3,50190,193264,15,0,1,0,0,1,0,1
4,81002,27533,28,1,0,0,1,0,1,0


In [16]:
y.head()

Unnamed: 0,Status
0,0
1,0
2,0
3,0
4,0


## train test split

In [20]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,shuffle=True,random_state=37,stratify=y)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((510, 10), (90, 10), (510, 1), (90, 1))

## Base model
### 4.1 Model building ad training

In [24]:
base_model = RandomForestClassifier()
base_model.fit(X_train,y_train)

RandomForestClassifier()

## 4.2 Model testing and evaluation

In [25]:
y_pred = base_model.predict(X_test)

In [27]:
accuracy_score(y_test,y_pred)

1.0

In [28]:
confusion_matrix(y_test,y_pred)

array([[71,  0],
       [ 0, 19]])

In [29]:
print('\n',classification_report(y_test,y_pred))


               precision    recall  f1-score   support

           0       1.00      1.00      1.00        71
           1       1.00      1.00      1.00        19

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90



## since base model is giving the best predictions, there is no hyperparameter tuning required

## model deployment

In [31]:
dump(base_model,open('fraud_check.pkl','wb'))