In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
# Load the dataset
df = pd.read_csv('C:\\Users\\Twisha Bose\\Desktop\\Fraud Credit Card.csv')

# Display the first few rows to understand the dataset
print(df.head())


   Unnamed: 0 trans_date_trans_time        cc_num  \
0           0      21-06-2020 12:14  2.291160e+15   
1           1      21-06-2020 12:14  3.573030e+15   
2           2      21-06-2020 12:14  3.598220e+15   
3           3      21-06-2020 12:15  3.591920e+15   
4           4      21-06-2020 12:15  3.526830e+15   

                               merchant        category    amt   first  \
0                 fraud_Kirlin and Sons   personal_care   2.86    Jeff   
1                  fraud_Sporer-Keebler   personal_care  29.84  Joanne   
2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28  Ashley   
3                     fraud_Haley Group        misc_pos  60.05   Brian   
4                 fraud_Johnston-Casper          travel   3.19  Nathan   

       last gender                       street  ...      lat      long  \
0   Elliott      M            351 Darlene Green  ...  33.9659  -80.9355   
1  Williams      F             3638 Marsh Union  ...  40.3207 -110.4360   
2     Lopez

In [3]:
# Drop unnecessary columns
df_cleaned = df.drop(columns=['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'first', 'last', 'street', 
                              'city', 'state', 'zip', 'dob', 'trans_num'])

# Convert categorical columns to dummy variables
df_cleaned = pd.get_dummies(df_cleaned, columns=['merchant', 'category', 'gender', 'job'])

# Check the cleaned dataframe
print(df_cleaned.head())


     amt      lat      long  city_pop   unix_time  merch_lat  merch_long  \
0   2.86  33.9659  -80.9355    333497  1371816865  33.986391  -81.200714   
1  29.84  40.3207 -110.4360       302  1371816873  39.450498 -109.960431   
2  41.28  40.6729  -73.5365     34496  1371816893  40.495810  -74.196111   
3  60.05  28.5697  -80.8191     54767  1371816915  28.812398  -80.883061   
4   3.19  44.2529  -85.0170      1126  1371816917  44.959148  -85.884734   

   is_fraud  merchant_fraud_Abbott-Rogahn  merchant_fraud_Abbott-Steuber  ...  \
0         0                         False                          False  ...   
1         0                         False                          False  ...   
2         0                         False                          False  ...   
3         0                         False                          False  ...   
4         0                         False                          False  ...   

   job_Video editor  job_Visual merchandiser  job_Volunt

In [4]:
# Define the feature variables (X) and the target variable (y)
X = df_cleaned.drop('is_fraud', axis=1)
y = df_cleaned['is_fraud']


In [5]:
# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the train and test sets
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


Training set: (444575, 1194), Test set: (111144, 1194)


In [6]:
# Instantiate the Decision Tree model
model = DecisionTreeClassifier(random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)


In [7]:
# Make predictions on the test set
y_pred = model.predict(X_test)


In [8]:
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    110718
           1       0.63      0.58      0.60       426

    accuracy                           1.00    111144
   macro avg       0.81      0.79      0.80    111144
weighted avg       1.00      1.00      1.00    111144

Confusion Matrix:
[[110575    143]
 [   181    245]]
