<a href="https://colab.research.google.com/github/vyshnxviii/CodSoft1/blob/main/Credit_Card_Fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**TASK 2** - CREDIT CARD FRAUD DETECTION

In [8]:
#import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Load the training dataset
train_data = pd.read_csv('/content/fraudTrain.csv')

# Load the testing dataset
test_data = pd.read_csv('/content/fraudTest.csv')

# Define the list of non-numeric columns to drop
non_numeric_columns = ['trans_date_trans_time', 'merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num']

# Drop unnecessary columns and the target variable 'is_fraud'
train_data = train_data.drop(['Unnamed: 0'] + non_numeric_columns , axis=1)
test_data = test_data.drop(['Unnamed: 0'] + non_numeric_columns , axis=1)

# Handle missing values in both datasets
imputer = SimpleImputer(strategy='mean')
numeric_cols = train_data.select_dtypes(include=['int64', 'float64']).columns
train_data[numeric_cols] = imputer.fit_transform(train_data[numeric_cols])
test_data[numeric_cols] = imputer.transform(test_data[numeric_cols])

# Encode categorical variables using Label Encoding
label_encoder = LabelEncoder()
categorical_cols = train_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    train_data[col] = label_encoder.fit_transform(train_data[col])
    test_data[col] = label_encoder.transform(test_data[col])

# Ensure the datasets have the same columns by taking their intersection
common_columns = train_data.columns.intersection(test_data.columns)
train_data = train_data[common_columns]
test_data = test_data[common_columns]

# Split into features (X) and the target variable (y)
target_column_name = 'is_fraud'
X_train = train_data.drop(columns=target_column_name)
y_train = train_data[target_column_name]
X_test = test_data.drop(target_column_name, axis=1)
y_test = test_data[target_column_name]

# Scale numeric features
numeric_cols = [col for col in X_train.columns if col != 'is_fraud']
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Assuming the continuous 'is_fraud' values range from 0 to 1
y_train = (y_train > 0.5).astype(int)
y_test = (y_test > 0.5).astype(int)

# Train Logistic Regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# Train Decision Tree model
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)

# Train Random Forest model
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, y_train)

# Make predictions on the test data for all models
y_pred_logistic = logistic_model.predict(X_test)
y_pred_decision_tree = decision_tree_model.predict(X_test)
y_pred_random_forest = random_forest_model.predict(X_test)

# Model evaluation
def evaluate_model(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)

    print(f"Model: {model_name}")
    print("Accuracy: {:.2f}".format(accuracy))
    print("Precision: {:.2f}".format(precision))
    print("Recall: {:.2f}".format(recall))
    print("F1 Score: {:.2f}".format(f1))
    print("ROC AUC: {:.2f}".format(roc_auc))
    print("Confusion Matrix:")
    print(conf_matrix)
    print()

evaluate_model(y_test, y_pred_logistic, "Logistic Regression")
evaluate_model(y_test, y_pred_decision_tree, "Decision Tree")
evaluate_model(y_test, y_pred_random_forest, "Random Forest")

# Print the predicted results
predicted_results = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred_logistic
})

print("Predicted Results:")
print(predicted_results)

Model: Logistic Regression
Accuracy: 1.00
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
ROC AUC: 0.50
Confusion Matrix:
[[470687    134]
 [  2059      0]]

Model: Decision Tree
Accuracy: 0.99
Precision: 0.20
Recall: 0.23
F1 Score: 0.21
ROC AUC: 0.61
Confusion Matrix:
[[468924   1897]
 [  1584    475]]

Model: Random Forest
Accuracy: 1.00
Precision: 0.49
Recall: 0.05
F1 Score: 0.09
ROC AUC: 0.52
Confusion Matrix:
[[470715    106]
 [  1958    101]]

Predicted Results:
        Actual  Predicted
0            0          0
1            0          0
2            0          0
3            0          0
4            0          0
...        ...        ...
472875       0          0
472876       0          0
472877       0          0
472878       0          0
472879       0          0

[472880 rows x 2 columns]
