In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression  # For Linear Regression
from sklearn import metrics  # To evaluate the model
from sklearn.preprocessing import StandardScaler, LabelEncoder  # For scaling and encoding


In [9]:
# Load training and testing datasets
df_train = pd.read_csv('fraudTrain.csv')
df_test = pd.read_csv('fraudTest.csv')

# Add source column to distinguish datasets during preprocessing
df_train['source'] = 'train'
df_test['source'] = 'test'

# Combine datasets for consistent preprocessing
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)


In [10]:
# Drop unnecessary columns
df.drop(['Unnamed: 0', 'unix_time', 'trans_num'], axis=1, inplace=True)

# Handle 'dob' and calculate 'age'
if 'dob' in df.columns:
    from datetime import datetime
    df['dob'] = pd.to_datetime(df['dob'], errors='coerce')
    current_year = datetime.now().year
    df['age'] = current_year - df['dob'].dt.year
    df.drop(columns=['dob'], inplace=True)
else:
    print("Warning: 'dob' column not found in the dataset!")
    df['age'] = 40  # Placeholder if 'dob' or 'age' is missing

# Encode 'gender' column as binary
df['gender'] = df['gender'].apply(lambda x: 1 if x == 'M' else 0)

# One-hot encode columns with few categories
df = pd.get_dummies(df, columns=['state', 'category'], drop_first=True)

# Label encode columns with many unique values
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for col in ['merchant', 'first', 'last', 'street', 'city', 'job']:
    df[col] = label_encoder.fit_transform(df[col])

# Feature engineering: Add interaction features or create bins
df['age_gender_interaction'] = df['age'] * df['gender']
df['age_group'] = pd.cut(df['age'], bins=[0, 25, 40, 60, 100], labels=['<25', '25-40', '40-60', '60+'])
df = pd.get_dummies(df, columns=['age_group'], drop_first=True)

# Split the combined dataset back into training and testing datasets
df_train = df[df['source'] == 'train'].drop(columns=['source'])
df_test = df[df['source'] == 'test'].drop(columns=['source'])

# Separate features (X) and target (y)
X_train = df_train.drop(columns=['is_fraud'])
y_train = df_train['is_fraud']
X_test = df_test.drop(columns=['is_fraud'])
y_test = df_test['is_fraud']


In [12]:
# Check data types before scaling
print("Before scaling, data types:")
print(X_train.dtypes)

# Handle datetime column if it exists
if 'trans_date_trans_time' in X_train.columns:
    X_train['year'] = pd.to_datetime(X_train['trans_date_trans_time']).dt.year
    X_train['month'] = pd.to_datetime(X_train['trans_date_trans_time']).dt.month
    X_train.drop(columns=['trans_date_trans_time'], inplace=True)
    X_test['year'] = pd.to_datetime(X_test['trans_date_trans_time']).dt.year
    X_test['month'] = pd.to_datetime(X_test['trans_date_trans_time']).dt.month
    X_test.drop(columns=['trans_date_trans_time'], inplace=True)

# Encode categorical columns
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for col in X_train.select_dtypes(include=['object']).columns:
    X_train[col] = label_encoder.fit_transform(X_train[col])
    X_test[col] = label_encoder.transform(X_test[col])

# Check data types again
print("After preprocessing, data types:")
print(X_train.dtypes)

# Standardize features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Before scaling, data types:
trans_date_trans_time      object
cc_num                      int64
merchant                    int64
amt                       float64
first                       int64
                           ...   
category_travel              bool
age_gender_interaction      int64
age_group_25-40              bool
age_group_40-60              bool
age_group_60+                bool
Length: 84, dtype: object
After preprocessing, data types:
cc_num               int64
merchant             int64
amt                float64
first                int64
last                 int64
                    ...   
age_group_25-40       bool
age_group_40-60       bool
age_group_60+         bool
year                 int32
month                int32
Length: 85, dtype: object


In [14]:
from sklearn.preprocessing import StandardScaler

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [16]:
# Train the Linear Regression model
from sklearn.linear_model import LinearRegression
from sklearn import metrics

linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_test_predictions = linear_model.predict(X_test_scaled)

# Evaluate the model
mse = metrics.mean_squared_error(y_test, y_test_predictions)
r_squared = metrics.r2_score(y_test, y_test_predictions)

# Print the evaluation metrics
print("Mean Squared Error (MSE):", mse)
print("R-squared:", r_squared)


Mean Squared Error (MSE): 0.003736869382023784
R-squared: 0.02811377161426476


In [18]:
# Linear Regression Results and Observations

# The Linear Regression model was applied to predict the 'is_fraud' target variable.
# Results:
# - Mean Squared Error (MSE): 0.003736869382023784
# - R-squared: 0.02811377161426476

# Observations:
# 1. The R-squared value (~2.81%) indicates that the model explains only a small portion of the variance.
# 2. This result is expected because fraud detection often involves non-linear relationships 
#    that Linear Regression cannot capture effectively.
# 3. The dataset might have imbalanced classes (fewer fraudulent transactions), making it 
#    harder for the Linear Regression model to generalize.

# Recommendations:
# - Use this model as a baseline to compare with other advanced algorithms.
# - Consider using non-linear models like Random Forest or Logistic Regression 
#   for better performance in future tasks.
# - Address class imbalance using techniques like SMOTE for improved learning.
# - Perform feature selection or engineering to identify stronger predictors of fraud.
