In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import datetime as datetime
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier

In [3]:
df = pd.read_csv('train.csv')

In [11]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculate the Haversine distance between two points on the Earth.
    Args:
        lat1, lon1: Latitude and longitude of the first location (cardholder).
        lat2, lon2: Latitude and longitude of the second location (merchant).
    Returns:
        Distance in kilometers.
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    return c * r

# Assuming your DataFrame is named `df`
# Ensure 'trans_date' and 'trans_time' are in proper datetime formats
df['trans_datetime'] = pd.to_datetime(df['trans_date'] + ' ' + df['trans_time'])
df['distance'] = df.apply(
    lambda row: haversine_distance(row['lat1'], row['long1'], row['lat2'], row['long2']), axis=1
)
# Group by credit card number (`cc_num`) to compute user-specific features
grouped = df.groupby('cc_num')

# --- Transaction Amount Features ---
# User average and standard deviation of transaction amounts
df['user_avg_amt'] = grouped['amt'].transform('mean')
df['user_std_amt'] = grouped['amt'].transform('std')
df['deviation_from_mean_amt'] = df['amt'] - df['user_avg_amt']
df['amt_z_score'] = (df['amt'] - df['user_avg_amt']) / df['user_std_amt']

# --- Time-Based Features ---
df['time_since_last_trans'] = grouped['trans_datetime'].transform(lambda x: x.diff().dt.total_seconds())
df['avg_time_between_trans'] = grouped['time_since_last_trans'].transform('mean')
df['deviation_from_mean_time'] = df['time_since_last_trans'] - df['avg_time_between_trans']

# Day and hour of the transaction
df['day_of_week'] = df['trans_datetime'].dt.dayofweek
df['hour_of_day'] = df['trans_datetime'].dt.hour

# Binary flag for unusual frequency (transactions within a short interval, e.g., 5 minutes)
df['unusual_frequency'] = df['time_since_last_trans'] < 300

# --- Category-Based Features ---
# Previous transaction category for each user
df['prev_category'] = grouped['category'].shift(1).astype('category')
df['next_category'] = grouped['category'].shift(-1).astype('category')
df['same_category_as_prev'] = df['prev_category'] == df['category']
df['same_category_as_next'] = df['next_category'] == df['category']
df['at_least_one_same_category'] = df['same_category_as_prev'] | df['same_category_as_next']

# df['category_transition'] = df['prev_category'] + '->' + df['category']

# Frequency of category transitions
# category_transition_counts = df.groupby(['cc_num', 'category_transition'])['trans_num'].count().reset_index(name='transition_count')
# df = df.merge(category_transition_counts, on=['cc_num', 'category_transition'], how='left')

# Category entropy for the user's transactions
def calculate_entropy(categories):
    probs = categories.value_counts(normalize=True)
    return -np.sum(probs * np.log(probs))

df['category_entropy'] = grouped['category'].transform(calculate_entropy)

# --- General Features ---
# Distance between transaction location and merchant location
df['distance_to_merchant'] = np.sqrt((df['lat'] - df['merch_lat'])**2 + (df['long'] - df['merch_long'])**2)

# Is the transaction on a weekend?
df['is_weekend'] = df['trans_datetime'].dt.dayofweek >= 5

# --- Rolling Features ---
# Rolling window for transaction amounts
window_size = 5
df['rolling_avg_amt'] = grouped['amt'].transform(lambda x: x.rolling(window_size, min_periods=1).mean())
df['rolling_std_amt'] = grouped['amt'].transform(lambda x: x.rolling(window_size, min_periods=1).std())

# Rolling count of category changes
df['rolling_category_changes'] = grouped['category'].transform(lambda x: (x != x.shift(1)).rolling(window_size, min_periods=1).sum())

# Rolling time between transactions
df['rolling_time_between_trans'] = grouped['time_since_last_trans'].transform(lambda x: x.rolling(window_size, min_periods=1).mean())

In [70]:
from sklearn.preprocessing import LabelEncoder


# Drop unnecessary columns
features_to_use = ['user_avg_amt', 'user_std_amt', 'deviation_from_mean_amt', 'amt_z_score', 
                   'time_since_last_trans', 'avg_time_between_trans', 'deviation_from_mean_time', 
                   'day_of_week', 'hour_of_day', 'unusual_frequency', 'prev_category', 'next_category',
                   'same_category_as_prev', 'same_category_as_next', 'at_least_one_same_category',
                   'category_entropy', 'distance_to_merchant', 'is_weekend', 
                   'rolling_avg_amt', 'rolling_std_amt', 'rolling_category_changes', 
                   'rolling_time_between_trans']
bool_columns = ['unusual_frequency', 'same_category_as_prev', 'same_category_as_next', 
                'at_least_one_same_category', 'is_weekend', 'day_of_week_1', 'day_of_week_2', 
                'day_of_week_3', 'day_of_week_4', 'day_of_week_5', 'day_of_week_6']

X = df[features_to_use]
y = df['is_fraud']

# One-hot encode categorical columns if necessary

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

bool_columns = X_train.select_dtypes(include=['bool']).columns
X_train[bool_columns] = X_train[bool_columns].astype(int)
X_train = pd.get_dummies(X_train, columns=['prev_category', 'next_category'], drop_first=True)
X_train.fillna(0, inplace=True)

bool_columns_test = X_test.select_dtypes(include=['bool']).columns
X_test[bool_columns_test] = X_test[bool_columns_test].astype(int)
X_test = pd.get_dummies(X_test, columns=['prev_category', 'next_category'], drop_first=True)
X_test.fillna(0, inplace=True)

# Initialize Decision Tree
dt_model = DecisionTreeClassifier(
    criterion='gini',  # 'entropy' for information gain
    max_depth=5,       # Limit tree depth for better generalization
    min_samples_split=10,  # Minimum samples required to split an internal node
    random_state=42
)

# Train the Decision Tree
dt_model.fit(X_train, y_train)

y_pred_prob = dt_model.predict_proba(X_test)[:, 1]  # Probability of being fraud
y_pred = dt_model.predict(X_test)         

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_prob))


Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     98521
           1       0.88      0.64      0.74     12690

    accuracy                           0.95    111211
   macro avg       0.92      0.81      0.86    111211
weighted avg       0.95      0.95      0.95    111211

ROC-AUC Score: 0.9312700342398191


In [71]:
final_train = pd.read_csv('train.csv')
final_test = pd.read_csv('test.csv')
final_df = pd.concat([final_train, final_test], axis=0)


# Assuming your DataFrame is named `df`
# Ensure 'trans_date' and 'trans_time' are in proper datetime formats
final_df['trans_datetime'] = pd.to_datetime(final_df['trans_date'] + ' ' + final_df['trans_time'])

# Group by credit card number (`cc_num`) to compute user-specific features
grouped = final_df.groupby('cc_num')

# --- Transaction Amount Features ---
# User average and standard deviation of transaction amounts
final_df['user_avg_amt'] = grouped['amt'].transform('mean')
final_df['user_std_amt'] = grouped['amt'].transform('std')
final_df['deviation_from_mean_amt'] = final_df['amt'] - final_df['user_avg_amt']
final_df['amt_z_score'] = (final_df['amt'] - final_df['user_avg_amt']) / final_df['user_std_amt']

# --- Time-Based Features ---
final_df['time_since_last_trans'] = grouped['trans_datetime'].transform(lambda x: x.diff().dt.total_seconds())
final_df['avg_time_between_trans'] = grouped['time_since_last_trans'].transform('mean')
final_df['deviation_from_mean_time'] = final_df['time_since_last_trans'] - final_df['avg_time_between_trans']

# Day and hour of the transaction
final_df['day_of_week'] = final_df['trans_datetime'].dt.dayofweek
final_df['hour_of_day'] = final_df['trans_datetime'].dt.hour

# Binary flag for unusual frequency (transactions within a short interval, e.g., 5 minutes)
final_df['unusual_frequency'] = final_df['time_since_last_trans'] < 300

# --- Category-Based Features ---
# Previous transaction category for each user
final_df['prev_category'] = grouped['category'].shift(1).astype('category')
final_df['next_category'] = grouped['category'].shift(-1).astype('category')
final_df['same_category_as_prev'] = final_df['prev_category'] == final_df['category']
final_df['same_category_as_next'] = final_df['next_category'] == final_df['category']
final_df['at_least_one_same_category'] = final_df['same_category_as_prev'] | final_df['same_category_as_next']

# final_df['category_transition'] = final_df['prev_category'] + '->' + final_df['category']

# Frequency of category transitions
# category_transition_counts = final_df.groupby(['cc_num', 'category_transition'])['trans_num'].count().reset_index(name='transition_count')
# final_df = final_df.merge(category_transition_counts, on=['cc_num', 'category_transition'], how='left')

# Category entropy for the user's transactions
def calculate_entropy(categories):
    probs = categories.value_counts(normalize=True)
    return -np.sum(probs * np.log(probs))

final_df['category_entropy'] = grouped['category'].transform(calculate_entropy)

# --- General Features ---
# Distance between transaction location and merchant location
final_df['distance_to_merchant'] = np.sqrt((final_df['lat'] - final_df['merch_lat'])**2 + (final_df['long'] - final_df['merch_long'])**2)

# Is the transaction on a weekend?
final_df['is_weekend'] = final_df['trans_datetime'].dt.dayofweek >= 5

# --- Rolling Features ---
# Rolling window for transaction amounts
window_size = 5
final_df['rolling_avg_amt'] = grouped['amt'].transform(lambda x: x.rolling(window_size, min_periods=1).mean())
final_df['rolling_std_amt'] = grouped['amt'].transform(lambda x: x.rolling(window_size, min_periods=1).std())

# Rolling count of category changes
final_df['rolling_category_changes'] = grouped['category'].transform(lambda x: (x != x.shift(1)).rolling(window_size, min_periods=1).sum())

# Rolling time between transactions
final_df['rolling_time_between_trans'] = grouped['time_since_last_trans'].transform(lambda x: x.rolling(window_size, min_periods=1).mean())

In [73]:
final = final_df[final_df['id'].isin(final_test['id'])]
final_testing = final[features_to_use]

bool_columns_final = final_testing.select_dtypes(include=['bool']).columns
final_testing[bool_columns_final] = final_testing[bool_columns_final].astype(int)
final_testing = pd.get_dummies(final_testing, columns=['prev_category', 'next_category'], drop_first=True)
final_testing.fillna(0, inplace=True)

y_pred_prob = dt_model.predict_proba(final_testing)[:, 1]  # Probability of being fraud
y_pred = dt_model.predict(final_testing) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_testing[bool_columns_final] = final_testing[bool_columns_final].astype(int)


In [75]:
output_df = pd.DataFrame({
    'id': final['id'],
    'is_fraud': y_pred
})

# Writing to a CSV file
output_df.to_csv('sample_submission.csv', index=False)