In [38]:
import os
import numpy as np
import pandas as pd

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import average_precision_score

In [2]:
import os

# Get the root directory (one level up from the notebook directory)
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Define the data directory relative to the root
DATA_DIR = os.path.join(ROOT_DIR, "data")

print("Data directory:", DATA_DIR)  # Debugging


Data directory: c:\Users\kakai\Documents\Work\ML\Projects\challenge-data-fraud\data


In [3]:
# Function to list all files in the data directory
def list_data_files(directory):
    if not os.path.exists(directory):
        print(f"Warning: Data directory '{directory}' does not exist.")
        return []
    
    files = []
    for dirname, _, filenames in os.walk(directory):
        for filename in filenames:
            files.append(os.path.join(dirname, filename))
    return files

In [4]:
# Print all available data files
data_files = list_data_files(DATA_DIR)
if data_files:
    print("Available data files:", data_files)

Available data files: ['c:\\Users\\kakai\\Documents\\Work\\ML\\Projects\\challenge-data-fraud\\data\\X_test_8skS2ey.csv', 'c:\\Users\\kakai\\Documents\\Work\\ML\\Projects\\challenge-data-fraud\\data\\X_train_G3tdtEn.csv', 'c:\\Users\\kakai\\Documents\\Work\\ML\\Projects\\challenge-data-fraud\\data\\Y_test_random_2.csv', 'c:\\Users\\kakai\\Documents\\Work\\ML\\Projects\\challenge-data-fraud\\data\\Y_train_2_XPXJDyy.csv']


In [5]:
import pandas as pd

X_train = pd.read_csv(os.path.join(DATA_DIR, "X_train_G3tdtEn.csv"))
Y_train = pd.read_csv(os.path.join(DATA_DIR, "Y_train_2_XPXJDyy.csv"))
X_test = pd.read_csv(os.path.join(DATA_DIR, "X_test_8skS2ey.csv"))
Y_test_example = pd.read_csv(os.path.join(DATA_DIR, "Y_test_random_2.csv"))

print("Data loaded successfully!")


  X_train = pd.read_csv(os.path.join(DATA_DIR, "X_train_G3tdtEn.csv"))


Data loaded successfully!


  X_test = pd.read_csv(os.path.join(DATA_DIR, "X_test_8skS2ey.csv"))


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [7]:
X_train.head(1)

Unnamed: 0,ID,item1,item2,item3,item4,item5,item6,item7,item8,item9,...,Nbr_of_prod_purchas16,Nbr_of_prod_purchas17,Nbr_of_prod_purchas18,Nbr_of_prod_purchas19,Nbr_of_prod_purchas20,Nbr_of_prod_purchas21,Nbr_of_prod_purchas22,Nbr_of_prod_purchas23,Nbr_of_prod_purchas24,Nb_of_items
0,85517,COMPUTERS,,,,,,,,,...,,,,,,,,,,1.0


In [8]:
Y_train.head(1)

Unnamed: 0,index,ID,fraud_flag
0,0,85517,0


In [9]:
X_test.head(1)

Unnamed: 0,ID,item1,item2,item3,item4,item5,item6,item7,item8,item9,...,Nbr_of_prod_purchas16,Nbr_of_prod_purchas17,Nbr_of_prod_purchas18,Nbr_of_prod_purchas19,Nbr_of_prod_purchas20,Nbr_of_prod_purchas21,Nbr_of_prod_purchas22,Nbr_of_prod_purchas23,Nbr_of_prod_purchas24,Nb_of_items
0,64707,HEALTH BEAUTY ELECTRICAL,,,,,,,,,...,,,,,,,,,,1.0


In [10]:
Y_test_example.head(1)

Unnamed: 0,index,ID,fraud_flag
0,0,64707,0.165105


In [11]:
def clean_headers(val):
    if isinstance(val,str):
        # removes special chars(but skips empty spaces and all)
        val="".join(char for char in val if char.isalnum() or char in (" ", "_"))
        # converts to snake case
        val=val.strip().lower().replace(" ", "_")
        return val
    else:
        return val

In [12]:
X_train=X_train.rename(columns=clean_headers)

In [13]:
X_test=X_test.rename(columns=clean_headers)

In [14]:
data_train_test=pd.concat([X_test.assign(ind="test"), X_train.assign(ind="train")])

In [15]:
data_train_test.shape

(115988, 147)

In [16]:
data_train_test.head(1)

Unnamed: 0,id,item1,item2,item3,item4,item5,item6,item7,item8,item9,...,nbr_of_prod_purchas17,nbr_of_prod_purchas18,nbr_of_prod_purchas19,nbr_of_prod_purchas20,nbr_of_prod_purchas21,nbr_of_prod_purchas22,nbr_of_prod_purchas23,nbr_of_prod_purchas24,nb_of_items,ind
0,64707,HEALTH BEAUTY ELECTRICAL,,,,,,,,,...,,,,,,,,,1.0,test


In [17]:
# 'ind' is the column to exclude.
columns_to_exclude = ['ind']

In [18]:
# Select categorical columns excluding the ones to exclude
categorical_columns = [col for col in data_train_test.select_dtypes(['object', 'category']).columns if col not in columns_to_exclude]

In [19]:
# Handles missing values in categorical columns
data_train_test[categorical_columns] = data_train_test[categorical_columns].fillna('missing')

In [20]:
# Label encoding for categorical columns using the same LabelEncoder
le = LabelEncoder()
data_train_test[categorical_columns] = data_train_test[categorical_columns].astype(str)
data_train_test[categorical_columns] = data_train_test[categorical_columns].apply(lambda col: le.fit_transform(col))

In [21]:
numerical_cols = data_train_test.select_dtypes(include=['float64', 'int64']).columns
data_train_test[numerical_cols] = data_train_test[numerical_cols].fillna(0)

In [22]:
data_train_test.head(1)

Unnamed: 0,id,item1,item2,item3,item4,item5,item6,item7,item8,item9,...,nbr_of_prod_purchas17,nbr_of_prod_purchas18,nbr_of_prod_purchas19,nbr_of_prod_purchas20,nbr_of_prod_purchas21,nbr_of_prod_purchas22,nbr_of_prod_purchas23,nbr_of_prod_purchas24,nb_of_items,ind
0,64707,63,139,135,127,111,104,100,95,85,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,test


In [23]:
X_test_clean, X_train_clean = data_train_test[data_train_test["ind"].eq("test")], data_train_test[data_train_test["ind"].eq("train")]

In [24]:
X_train_clean=X_train_clean.drop(['ind', 'id'], axis=1)

In [25]:
X_train_clean.head(1)

Unnamed: 0,item1,item2,item3,item4,item5,item6,item7,item8,item9,item10,...,nbr_of_prod_purchas16,nbr_of_prod_purchas17,nbr_of_prod_purchas18,nbr_of_prod_purchas19,nbr_of_prod_purchas20,nbr_of_prod_purchas21,nbr_of_prod_purchas22,nbr_of_prod_purchas23,nbr_of_prod_purchas24,nb_of_items
0,38,139,135,127,111,104,100,95,85,79,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [26]:
X_test_clean=X_test_clean.drop(['ind', 'id'], axis=1)

In [27]:
# Split the data into train and test sets
X_train_encoded, X_test_encoded, y_train, y_test = train_test_split(X_train_clean, Y_train["fraud_flag"], test_size=0.2, random_state=42)

In [28]:
# Apply SMOTE for balancing
smt = SMOTE()
X_train_sm, y_train_sm = smt.fit_resample(X_train_encoded, y_train)

In [29]:
# Train RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_sm, y_train_sm)

In [30]:
# Predict probabilities on the test set
y_pred_prob = clf.predict_proba(X_test_encoded)[:, 1]

In [31]:
# Evaluate model performance
average_score = average_precision_score(y_test, y_pred_prob)
print("Average Precision Score:", average_score)

Average Precision Score: 0.13945575434505866


In [32]:
# Predict probabilities on the actual test set
y_pred_final = clf.predict_proba(X_test_clean)[:, 1]

In [33]:
y_pred_final

array([0.        , 0.        , 0.        , ..., 0.67388955, 0.        ,
       0.13091162], shape=(23198,))

In [34]:
final_result=pd.concat([X_test['id'],pd.Series(y_pred_final)], axis=1)

In [35]:
# Remaing the id column
final_result.rename(columns={'id': 'ID'}, inplace=True)

In [36]:
final_result

Unnamed: 0,ID,0
0,64707,0.000000
1,63919,0.000000
2,15664,0.000000
3,6626,0.000000
4,26766,0.000000
...,...,...
23193,63474,0.000000
23194,80438,0.000000
23195,29485,0.673890
23196,59838,0.000000


In [37]:
# Save the final result in the submission folder
final_result.to_csv('../submissions/final_result.csv', index=True)
