In [88]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively walk the Kaggle input directory and print the full path of every file.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [89]:
# local_path = "/kaggle/input/ieee-fraud-detection/sample_submission.csv"
# sample = pd.read_csv(local_path)
# print(sample.head())

In [90]:
!pip install dagshub



In [91]:
!pip install mlflow



In [92]:
import mlflow
import dagshub

# Initialise the DagsHub client for the repository that hosts the experiments.
# `mlflow=True` presumably wires MLflow tracking to that repository - see the
# DagsHub client documentation to confirm exactly what it configures.
dagshub.init(
    repo_owner='dshan21',
    repo_name='ML_ASS_2',
    mlflow=True,
)

In [93]:
# Locations of the two competition test tables on the Kaggle input mount.
transaction_test_file_path = "/kaggle/input/ieee-fraud-detection/test_transaction.csv"
identity_test_file_path = "/kaggle/input/ieee-fraud-detection/test_identity.csv"

# Read both tables; they are joined on TransactionID in a later cell.
transaction_df = pd.read_csv(transaction_test_file_path)
identity_df = pd.read_csv(identity_test_file_path)

In [96]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Name of the experiment whose runs hold the preprocessing artifacts.
# The spelling ("LIGHTBGM") must match the name stored on the tracking server.
# Use "XGBoost_Training" here to pull the XGBoost pipeline instead.
experiment = client.get_experiment_by_name("LIGHTBGM_Training")
if experiment is None:
    # get_experiment_by_name returns None for an unknown name; fail here with a
    # clear message rather than an AttributeError on `.experiment_id`.
    raise ValueError(
        "MLflow experiment 'LIGHTBGM_Training' was not found on the tracking server"
    )
experiment_id = experiment.experiment_id

In [97]:
# Find the most recent 'LIGHTBGM_Cleaning' run and download the CSV listing the
# columns that the run recorded as having a high missing rate.
# (Use run name 'XGBoost_Cleaning' for the XGBoost pipeline.)
runs = client.search_runs(
    experiment_ids=[experiment_id],
    filter_string="attributes.run_name = 'LIGHTBGM_Cleaning'",
    order_by=["attributes.start_time DESC"],  # newest first
    max_results=1
)

if not runs:
    # Later cells need `high_missing_df`; stop here instead of continuing with
    # an undefined (or stale) variable.
    raise RuntimeError("No runs found for run name 'LIGHTBGM_Cleaning'")

latest_run = runs[0]
run_id = latest_run.info.run_id
print(f"Found latest run: {run_id}")

artifact_path = "high_missing_columns.csv"
# Download into the current working directory.
local_path = client.download_artifacts(run_id, artifact_path, ".")
print(f"Downloaded artifact to: {local_path}")

high_missing_df = pd.read_csv(local_path)
print(high_missing_df.head())

Found latest run: fe1f882a576a42c798e9431396af9f7b


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded artifact to: /kaggle/working/high_missing_columns.csv
  column_name  missing_rate
0       id_24      0.991962
1       id_25      0.991310
2       id_07      0.991271
3       id_08      0.991271
4       id_21      0.991264


In [98]:
# Look up which categorical-encoding method the latest 'LIGHTBGM_Cleaning' run
# logged. (Use run name 'XGBoost_Cleaning' for the XGBoost pipeline.)
runs = client.search_runs(
    experiment_ids=[experiment_id],
    filter_string="attributes.run_name = 'LIGHTBGM_Cleaning'",
    order_by=["attributes.start_time DESC"],  # newest first
    max_results=1
)

# Always define the variable so later references never hit a NameError.
cat_encoder = None

if runs:
    latest_run = runs[0]
    run_id = latest_run.info.run_id
    print(f"Found latest run: {run_id}")
    # The Run objects returned by search_runs already carry the logged params,
    # so a second client.get_run() round trip is unnecessary.
    cat_encoder = latest_run.data.params.get("categorical_col_repl_method")

    if cat_encoder is not None:
        print(f"Parameter 'categorical_col_repl_method': {cat_encoder}")
    else:
        print("Parameter 'categorical_col_repl_method' not found in this run")
else:
    print("No runs found for run name 'LIGHTBGM_Cleaning'")

Found latest run: fe1f882a576a42c798e9431396af9f7b
Parameter 'categorical_col_repl_method': LabelEncoder()


In [99]:
# Find the most recent 'LIGHTGBM_Feature_Selection' run and download the text
# file holding the selected feature names.
# (Use run name 'XGBoost_Feature_Selection' for the XGBoost pipeline.)
runs = client.search_runs(
    experiment_ids=[experiment_id],
    filter_string="attributes.run_name = 'LIGHTGBM_Feature_Selection'",
    order_by=["attributes.start_time DESC"],  # newest first
    max_results=1
)

if not runs:
    # Later cells need `top_features`; stop here instead of continuing with an
    # undefined (or stale) variable.
    raise RuntimeError("No runs found for run name 'LIGHTGBM_Feature_Selection'")

latest_run = runs[0]
run_id = latest_run.info.run_id
print(f"Found latest run: {run_id}")

artifact_path = "lgbm_top_features.txt"
# Download into the current working directory.
local_path = client.download_artifacts(run_id, artifact_path, ".")
print(f"Downloaded artifact to: {local_path}")

# Read the raw text through a context manager so the file handle is closed.
with open(local_path, 'r') as features_file:
    top_features = features_file.read()

Found latest run: fbc05c4f95b5475eb90538581d2b8f6a


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloaded artifact to: /kaggle/working/lgbm_top_features.txt


In [100]:
# Names of the high-missing columns, with '_' swapped for '-' so that names such
# as 'id_24' become 'id-24' before they are dropped from the merged test frame.
column_list = [
    name.replace('_', '-')
    for name in high_missing_df["column_name"].tolist()
]
print(column_list)

['id-24', 'id-25', 'id-07', 'id-08', 'id-21', 'id-26', 'id-22', 'id-23', 'id-27', 'dist2', 'D7', 'id-18']


In [101]:
# Parse one feature name per line. Blank lines and stray whitespace are skipped
# explicitly, so the result does not depend on the file ending with a newline
# (blindly dropping the last element would lose a real feature otherwise).
top_features_list = [
    line.strip() for line in top_features.splitlines() if line.strip()
]
# Names starting with 'id' get '_' replaced by '-' (e.g. 'id_20' -> 'id-20');
# all other feature names are left untouched.
top_features_list = [
    feature.replace('_', '-') if feature.startswith('id') else feature
    for feature in top_features_list
]
print(top_features_list)

['C13', 'TransactionAmt', 'card2', 'C1', 'transaction_day', 'card1', 'C14', 'R_emaildomain', 'D1', 'P_emaildomain', 'addr1', 'card3', 'D2', 'card5', 'D8', 'D15', 'C11', 'C6', 'C2', 'D4', 'D10', 'V258', 'dist1', 'card6', 'ProductCD', 'DeviceInfo', 'V285', 'C9', 'id-20', 'D3', 'M6', 'D11', 'id-06', 'amount_decimal', 'transaction_hour', 'M5', 'V310', 'C8', 'id-02', 'id-31', 'M4', 'V294', 'V317', 'id-19', 'id-01', 'V87', 'D5', 'V283', 'V45', 'id-14', 'id-30', 'C12', 'C10', 'C5', 'V149', 'V165', 'V99', 'V323', 'V53', 'card4', 'V129', 'V127', 'V312', 'V187', 'transaction_second', 'V156', 'V308', 'D13', 'id-17', 'V307', 'V62', 'V130', 'V102', 'id-03', 'id-38', 'V83', 'V96', 'V324', 'D9', 'V265', 'C3', 'V259', 'V76', 'D6', 'M3', 'id-05', 'V225', 'V147', 'id-11', 'V70', 'V82', 'DeviceType', 'V159', 'V206', 'V280', 'V296', 'C4', 'V128', 'transaction_minute', 'id-09', 'V136', 'V314', 'V315', 'V266', 'V143', 'V64', 'V48', 'V333', 'V61', 'V201', 'id-33', 'V172', 'V56', 'V55', 'V270', 'V133', 'V313'

In [102]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Not read anywhere in this cell; kept so the notebook namespace is unchanged.
y = None
missing_cutoff = 0.9

# Left join: every transaction row is kept even without a matching identity row.
merged_df = transaction_df.merge(identity_df, on='TransactionID', how='left')
# print(f"Merged data shape: {merged_df.shape}")

# Only the IDs are needed later to build the submission, so keep just that
# column rather than a deep copy of the entire merged frame.
id_copy = merged_df[['TransactionID']].copy()

# Remove the ID and the high-missing columns in one non-mutating step.
merged_df = merged_df.drop(columns=['TransactionID'] + column_list)

categorical_cols = merged_df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = merged_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
# print(categorical_cols)
# print(numerical_cols)

# Fill numeric gaps with the column median.
# NOTE(review): the median is computed on this (test) data, not taken from the
# training run - confirm that matches how the model was trained.
for col in numerical_cols:
    if merged_df[col].isnull().any():
        merged_df[col] = merged_df[col].fillna(merged_df[col].median())

# Fill categorical gaps with an explicit 'Unknown' category.
for col in categorical_cols:
    if merged_df[col].isnull().any():
        merged_df[col] = merged_df[col].fillna('Unknown')


categorical_encoder = LabelEncoder()
# NOTE(review): this logs a parameter from an inference notebook; with no
# active run MLflow presumably starts one implicitly - confirm this is intended.
mlflow.log_param("categorical_col_repl_method", categorical_encoder)

if isinstance(categorical_encoder, LabelEncoder):
    # The single encoder instance is re-fitted for every column.
    # NOTE(review): fitting on test values may yield integer codes that differ
    # from the ones used during training - confirm against the training code.
    for col in categorical_cols:
        merged_df[col] = categorical_encoder.fit_transform(merged_df[col].astype(str))
elif isinstance(categorical_encoder, OneHotEncoder):
    # Replace each categorical column with its one-hot expansion.
    for col in categorical_cols:
        encoded_array = categorical_encoder.fit_transform(merged_df[col].astype(str).values.reshape(-1, 1))
        encoded_cols = categorical_encoder.get_feature_names_out([col])
        encoded_df = pd.DataFrame(encoded_array.toarray(), columns=encoded_cols, index=merged_df.index)

        merged_df.drop(columns=[col], inplace=True)
        merged_df = pd.concat([merged_df, encoded_df], axis=1)


print(merged_df.columns.tolist())


['TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V8

In [103]:
# Continue on a fresh copy of the encoded frame.
newframe = merged_df.copy()
merged_df = newframe

# The arithmetic below treats TransactionDT as a running count of seconds and
# splits it into day / hour / minute / second components.
transaction_dt = merged_df['TransactionDT']
second_of_day = transaction_dt % (24 * 60 * 60)

merged_df['transaction_day'] = transaction_dt // (24 * 60 * 60)
merged_df['transaction_hour'] = second_of_day // (60 * 60)
merged_df['transaction_minute'] = (second_of_day % (60 * 60)) // 60
merged_df['transaction_second'] = transaction_dt % 60

# Position within a 7-day cycle; which calendar weekday index 0 corresponds to
# is not established in this notebook.
day_of_week = merged_df['transaction_day'] % 7
merged_df['day_of_week'] = day_of_week
merged_df['is_weekend'] = day_of_week.isin([0, 6]).astype(int)

# The raw timestamp is no longer needed once its parts are extracted.
merged_df.drop(columns=['TransactionDT'], inplace=True)

# Amount features: log(1 + amount) and the fractional part of the amount.
amount = merged_df['TransactionAmt']
merged_df['amount_log'] = np.log1p(amount)
merged_df['amount_decimal'] = amount - np.floor(amount)

In [104]:
# Show every available column, then keep only the selected features, in the
# order given by top_features_list.
print(merged_df.columns.tolist())
X_selected = merged_df.loc[:, top_features_list].copy()

['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90',

In [105]:
# Sanity check: list the columns that will be passed to the model.
selected_column_names = X_selected.columns.tolist()
print(selected_column_names)

['C13', 'TransactionAmt', 'card2', 'C1', 'transaction_day', 'card1', 'C14', 'R_emaildomain', 'D1', 'P_emaildomain', 'addr1', 'card3', 'D2', 'card5', 'D8', 'D15', 'C11', 'C6', 'C2', 'D4', 'D10', 'V258', 'dist1', 'card6', 'ProductCD', 'DeviceInfo', 'V285', 'C9', 'id-20', 'D3', 'M6', 'D11', 'id-06', 'amount_decimal', 'transaction_hour', 'M5', 'V310', 'C8', 'id-02', 'id-31', 'M4', 'V294', 'V317', 'id-19', 'id-01', 'V87', 'D5', 'V283', 'V45', 'id-14', 'id-30', 'C12', 'C10', 'C5', 'V149', 'V165', 'V99', 'V323', 'V53', 'card4', 'V129', 'V127', 'V312', 'V187', 'transaction_second', 'V156', 'V308', 'D13', 'id-17', 'V307', 'V62', 'V130', 'V102', 'id-03', 'id-38', 'V83', 'V96', 'V324', 'D9', 'V265', 'C3', 'V259', 'V76', 'D6', 'M3', 'id-05', 'V225', 'V147', 'id-11', 'V70', 'V82', 'DeviceType', 'V159', 'V206', 'V280', 'V296', 'C4', 'V128', 'transaction_minute', 'id-09', 'V136', 'V314', 'V315', 'V266', 'V143', 'V64', 'V48', 'V333', 'V61', 'V201', 'id-33', 'V172', 'V56', 'V55', 'V270', 'V133', 'V313'

In [106]:
import pandas as pd
import numpy as np
import mlflow.pyfunc
from IPython.display import FileLink, display


mlflow.set_tracking_uri("https://dagshub.com/dshan21/ML_ASS_2.mlflow")

print("Loading model from MLflow Model Registry...")
model_name = "FraudLightGBMModel"
model_version = 1

try:
    # Load the registered model through the generic pyfunc interface.
    model = mlflow.pyfunc.load_model(f"models:/{model_name}/{model_version}")
    print(f"Successfully loaded {model_name} version {model_version}")
    print(type(model))

    # The selected columns carry '-' in some names (e.g. 'id-20'); switch them
    # to '_' before predicting.
    # NOTE(review): presumably the model was trained on underscore names - confirm.
    X_selected.columns = [col.replace('-', '_') for col in X_selected.columns]
    predictions = model.predict(X_selected)

    # NOTE(review): if predict() returns hard 0/1 labels rather than
    # probabilities, confirm that is what the submission should contain.
    print("Prediction range:", predictions.min(), "-", predictions.max())

    submission = pd.DataFrame({
        'TransactionID': id_copy['TransactionID'],
        'isFraud': predictions
    })

    submission.to_csv('/kaggle/working/submission.csv', index=False)
    # A FileLink is only rendered when displayed; as a bare expression inside
    # the try block its value would simply be discarded.
    display(FileLink('/kaggle/working/submission.csv'))

    print("Submission file created successfully!")

except Exception as e:
    print(f"Error during model loading or prediction: {e}")
    # Re-raise so the cell visibly fails and the full traceback is preserved.
    raise




Loading model from MLflow Model Registry...


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Successfully loaded FraudLightGBMModel version 1
<class 'mlflow.pyfunc.PyFuncModel'>
Final price range: 0 - 1
Submission file created successfully!
