<a href="https://colab.research.google.com/github/zxita/chargeback_predict/blob/main/cb_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount("/content/gdrive")
%cd /content/gdrive/MyDrive/Colab Notebooks/

Mounted at /content/gdrive
/content/gdrive/MyDrive/Colab Notebooks


In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from itertools import count
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,roc_auc_score
import numpy as np


# Define dataset paths
transaction_file = "transactions sample.xlsx"
chargeback_file = "chargeback sample.xlsx"

# Load the data
transaction_data = pd.read_excel(transaction_file)
chargeback_data = pd.read_excel(chargeback_file)

# Merging the data based on common identifier 'merchant_reference_number'
merged_data = pd.merge(transaction_data, chargeback_data,
                       on="merchant_reference_number",
                       how="left",
                       suffixes=("_transaction", "_chargeback")
                      )

# Create the target variable (1 for chargeback, 0 otherwise)
merged_data['Chargeback'] = np.where(merged_data['chargeback_reason'].notnull(), 1, 0)
#merged_data['chargeback_occurred'] = np.where(merged_data['retrieval_reference_number'].isin(chargeback_data['retrieval_reference_number']), 1, 0)


# Clean the dataset by dropping unnecessary columns
columns_to_drop = ['payment_month','request_id', 'merchant_reference_number','request_id','paid_currency',
                  'account_suffix', 'applications','transaction_reference_number','bill_payment_month',
                   'chargeback_currency','chargeback_amount']
merged_data.drop(columns=columns_to_drop, inplace=True)

#Exporting the merged and clean file
#merged_data.to_excel('cleaned_merged_data.xlsx', index=False)

# Encode categorical variables (one-hot encoding)
merged_data = pd.get_dummies(merged_data, columns=['chargeback_reason', 'payment_method'], drop_first=True)
print(merged_data.head())

# Split data into features and target
X = merged_data.drop(columns=["Chargeback"])
y = merged_data["Chargeback"]

print(X.head())
print(y.head())

#Checking the sample size
unique,counts = np.unique(y,return_counts=True)
y_sample_size = {i:j for (i,j) in zip(unique,counts)}
print(y_sample_size)
#print(unique,counts)



#SMOTE oversampling to handle class imbalance

#Handling categorical data (in X)
X = pd.get_dummies(X,columns=['service_name'], drop_first=True)
#Verify if there are any left categorical values left
print(X.select_dtypes(include=['object', 'category']).columns)
print(X.dtypes)

# Check for and handle missing values
print("Missing values per column before handling:")
print(X.isnull().sum())

# Fill missing values with 0
X = X.fillna(0)

# Verify no missing values remain
print("Missing values per column after handling:")
print(X.isnull().sum().sum())


# Apply SMOTE
smote = SMOTE(random_state=12)
X_resampled, y_resampled = smote.fit_resample(X, y)
print("SMOTE applied successfully!")


# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))




In [None]:
# Ensuring we split the data before training

# Split the original dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(X_train.head())
print(y_train.head())

# Apply SMOTE on the training data
smote = SMOTE(random_state=12)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model again
print("Classification Report:")
print(classification_report(y_test, y_pred))

y_pred_prob = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"ROC AUC Score: {roc_auc}")


       retrieval_reference_number_transaction  paid_amount  \
15370                            3.153157e+11        100.0   
19739                            3.169033e+11       1500.0   
74457                            4.148055e+11        200.0   
18625                            3.174168e+11         50.0   
75410                            4.151189e+11       9000.0   

       retrieval_reference_number_chargeback  \
15370                                    0.0   
19739                                    0.0   
74457                                    0.0   
18625                                    0.0   
75410                                    0.0   

       chargeback_reason_11.3 VCR No Authorization  \
15370                                        False   
19739                                        False   
74457                                        False   
18625                                        False   
75410                                        False   

       charge

In [None]:
# Model cross validation
from sklearn.model_selection import cross_val_score

# Perform cross-validation (on the original, unresampled training set)
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
print("Cross-Validation AUC Scores:", cv_scores)
print("Mean CV AUC Score:", cv_scores.mean())


Cross-Validation AUC Scores: [1. 1. 1. 1. 1.]
Mean CV AUC Score: 1.0


In [None]:
# Testing on a whole new dataset
new_data = pd.read_csv("new_transactions.csv")

# Preprocess the new data (same as training data)
# Example: Handle missing values, encode categorical variables, etc.
new_data = new_data.fillna(0)  # Replace NaNs with 0
new_data = pd.get_dummies(new_data, drop_first=True)

# Ensure the columns match the training data
new_data = new_data.reindex(columns=X_train.columns, fill_value=0)

# Make predictions on the new dataset
new_predictions = model.predict(new_data)
print("Predictions on New Dataset:", new_predictions)


In [None]:
# Preparing model deployment
import joblib

# Save the model to a file
joblib.dump(model, "chargeback_model.pkl")

# Save preprocessing (e.g., column structure)
joblib.dump(X_train.columns, "model_columns.pkl")


# Build a prediction fx
def preprocess_and_predict(input_data, model_file="chargeback_model.pkl", columns_file="model_columns.pkl"):
    # Load the model and columns
    model = joblib.load(model_file)
    model_columns = joblib.load(columns_file)

    # Preprocess input data
    input_data = pd.DataFrame(input_data)
    input_data = input_data.fillna(0)
    input_data = pd.get_dummies(input_data, drop_first=True)
    input_data = input_data.reindex(columns=model_columns, fill_value=0)

    # Make predictions
    predictions = model.predict(input_data)
    probabilities = model.predict_proba(input_data)[:, 1]  # Probability of class 1 (Chargeback)
    return predictions, probabilities


In [None]:
# Model deployment with Flask
!pip install flask pyngrok
from flask import Flask, request, jsonify
import joblib

# Load the saved model
model = joblib.load("chargeback_model.pkl")
model_columns = joblib.load("model_columns.pkl")

# Define Flask app
app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    input_json = request.get_json()
    predictions, probabilities = preprocess_and_predict(input_json)
    return jsonify({"predictions": predictions.tolist(), "probabilities": probabilities.tolist()})




In [None]:
!ngrok authtoken 2q4YnaVvAZpsdoVeyxMkBKVqj2Q_2pE1vopnuzcRkLJaDh38Z

from pyngrok import ngrok

# Start ngrok on the specified port
public_url = ngrok.connect(5000)  # Replace 5000 with the port your Flask app is running on
print(f"Public URL: {public_url}")

# Running the Flask app
#app.run(port=5000)
if __name__ == "__main__":
    app.run(port=5000, debug=True)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Public URL: NgrokTunnel: "https://dcf9-34-125-195-0.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


In [None]:
# Keep the Flask app running
app = Flask(__name__)

@app.route("/")
def home():
    return "Flask is running!"

# Start ngrok and Flask in background
public_url = ngrok.connect(5000)
print(f"Public URL: {public_url}")
app.run(port=5000, debug=False, use_reloader=False)


Public URL: NgrokTunnel: "https://e584-34-125-195-0.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [None]:

public_url = ngrok.connect(5000)
print(f"Public URL: {public_url}")


Public URL: NgrokTunnel: "https://4cf9-34-125-195-0.ngrok-free.app" -> "http://localhost:5000"


In [None]:
import requests

# Test using the obtained public URL
url = "https://dcf9-34-125-195-0.ngrok-free.app/predict"
payload = {
    "paid_amount": 150,
    "paid_currency": "USD",
    "account_suffix": 12345,
    "applications": "Mobile",
    "payment_method": "CreditCard",
    "payment_month": "2023-10"
}
response = requests.post(url, json=payload)
print("Response:", response.json())

In [63]:
#Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
import joblib
from sklearn.metrics import classification_report


# Numerical preprocessing: scaling
numerical_features = ["paid_amount"]
numerical_transformer = StandardScaler()

# Categorical preprocessing: one-hot encoding
#categorical_features = ["applications", "payment_method", "payment_month"]
#categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        #("cat", categorical_transformer, categorical_features),
    ]
)

#Create the pipeline
model = GradientBoostingClassifier() #model initialization

# Define the pipeline
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

print(X_train.head())
#X_train.head().to_excel("X_train.xlsx", index=False)
print(y_train.head())

pipeline.fit(X_train, y_train)

# Predictions
y_pred = pipeline.predict(X_test)

# Classification report
print(classification_report(y_test, y_pred))

# Save the pipeline
joblib.dump(pipeline, "chargeback_pipeline.pkl")

# Load and use the pipeline (future use)
loaded_pipeline = joblib.load("chargeback_pipeline.pkl")
new_data = ...  # Replace with actual new data
predictions = loaded_pipeline.predict(new_data)
print(predictions)

['chargeback_pipeline.pkl']