# Import the necessary libraries

In [8]:
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt
import seaborn as sns
# Cell 1: Import Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score
 

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../scripts')))
from load_csv_data import Load_CSV_Data


# Load the data

In [9]:
df = Load_CSV_Data('../data/updated_data.csv')
df.load_csv_data()
df = df.get_data()


Data successfully loaded from ../data/updated_data.csv


  self.data = pd.read_csv(self.file_path)


In [23]:
df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ChannelId,...,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,Recency,Frequency,Monetary,TransactionMonth,Seasonality,RFMS_Score,UserCategory
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,1762,0,256.0,5,1,2,...,False,False,False,2063,118,-5.537408,11.0,11.0,725.154197,Good
1,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,2009,0,256.0,6,0,2,...,False,False,False,2143,2,-0.100857,11.0,11.0,714.966381,Bad
2,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,3039,0,256.0,0,3,2,...,False,False,True,2067,37,-0.15572,11.0,12.0,701.281427,Bad
3,TransactionId_23223,BatchId_25954,AccountId_1078,SubscriptionId_4238,2424,0,256.0,5,8,2,...,False,False,False,2152,1,-0.024647,11.0,11.0,717.658451,Bad
4,TransactionId_118063,BatchId_118460,AccountId_2442,SubscriptionId_1980,365,0,256.0,4,8,2,...,False,False,False,2067,29,-0.822556,11.0,12.0,698.392481,Bad


# Modeling

###  Sample Data Creation

In [6]:
# # Cell 2: Sample Data Creation
# data = {
#     "TransactionId": ["TransactionId_76871", "TransactionId_26203", "TransactionId_380"],
#     "BatchId": ["BatchId_36123", "BatchId_53941", "BatchId_102363"],
#     "AccountId": ["AccountId_3957", "AccountId_4229", "AccountId_648"],
#     "SubscriptionId": ["SubscriptionId_887", "SubscriptionId_222", "SubscriptionId_2185"],
#     "CustomerId": [1762, 2009, 3039],
#     "CurrencyCode": [0, 0, 0],
#     "CountryCode": [256.0, 256.0, 256.0],
#     "ProviderId": [5, 6, 0],
#     "ProductId": [1, 0, 3],
#     "ChannelId": [2, 2, 2],
#     "Recency": [2063, 2143, 2067],
#     "Frequency": [118, 2, 37],
#     "Monetary": [-5.537408, -0.100857, -0.155720],
#     "TransactionMonth": [11.0, 11.0, 12.0],
#     "Seasonality": [725.154197, 714.966381, 701.281427],
#     "RFMS_Score": [725.154197, 714.966381, 701.281427],
#     "UserCategory": ["Good", "Good", "Good"]
# }

# df = pd.DataFrame(data)


### Preprocessing Function


In [11]:
# Cell 3: Preprocessing Function
def preprocess_data(df, target_column='UserCategory'):
    # Identify features and target
    X = df.drop(columns=[target_column, 'Recency', 'Frequency', 'Monetary', 'Seasonality', 'RFMS_Score'])
    y = df[target_column]
    
    # Identify categorical and numerical columns
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
    
    # Create preprocessing pipelines
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Fit and transform the features
    X_processed = preprocessor.fit_transform(X)
    
    return X_processed, y


### Data Splitting Function

In [12]:
# Cell 4: Data Splitting Function
def split_data(df, target_column='UserCategory', test_size=0.2, random_state=42):
    # Preprocess the data
    X, y = preprocess_data(df, target_column)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Show sample data output after split
    print("Training Data Sample Shape:", X_train.shape)
    print("Test Data Sample Shape:", X_test.shape)
    
    return X_train, X_test, y_train, y_test


### Model Training Function

In [13]:
# Cell 5: Model Training Function
def train_models(X_train, y_train):
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Random Forest': RandomForestClassifier()
    }
    
    results = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        results[model_name] = model

        y_pred = model.predict(X_test)
        print(f"Model: {model_name}")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("ROC AUC Score:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))
    
    return results


### Model Evaluation Function

In [15]:
# Cell 6: Model Evaluation Function
def evaluate_models(models, X_test, y_test):
    for model_name, model in models.items():
        y_pred = model.predict(X_test)
        print(f"Model: {model_name}")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred))


 

### Run the Process

In [14]:
# Cell 7: Run the Process
X_train, X_test, y_train, y_test = split_data(df, target_column='UserCategory')
models = train_models(X_train, y_train)


Training Data Sample Shape: (76528, 292323)
Test Data Sample Shape: (19132, 292323)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: Logistic Regression
Accuracy: 0.9742316537737822
ROC AUC Score: 0.9944843573316978
Confusion Matrix:
 [[9306  187]
 [ 306 9333]]
Classification Report:
               precision    recall  f1-score   support

         Bad       0.97      0.98      0.97      9493
        Good       0.98      0.97      0.97      9639

    accuracy                           0.97     19132
   macro avg       0.97      0.97      0.97     19132
weighted avg       0.97      0.97      0.97     19132

Model: Random Forest
Accuracy: 0.9962366715450554
ROC AUC Score: 0.9997462488317462
Confusion Matrix:
 [[9493    0]
 [  72 9567]]
Classification Report:
               precision    recall  f1-score   support

         Bad       0.99      1.00      1.00      9493
        Good       1.00      0.99      1.00      9639

    accuracy                           1.00     19132
   macro avg       1.00      1.00      1.00     19132
weighted avg       1.00      1.00      1.00     19132



### Evaluate the models

In [16]:
evaluate_models(models, X_test, y_test)

Model: Logistic Regression
Accuracy: 0.9742316537737822
              precision    recall  f1-score   support

         Bad       0.97      0.98      0.97      9493
        Good       0.98      0.97      0.97      9639

    accuracy                           0.97     19132
   macro avg       0.97      0.97      0.97     19132
weighted avg       0.97      0.97      0.97     19132

Model: Random Forest
Accuracy: 0.9962366715450554
              precision    recall  f1-score   support

         Bad       0.99      1.00      1.00      9493
        Good       1.00      0.99      1.00      9639

    accuracy                           1.00     19132
   macro avg       1.00      1.00      1.00     19132
weighted avg       1.00      1.00      1.00     19132

