In [3]:
# Mount Google Drive if you're using files from there
from google.colab import drive
drive.mount('/content/drive')

# Install required libraries if not already installed
!pip install scorecardpy
!pip install scikit-learn


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import scorecardpy as sc

In [5]:
# Read the processed transactions CSV from the mounted Drive and preview it.
processed_data_path = "/content/drive/My Drive/Kifiya/week 6/processed_data.csv"
df = pd.read_csv(processed_data_path)
df.head(n=4)

Unnamed: 0,transactionid,batchid,accountid,subscriptionid,customerid,currencycode,providerid,productid,productcategory,channelid,...,stddevtransactionamount,transactionhour,transactionday,transactionmonth,transactionyear,recency,frequency,monetary,rfms_score,label
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,5,1,0,ChannelId_3,...,-0.167524,12,6,1,0,90,119,20.244034,229.244034,1
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,3,19,2,ChannelId_2,...,-0.167524,12,6,1,0,90,119,20.244034,229.244034,1
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,5,0,0,ChannelId_3,...,-0.201719,12,6,1,0,90,2,0.330244,92.330244,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,0,11,8,ChannelId_3,...,-0.008737,17,6,1,0,90,38,6.671549,134.671549,0


In [6]:
# Name of the target column in `df`.
target_column = 'label'

# Information Value (IV) of every column of `df` with respect to the target.
iv_values = sc.iv(df, y=target_column)
print("Information Value Results:")
print(iv_values)

# Variable screening. Only `y` is passed, so no filtering threshold is
# overridden here and whatever defaults scorecardpy ships with apply.
filtered_vars = sc.var_filter(df, y=target_column)
print("Filtered Variables:")
print(filtered_vars)

# `dg` is the name the later cells use for the filtered frame.
dg = filtered_vars


Information Value Results:
                   variable  info_value
8                 frequency   10.493208
1          transactioncount   10.493208
10   totaltransactionamount    8.772814
15     avgtransactionamount    8.575190
2                  monetary    8.382191
16  stddevtransactionamount    8.359590
21               customerid    7.862147
14                accountid    4.515308
12           subscriptionid    4.408941
13               rfms_score    3.619715
9                    amount    0.875216
18                  recency    0.488890
6            transactionday    0.227850
20          pricingstrategy    0.225927
3          transactionmonth    0.195845
23          transactionyear    0.182657
25               providerid    0.122503
5                 productid    0.096626
0           transactionhour    0.057721
24                  batchid    0.022860
19          productcategory    0.019628
4      transactionstarttime    0.018478
11            transactionid    0.011091
17           

In [20]:
# Show which columns remain in `dg`, the frame returned by sc.var_filter.
dg.columns

Index(['transactionhour', 'transactioncount', 'monetary', 'transactionmonth',
       'productid', 'transactionday', 'frequency', 'amount',
       'totaltransactionamount', 'subscriptionid', 'rfms_score', 'accountid',
       'avgtransactionamount', 'stddevtransactionamount', 'recency',
       'pricingstrategy', 'customerid', 'transactionyear', 'batchid',
       'providerid', 'label'],
      dtype='object')

In [7]:
# X = dg.drop(columns=['label'])  # Features
# y = dg['label']  #  Target variable
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [23]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Step 1: Split the data into features (X) and target (y).
# `dg` is the variable-filtered frame and 'label' is the target column.
X = dg.drop(columns=['label'])  # Features
y = dg['label']  # Target variable

# Step 2: Split into training and testing sets (stratify to maintain label distribution).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Work on explicit copies so the column assignments below modify independent
# frames (no SettingWithCopyWarning, no accidental write-through to `dg`).
X_train = X_train.copy()
X_test = X_test.copy()

# Step 3: Handle categorical columns.
# NOTE(review): these are identifier columns; label-encoding gives them an
# arbitrary numeric order and may let a model memorise individual entities.
# Confirm they really belong in the feature set.
categorical_columns = ['subscriptionid', 'accountid', 'customerid', 'batchid']

# Combine train and test sets so every category seen in either split gets a code.
combined = pd.concat([X_train[categorical_columns], X_test[categorical_columns]], axis=0)

# Apply label encoding to the categorical columns. Every fitted encoder is kept
# in `label_encoders` (keyed by column name) so the same mapping can be reused
# later; `le` still refers to the last encoder, as before.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col].astype(str))
    label_encoders[col] = le

# Step 4: Split back into train and test sets after encoding.
# `combined` is train rows followed by test rows, so a positional slice recovers each split.
n_train_rows = X_train.shape[0]
X_train[categorical_columns] = combined.iloc[:n_train_rows][categorical_columns]
X_test[categorical_columns] = combined.iloc[n_train_rows:][categorical_columns]

# Step 5: Handle missing values (if any) for numerical columns.
# Both splits are imputed with the TRAINING means: the test set must be
# transformed with statistics learned from training data only, exactly like
# the scaler below.
train_means = X_train.mean(numeric_only=True)
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Step 6: Scale numerical features (fit on train, apply the same transform to test).
numerical_columns = X_train.select_dtypes(include=['int64', 'float64']).columns

scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])


In [24]:
# Print the per-column dtypes of the train and test feature frames.
print(X_train.dtypes)
print(X_test.dtypes)

transactionhour            float64
transactioncount           float64
monetary                   float64
transactionmonth           float64
productid                  float64
transactionday             float64
frequency                  float64
amount                     float64
totaltransactionamount     float64
subscriptionid             float64
rfms_score                 float64
accountid                  float64
avgtransactionamount       float64
stddevtransactionamount    float64
recency                    float64
pricingstrategy            float64
customerid                 float64
transactionyear            float64
batchid                    float64
providerid                 float64
dtype: object
transactionhour            float64
transactioncount           float64
monetary                   float64
transactionmonth           float64
productid                  float64
transactionday             float64
frequency                  float64
amount                     float64
totalt

In [25]:


def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its test-split metrics.

    The model is (re)fitted in place with ``model.fit(X_train, y_train)``.
    Hard predictions come from ``model.predict`` and scores from column
    index 1 of ``model.predict_proba``. The confusion matrix, the
    classification report and the ROC AUC are printed; nothing is returned.
    """
    model.fit(X_train, y_train)
    predicted_labels = model.predict(X_test)
    second_class_scores = model.predict_proba(X_test)[:, 1]

    # Both label-based reports follow the same "heading, then value" layout.
    label_reports = (
        ("Confusion Matrix:", confusion_matrix),
        ("\nClassification Report:", classification_report),
    )
    for heading, metric_fn in label_reports:
        print(heading)
        print(metric_fn(y_test, predicted_labels))

    roc_auc = roc_auc_score(y_test, second_class_scores)
    print(f'ROC AUC Score: {roc_auc}\n')

# Seed shared by every estimator in this cell so reruns give the same results.
RANDOM_STATE = 42

# Model selection
# Baseline Logistic Regression (seeded for consistency with the random forest).
logistic_model = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
print("Logistic Regression Model Performance:")
evaluate_model(logistic_model, X_train, X_test, y_train, y_test)

# Baseline Random Forest
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)
print("Random Forest Model Performance:")
evaluate_model(rf_model, X_train, X_test, y_train, y_test)

# Hyperparameter Tuning for Logistic Regression
logistic_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

# The 'liblinear' and 'saga' solvers shuffle the data internally, so the base
# estimator is seeded; otherwise the selected parameters can vary between runs.
logistic_grid_search = GridSearchCV(LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
                                     param_grid=logistic_param_grid,
                                     cv=5,
                                     scoring='roc_auc',
                                     n_jobs=-1)
logistic_grid_search.fit(X_train, y_train)
print(f"Best parameters for Logistic Regression: {logistic_grid_search.best_params_}")

# Hyperparameter Tuning for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=RANDOM_STATE),
                               param_grid=rf_param_grid,
                               cv=5,
                               scoring='roc_auc',
                               n_jobs=-1)
rf_grid_search.fit(X_train, y_train)
print(f"Best parameters for Random Forest: {rf_grid_search.best_params_}")

# Evaluate the best Random Forest model.
# evaluate_model fits the estimator on X_train again before scoring it on X_test.
best_rf_model = rf_grid_search.best_estimator_
print("Best Random Forest Model Performance:")
evaluate_model(best_rf_model, X_train, X_test, y_train, y_test)

# Evaluate the best Logistic Regression model the same way.
best_logistic_model = logistic_grid_search.best_estimator_
print("Best Logistic Regression Model Performance:")
evaluate_model(best_logistic_model, X_train, X_test, y_train, y_test)

Logistic Regression Model Performance:
Confusion Matrix:
[[9563    6]
 [ 111 9453]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      9569
           1       1.00      0.99      0.99      9564

    accuracy                           0.99     19133
   macro avg       0.99      0.99      0.99     19133
weighted avg       0.99      0.99      0.99     19133

ROC AUC Score: 0.9999537686150983

Random Forest Model Performance:
Confusion Matrix:
[[9569    0]
 [   0 9564]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9569
           1       1.00      1.00      1.00      9564

    accuracy                           1.00     19133
   macro avg       1.00      1.00      1.00     19133
weighted avg       1.00      1.00      1.00     19133

ROC AUC Score: 1.0

Best parameters for Logistic Regression: {'C': 100, 'solver': 'liblinear'}
Best p

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Function to evaluate model performance
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Train ``model`` on the training data and report how it does on the test data.

    Calls ``model.fit(X_train, y_train)``, then prints the confusion matrix and
    classification report for ``model.predict(X_test)`` and the ROC AUC of the
    scores in column index 1 of ``model.predict_proba(X_test)``.
    Returns ``None``; all results go to standard output.
    """
    model.fit(X_train, y_train)
    hard_predictions = model.predict(X_test)
    class_probabilities = model.predict_proba(X_test)
    second_column_scores = class_probabilities[:, 1]

    print("Confusion Matrix:")
    matrix = confusion_matrix(y_test, hard_predictions)
    print(matrix)

    print("\nClassification Report:")
    report = classification_report(y_test, hard_predictions)
    print(report)

    roc_auc = roc_auc_score(y_test, second_column_scores)
    print(f'ROC AUC Score: {roc_auc}\n')

# Seed shared by every estimator in this cell so reruns give the same results.
RANDOM_STATE = 42

# Logistic Regression model with hyperparameter tuning
logistic_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}
# The 'liblinear' and 'saga' solvers shuffle the data internally, so the base
# estimator is seeded (as the random forest already is) to keep the search
# reproducible.
logistic_grid_search = GridSearchCV(LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
                                   param_grid=logistic_param_grid,
                                   cv=5,
                                   scoring='roc_auc',
                                   n_jobs=-1)
logistic_grid_search.fit(X_train, y_train)

# Get the best Logistic Regression model
best_logistic_model = logistic_grid_search.best_estimator_

# Random Forest model with hyperparameter tuning
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=RANDOM_STATE),
                               param_grid=rf_param_grid,
                               cv=5,
                               scoring='roc_auc',
                               n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

# Get the best Random Forest model
best_rf_model = rf_grid_search.best_estimator_

# Train and evaluate the best Logistic Regression model.
# evaluate_model fits the estimator on X_train again before scoring it on X_test.
print("Best Logistic Regression Model Performance:")
evaluate_model(best_logistic_model, X_train, X_test, y_train, y_test)

# Train and evaluate the best Random Forest model
print("Best Random Forest Model Performance:")
evaluate_model(best_rf_model, X_train, X_test, y_train, y_test)


Best Logistic Regression Model Performance:
Confusion Matrix:
[[9562    7]
 [  14 9550]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9569
           1       1.00      1.00      1.00      9564

    accuracy                           1.00     19133
   macro avg       1.00      1.00      1.00     19133
weighted avg       1.00      1.00      1.00     19133

ROC AUC Score: 0.9999954872224144

Best Random Forest Model Performance:
Confusion Matrix:
[[9569    0]
 [   0 9564]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9569
           1       1.00      1.00      1.00      9564

    accuracy                           1.00     19133
   macro avg       1.00      1.00      1.00     19133
weighted avg       1.00      1.00      1.00     19133

ROC AUC Score: 1.0



In [30]:
import joblib

# Persist both grid-search winners to Google Drive with joblib.
logistic_model_path = '/content/drive/MyDrive/best_logistic_model.pkl'
rf_model_path = '/content/drive/MyDrive/best_rf_model.pkl'
joblib.dump(best_logistic_model, logistic_model_path)
joblib.dump(best_rf_model, rf_model_path)



# # Loading the models for future predictions
# loaded_logistic_model = joblib.load('best_logistic_model.pkl')
# loaded_rf_model = joblib.load('best_rf_model.pkl')


['/content/drive/MyDrive/best_rf_model.pkl']