In [1]:
import pandas as pd 

In [2]:
clv = pd.read_csv('clv_final_dataset.csv')

In [3]:
clv.head()

Unnamed: 0,tenure,total_spend,usage_frequency,last_interaction,support_calls,clv_label
0,39,932.0,14,17,5,High
1,49,557.0,1,6,10,Medium
2,14,185.0,4,3,6,Low
3,38,396.0,21,29,7,Low
4,32,617.0,20,20,5,Medium


# Training the ML model 

### Seperate the features and targets 

In [4]:
x = clv.drop(columns=['clv_label'])
y = clv['clv_label']

In [10]:
print(f'Features \n\n - {x.head(10)}')

Features 

 -    tenure  total_spend  usage_frequency  last_interaction  support_calls
0      39        932.0               14                17              5
1      49        557.0                1                 6             10
2      14        185.0                4                 3              6
3      38        396.0               21                29              7
4      32        617.0               20                20              5
5      33        129.0               25                 8              9
6      49        821.0               12                24              3
7      37        445.0                8                30              4
8      12        969.0                5                13              7
9       3        415.0               25                29              2


In [9]:
print(f'Labels \n \n - {y.head(10)}')

Labels 
 
 - 0      High
1    Medium
2       Low
3       Low
4    Medium
5       Low
6      High
7       Low
8       Low
9       Low
Name: clv_label, dtype: object


### Encoding target

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
label_encoder = LabelEncoder() 
y_encoded = label_encoder.fit_transform(y)

print(label_encoder.classes_)

['High' 'Low' 'Medium']


### Train test split 

In [13]:
from sklearn.model_selection import train_test_split 

In [14]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y_encoded, test_size= 0.2, random_state= 42, stratify= y_encoded
)

### Logistic regression 

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
log_model = LogisticRegression(
    multi_class= 'multinomial', max_iter= 1000, n_jobs= -1
)

log_model.fit(x_train, y_train)



In [18]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred = log_model.predict(x_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9998752367665906

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     29389
           1       1.00      1.00      1.00     29389
           2       1.00      1.00      1.00     29389

    accuracy                           1.00     88167
   macro avg       1.00      1.00      1.00     88167
weighted avg       1.00      1.00      1.00     88167


Confusion Matrix:
 [[29386     0     3]
 [    0 29386     3]
 [    2     3 29384]]


In [19]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(x_train, y_train)

In [21]:
y_pred_rf = rf_model.predict(x_test)

print("RF Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nRF Classification Report:\n", classification_report(y_test, y_pred_rf))

RF Accuracy: 0.9329681173228078

RF Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.93      0.95     29389
           1       0.97      0.93      0.95     29389
           2       0.88      0.93      0.90     29389

    accuracy                           0.93     88167
   macro avg       0.93      0.93      0.93     88167
weighted avg       0.93      0.93      0.93     88167



In [23]:

feature_importance = pd.DataFrame({
    'feature': x.columns,
    'importance': rf_model.feature_importances_
}).sort_values(by='importance', ascending=False)

feature_importance

Unnamed: 0,feature,importance
1,total_spend,0.369386
0,tenure,0.299949
2,usage_frequency,0.156312
3,last_interaction,0.108543
4,support_calls,0.065809


In [24]:
import joblib

joblib.dump(rf_model, 'clv_model.pkl')
joblib.dump(label_encoder, 'clv_label_encoder.pkl')

print("CLV model and encoder saved.")

CLV model and encoder saved.
