In [1]:
import pandas as pd 

In [2]:
# Read the cleaned CLV feature table from disk into a DataFrame.
input_csv = 'clv_cleaned_dataset.csv'
clv_df = pd.read_csv(input_csv)

In [3]:
# Preview the first five rows as a quick sanity check on the loaded columns.
clv_df.head()

Unnamed: 0,tenure,total_spend,usage_frequency,last_interaction,support_calls
0,39,932.0,14,17,5
1,49,557.0,1,6,10
2,14,185.0,4,3,6
3,38,396.0,21,29,7
4,32,617.0,20,20,5


In [4]:
# Show column dtypes and non-null counts to check for missing values.
clv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440832 entries, 0 to 440831
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   tenure            440832 non-null  int64  
 1   total_spend       440832 non-null  float64
 2   usage_frequency   440832 non-null  int64  
 3   last_interaction  440832 non-null  int64  
 4   support_calls     440832 non-null  int64  
dtypes: float64(1), int64(4)
memory usage: 16.8 MB


In [5]:
# Per-column summary statistics (count, mean, std, quartiles, min/max).
clv_df.describe()

Unnamed: 0,tenure,total_spend,usage_frequency,last_interaction,support_calls
count,440832.0,440832.0,440832.0,440832.0,440832.0
mean,31.256336,631.616223,15.807494,14.480868,3.604437
std,17.255727,240.803001,8.586242,8.596208,3.070218
min,1.0,100.0,1.0,1.0,0.0
25%,16.0,480.0,9.0,7.0,1.0
50%,32.0,661.0,16.0,14.0,3.0
75%,46.0,830.0,23.0,22.0,6.0
max,60.0,1000.0,30.0,30.0,10.0


In [6]:
# List the column names available for the scaling and scoring steps below.
clv_df.columns

Index(['tenure', 'total_spend', 'usage_frequency', 'last_interaction',
       'support_calls'],
      dtype='object')

### Normalize Features 

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale before scoring.
scaled_cols = [
    'tenure',
    'total_spend',
    'usage_frequency',
    'last_interaction',
    'support_calls',
]
# Min-max scaling maps each column onto the scaler's default [0, 1] range.
scaler = MinMaxScaler()

In [10]:
# Fit the scaler on the raw feature columns and get their rescaled values.
normalized_values = scaler.fit_transform(clv_df[scaled_cols])

# Work on a copy so the raw frame (clv_df) keeps its original units.
scaled_df = clv_df.copy()
scaled_df[scaled_cols] = normalized_values

In [11]:
# Inspect the rescaled features; values should now fall within [0, 1].
scaled_df.head()

Unnamed: 0,tenure,total_spend,usage_frequency,last_interaction,support_calls
0,0.644068,0.924444,0.448276,0.551724,0.5
1,0.813559,0.507778,0.0,0.172414,1.0
2,0.220339,0.094444,0.103448,0.068966,0.6
3,0.627119,0.328889,0.689655,0.965517,0.7
4,0.525424,0.574444,0.655172,0.655172,0.5


### Defining the CLV score formula

In [12]:
# Weighted components of the base CLV score (the four weights sum to 1.0).
spend_term = 0.35 * scaled_df['total_spend']
tenure_term = 0.30 * scaled_df['tenure']
usage_term = 0.20 * scaled_df['usage_frequency']
# last_interaction is inverted so that a smaller value contributes more.
recency_term = 0.15 * (1 - scaled_df['last_interaction'])

scaled_df['clv_score'] = spend_term + tenure_term + usage_term + recency_term

In [14]:
# Check the newly added clv_score column (base score, before any penalty).
scaled_df.head()

Unnamed: 0,tenure,total_spend,usage_frequency,last_interaction,support_calls,clv_score
0,0.644068,0.924444,0.448276,0.551724,0.5,0.673672
1,0.813559,0.507778,0.0,0.172414,1.0,0.545928
2,0.220339,0.094444,0.103448,0.068966,0.6,0.259502
3,0.627119,0.328889,0.689655,0.965517,0.7,0.44635
4,0.525424,0.574444,0.655172,0.655172,0.5,0.541441


### Applying a penalty for support calls

In [15]:
# Recompute the base score from the normalized features rather than reading the
# existing 'clv_score' column. The previous form (clv_score = clv_score - penalty)
# subtracted the penalty again on every re-run of this cell, silently shifting
# the scores and the labels derived from them. This form is idempotent and gives
# the same result as the original on a first run.
# NOTE: these weights must stay in sync with the score-formula cell above.
SUPPORT_CALL_PENALTY = 0.10

base_clv_score = (
    0.35 * scaled_df['total_spend'] +
    0.30 * scaled_df['tenure'] +
    0.20 * scaled_df['usage_frequency'] +
    0.15 * (1 - scaled_df['last_interaction'])
)
# More (normalized) support calls lower the final score.
scaled_df['clv_score'] = base_clv_score - (SUPPORT_CALL_PENALTY * scaled_df['support_calls'])

### Creating CLV labels
- Quantile-based binning: customers are split into three equal-sized groups (Low, Medium, High) by CLV score

In [17]:
# Split customers into equal-frequency tiers by score; one bin per label.
tier_labels = ['Low', 'Medium', 'High']
scaled_df['clv_label'] = pd.qcut(
    scaled_df['clv_score'],
    q=len(tier_labels),
    labels=tier_labels,
)

In [18]:
# Check the penalized clv_score alongside the assigned clv_label.
scaled_df.head()

Unnamed: 0,tenure,total_spend,usage_frequency,last_interaction,support_calls,clv_score,clv_label
0,0.644068,0.924444,0.448276,0.551724,0.5,0.623672,High
1,0.813559,0.507778,0.0,0.172414,1.0,0.445928,Medium
2,0.220339,0.094444,0.103448,0.068966,0.6,0.199502,Low
3,0.627119,0.328889,0.689655,0.965517,0.7,0.37635,Low
4,0.525424,0.574444,0.655172,0.655172,0.5,0.491441,Medium


### Final clv dataset 

In [19]:
# Attach the tier label to the original (unscaled) features. assign() returns a
# new frame, so clv_df itself is left untouched; rows align on the shared index.
final_clv_df = clv_df.assign(clv_label=scaled_df['clv_label'])

final_clv_df.head()

Unnamed: 0,tenure,total_spend,usage_frequency,last_interaction,support_calls,clv_label
0,39,932.0,14,17,5,High
1,49,557.0,1,6,10,Medium
2,14,185.0,4,3,6,Low
3,38,396.0,21,29,7,Low
4,32,617.0,20,20,5,Medium


In [20]:
# Write the labelled dataset to CSV, omitting the row index.
output_csv = 'clv_final_dataset.csv'
final_clv_df.to_csv(output_csv, index=False)
print('CLV final dataset saved sucessfully !!!')

CLV final dataset saved sucessfully !!!
