## Selecting existing columns from `customer_churn_dataset-training-master.csv` for customer lifetime value (CLV) analysis

In [1]:
import pandas as pd 

In [2]:
# Load the raw churn training data; the relative path is resolved against the working directory.
clv = pd.read_csv(filepath_or_buffer='customer_churn_dataset-training-master.csv')

In [3]:
# Preview the first five rows of the loaded frame.
clv.head(n=5)

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0


In [4]:
# Display the column labels of the loaded frame.
clv.columns

Index(['CustomerID', 'Age', 'Gender', 'Tenure', 'Usage Frequency',
       'Support Calls', 'Payment Delay', 'Subscription Type',
       'Contract Length', 'Total Spend', 'Last Interaction', 'Churn'],
      dtype='object')

### Business-driven feature selection

In [7]:
# Keep only the columns used for the CLV analysis.
# The explicit .copy() makes the subset an independent DataFrame, so the later
# renaming and dtype conversion act on its own data rather than on a selection
# derived from `clv`.
clv_feature_columns = ['Tenure', 'Total Spend', 'Usage Frequency', 'Last Interaction', 'Support Calls']
columns_for_clv = clv[clv_feature_columns].copy()

In [8]:
# Preview the first five rows of the selected columns.
columns_for_clv.head(n=5)

Unnamed: 0,Tenure,Total Spend,Usage Frequency,Last Interaction,Support Calls
0,39.0,932.0,14.0,17.0,5.0
1,49.0,557.0,1.0,6.0,10.0
2,14.0,185.0,4.0,3.0,6.0
3,38.0,396.0,21.0,29.0,7.0
4,32.0,617.0,20.0,20.0,5.0


### Convert the column names to snake_case

In [17]:
# Normalise the labels: trim surrounding whitespace, lower-case, and replace
# spaces with underscores, then write them back onto the frame.
snake_case_labels = (
    columns_for_clv.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)
columns_for_clv.columns = snake_case_labels

In [18]:
# Preview the first five rows to confirm the renamed columns.
columns_for_clv.head(n=5)

Unnamed: 0,tenure,total_spend,usage_frequency,last_interaction,support_calls
0,39.0,932.0,14.0,17.0,5.0
1,49.0,557.0,1.0,6.0,10.0
2,14.0,185.0,4.0,3.0,6.0
3,38.0,396.0,21.0,29.0,7.0
4,32.0,617.0,20.0,20.0,5.0


In [21]:
# Show the rows in which every selected column is missing.
all_missing_mask = columns_for_clv.isna().all(axis=1)
columns_for_clv[all_missing_mask]

Unnamed: 0,tenure,total_spend,usage_frequency,last_interaction,support_calls
199295,,,,,


In [22]:
# Select the rows where all columns are NaN (row-wise check across the columns).
columns_for_clv.loc[columns_for_clv.isna().all(axis='columns')]

Unnamed: 0,tenure,total_spend,usage_frequency,last_interaction,support_calls
199295,,,,,


In [24]:
# Drop only the rows in which every column is missing; rows with at least one
# value are kept.
columns_for_clv = columns_for_clv.dropna(
    axis='index',
    how='all',
)

In [25]:
# Count the missing values remaining in each column.
columns_for_clv.isnull().sum()

tenure              0
total_spend         0
usage_frequency     0
last_interaction    0
support_calls       0
dtype: int64

In [26]:
# Per-column count of missing values (summing the boolean mask down the rows).
columns_for_clv.isna().sum(axis=0)

tenure              0
total_spend         0
usage_frequency     0
last_interaction    0
support_calls       0
dtype: int64

### Convert the count-like float64 columns to int (`total_spend` stays float)

In [27]:
# Columns that are rounded and stored as integers; `total_spend` is not listed
# and therefore keeps its float dtype.
int_columns = ['tenure', 'usage_frequency', 'last_interaction', 'support_calls']

# Build a new frame with .assign() instead of writing into the existing one via
# `df[cols] = ...`: the current frame is the result of a row filter (dropna),
# and item assignment on such a derived frame is the pattern that raises
# SettingWithCopyWarning. assign() replaces the named columns in their existing
# positions, so the column order is unchanged.
# 'int64' is spelled out because bare `int` maps to a platform-dependent width.
rounded_values = columns_for_clv[int_columns].round()
columns_for_clv = columns_for_clv.assign(
    **{name: rounded_values[name].astype('int64') for name in int_columns}
)

In [28]:
# Display the dtype of each column to confirm the conversion.
columns_for_clv.dtypes

tenure                int64
total_spend         float64
usage_frequency       int64
last_interaction      int64
support_calls         int64
dtype: object

In [29]:
# Write the cleaned CLV columns to disk; index=False leaves the row index out of the file.
columns_for_clv.to_csv(path_or_buf='clv_cleaned_dataset.csv', index=False)
# Confirmation message shown in the cell output once the write has returned.
print("CLV cleaned dataset saved successfully as clv_cleaned_dataset.csv")

CLV cleaned dataset saved successfully as clv_cleaned_dataset.csv


### Verify the saved CSV by reading it back

In [30]:
# Read the exported file back in and preview its first five rows.
clv2 = pd.read_csv(filepath_or_buffer='clv_cleaned_dataset.csv')
clv2.head(n=5)

Unnamed: 0,tenure,total_spend,usage_frequency,last_interaction,support_calls
0,39,932.0,14,17,5
1,49,557.0,1,6,10
2,14,185.0,4,3,6
3,38,396.0,21,29,7
4,32,617.0,20,20,5
