In [1]:
import numpy as np
import pandas as pd

In [2]:
# Display the installed NumPy version string.
np.__version__

'1.24.3'

In [3]:
# Number of simulated samples (rows).
n = 4000
# Seed NumPy's global RNG. Every draw below consumes from this single stream, so the
# order of the following statements determines the generated values.
np.random.seed(1234)

# Integer ages drawn with randint(low=20, high=80).
age = np.random.randint(low=20, high=80, size=n)
# Integer monthly charges drawn with randint(low=0, high=3000).
monthly_charges = np.random.randint(low=0, high=3000, size=n)
# Tenure is tied to age: a random integer from randint(0, 40) plus age / 2 minus 9, cast to int.
tenure = (np.random.randint(low=0, high=40, size=n) + age / 2 - 9).astype('int')
# Integer visit counts drawn with randint(low=0, high=10).
n_monthly_visits = np.random.randint(low=0, high=10, size=n)
# Product count derived from monthly_charges plus Gaussian noise (randn * 500),
# floor-divided by 250 and clipped from below at 1. The result stays floating point.
monthly_products = np.maximum(1, (monthly_charges + np.random.randn(n) * 500) // 250)
# Total charges: monthly_charges * tenure scaled by a random factor (rand + 0.5), rounded to 2 decimals.
total_charges = np.round(monthly_charges * (np.random.rand(n) + 0.5) * tenure, 2)
# Two pure-noise columns (uniform and standard normal draws, rounded to 2 decimals);
# neither appears in the churn_prob formula computed later in this notebook.
irrelevant_1 = np.round(np.random.rand(n), 2)
irrelevant_2 = np.round(np.random.randn(n), 2)

In [4]:
def min_median_max(vec):
    """Print the minimum, median and maximum of ``vec`` on a single line.

    Args:
        vec: Array-like of numbers accepted by ``np.min``, ``np.median`` and ``np.max``.

    Returns:
        None. The summary is written to stdout.
    """
    lowest = np.min(vec)
    middle = np.median(vec)
    highest = np.max(vec)
    print(f'min = {lowest}, median = {middle}, max = {highest}')

In [5]:
# Print a min/median/max summary for every simulated column.
# The arrays are referenced directly through an explicit name -> array mapping rather
# than being looked up with eval() on a name string: a typo now fails immediately as a
# NameError at the mapping, and no string is ever executed as code.
summary_inputs = {
    'age': age,
    'monthly_charges': monthly_charges,
    'tenure': tenure,
    'n_monthly_visits': n_monthly_visits,
    'monthly_products': monthly_products,
    'total_charges': total_charges,
    'irrelevant_1': irrelevant_1,
    'irrelevant_2': irrelevant_2,
}
# Dicts preserve insertion order, so the columns are reported in the order listed above.
for var, values in summary_inputs.items():
    print(var)
    min_median_max(values)

age
min = 20, median = 50.0, max = 79
monthly_charges
min = 0, median = 1496.0, max = 2998
tenure
min = 1, median = 35.0, max = 69
n_monthly_visits
min = 0, median = 5.0, max = 9
monthly_products
min = 1.0, median = 5.0, max = 17.0
total_charges
min = 0.0, median = 41061.36, max = 279094.67
irrelevant_1
min = 0.0, median = 0.49, max = 1.0
irrelevant_2
min = -3.9, median = -0.04, max = 3.6


In [6]:
# Raw (unnormalised) churn score. Its terms, summed left to right:
#   1. squared, scaled deviation of age from 50
#   2. interaction of the scaled age deviation with the scaled monthly-charges deviation
#   3. a penalty of 0.05 per monthly visit
#   4. a bonus of 0.005 per unit of tenure
#   5. uniform noise from rand(n) / 20
# The result is not yet confined to [0, 1]; see the printed min/max.
age_term = (age - 50) / 30
charges_term = (monthly_charges - 1500) / 1500
churn_prob = (
    age_term**2
    + age_term * charges_term * 0.5
    + n_monthly_visits * (-0.05)
    + tenure * 0.005
    + np.random.rand(n) / 20
)
min_median_max(churn_prob)

min = -0.42354339079424425, median = 0.24824395851282655, max = 1.6432398225152607


In [7]:
# Min-max rescale the churn score so that its smallest value maps to 0 and its largest to 1.
score_min = churn_prob.min()
score_max = churn_prob.max()
churn_prob = (churn_prob - score_min) / (score_max - score_min)
min_median_max(churn_prob)

min = 0.0, median = 0.32504006466713514, max = 1.0


In [8]:
# Collect the simulated columns into a name -> array mapping.
# The arrays are referenced directly instead of being resolved with eval() on a list of
# name strings, so no string is executed as code and a misspelled variable fails right here.
features = {
    'age': age,
    'monthly_charges': monthly_charges,
    'tenure': tenure,
    'n_monthly_visits': n_monthly_visits,
    'monthly_products': monthly_products,
    'total_charges': total_charges,
    'irrelevant_1': irrelevant_1,
    'irrelevant_2': irrelevant_2,
}
# Feature matrix: one column per entry, in the order listed above.
X = pd.DataFrame(features)
# Binary target: 1 where churn_prob exceeds 0.5, otherwise 0.
y = pd.Series(np.where(churn_prob > 0.5, 1, 0))

In [9]:
# Display the feature DataFrame.
X

Unnamed: 0,age,monthly_charges,tenure,n_monthly_visits,monthly_products,total_charges,irrelevant_1,irrelevant_2
0,67,2503,63,6,12.0,233761.47,0.45,1.24
1,39,791,22,9,2.0,9799.07,0.33,-1.35
2,58,2499,25,2,11.0,53397.44,0.97,-0.78
3,73,1761,32,1,7.0,59443.79,0.11,-0.39
4,32,1814,12,7,9.0,31474.97,0.22,0.80
...,...,...,...,...,...,...,...,...
3995,41,857,13,1,5.0,8138.96,0.50,-1.35
3996,45,1815,14,1,7.0,35799.04,0.43,-0.38
3997,20,959,9,4,5.0,11551.49,0.00,-0.80
3998,55,1813,46,9,3.0,91540.89,0.63,-1.66


In [10]:
# Display the binary target Series.
y

0       0
1       0
2       0
3       1
4       0
       ..
3995    0
3996    0
3997    1
3998    0
3999    0
Length: 4000, dtype: int32

In [11]:
# Count how many samples fall into each target class.
y.value_counts()

0    3140
1     860
Name: count, dtype: int64

In [12]:
# Fraction of samples labelled 1.
# Divide by len(y) (an int) rather than y.shape (a 1-tuple): dividing by the tuple
# broadcasts and yields a one-element array instead of a plain scalar.
y.value_counts()[1] / len(y)

0.215

In [13]:
# Write the feature matrix to CSV; index=False keeps the row index out of the file.
X.to_csv(
    '../datasets/churn_simulated/X.csv',
    index=False,
)

In [16]:
# Write the target to CSV as bare values: neither the row index nor a header line is emitted.
y.to_csv(
    '../datasets/churn_simulated/y.csv',
    index=False,
    header=False,
)