In [1]:
import pandas as pd
import numpy as np

## Data Generation:

Generate a synthetic dataset of 5000 customer records containing the following features:

* CustomerID
* Age
* Gender
* ContractType (Month-to-month, One year, Two year)
* MonthlyCharges
* TotalCharges
* TechSupport
* InternetService (DSL, Fiber optic, No)
* Tenure
* PaperlessBilling
* PaymentMethod
* Churn (Yes/No)

1. Introduce realistic distributions, correlations, and outliers to the data.
2. Ensure a target churn rate of approximately 20%.
3. Create derived features such as average monthly charges (`AverageMonthlyCharges`) and customer lifetime value (`CustomerLifetimeValue`).

In [2]:
# Number of synthetic customer records to generate.
n_records = 5_000

# Seed NumPy's legacy global RNG so the np.random.* draws that follow are repeatable.
np.random.seed(seed=42)

In [3]:
# Build one NumPy array per feature, each of length n_records.
# NOTE: every np.random.* call consumes the global RNG stream, so the ORDER of
# the calls below determines the generated values; keep it when editing.

# Sequential identifiers 1..n_records.
customer_ids = np.arange(1, n_records + 1)

# Age: normal draw (mean 40, sd 10), truncated to an integer, limited to 18-80.
ages = np.random.normal(loc=40, scale=10, size=n_records).astype(int).clip(18, 80)

# Gender: uniform choice between the two labels.
genders = np.random.choice(['Male', 'Female'], size=n_records)

# Contract type: month-to-month is weighted at 60%, the two fixed terms at 20% each.
contract_types = np.random.choice(
    ['Month-to-month', 'One year', 'Two year'],
    size=n_records,
    p=[0.6, 0.2, 0.2],
)

# Monthly charges: normal draw (mean 70, sd 30), limited to 20-150.
monthly_charges = np.clip(np.random.normal(loc=70, scale=30, size=n_records), 20, 150)

# Tenure: integers from 1 to 72 (randint's upper bound is exclusive).
tenure = np.random.randint(low=1, high=73, size=n_records)

# Total charges: monthly charge times tenure plus normal noise (sd 50),
# with a floor of 20 and no upper limit.
total_charges = np.clip(
    monthly_charges * tenure + np.random.normal(loc=0, scale=50, size=n_records),
    20,
    None,
)

# Tech support flag: uniform Yes/No.
tech_support = np.random.choice(['Yes', 'No'], size=n_records)

# Internet service: weighted 30% DSL, 50% fiber, 20% none.
internet_service = np.random.choice(
    ['DSL', 'Fiber optic', 'No'],
    size=n_records,
    p=[0.3, 0.5, 0.2],
)

# Paperless billing flag: uniform Yes/No.
paperless_billing = np.random.choice(['Yes', 'No'], size=n_records)

# Payment method: four options with explicit equal weights.
payment_method = np.random.choice(
    ['Credit card', 'Bank transfer', 'Electronic check', 'Mailed check'],
    size=n_records,
    p=[0.25, 0.25, 0.25, 0.25],
)

# Churn label: drawn independently of every other feature, 20% 'Yes'.
churn = np.random.choice(['Yes', 'No'], size=n_records, p=[0.2, 0.8])

# Derived features.
# The np.where guard substitutes a divisor of 1 for any non-positive tenure;
# tenure as generated above is always >= 1, so the guard is purely defensive.
average_monthly_charges = total_charges / np.where(tenure > 0, tenure, 1)
# Noise-free counterpart of total_charges.
customer_lifetime_value = monthly_charges * tenure

In [4]:
# Map each output column name to its generated array; the insertion order of
# this dict is the column order of the resulting DataFrame.
customer_columns = {
    'CustomerID': customer_ids,
    'Age': ages,
    'Gender': genders,
    'ContractType': contract_types,
    'MonthlyCharges': monthly_charges,
    'TotalCharges': total_charges,
    'TechSupport': tech_support,
    'InternetService': internet_service,
    'Tenure': tenure,
    'PaperlessBilling': paperless_billing,
    'PaymentMethod': payment_method,
    'Churn': churn,
    'AverageMonthlyCharges': average_monthly_charges,
    'CustomerLifetimeValue': customer_lifetime_value,
}

# Assemble the per-feature arrays into a single table, one row per customer.
df = pd.DataFrame(customer_columns)

In [5]:
# Inject a handful of extreme-charge outliers: pick 5 distinct rows at random
# and inflate their charges five-fold.
outlier_indices = np.random.choice(df.index, size=5, replace=False)

# AverageMonthlyCharges (TotalCharges / Tenure) and CustomerLifetimeValue
# (Tenure * MonthlyCharges) were computed before this cell runs, so they must be
# scaled together with the base columns; otherwise the outlier rows would carry
# derived values that no longer match their MonthlyCharges / TotalCharges.
# Both derived features are linear in the charges, so the same factor applies.
outlier_columns = [
    'MonthlyCharges',
    'TotalCharges',
    'AverageMonthlyCharges',
    'CustomerLifetimeValue',
]
# NOTE: this mutates df in place and draws fresh indices on every execution,
# so re-running this cell alone adds further outliers; re-run from the top instead.
df.loc[outlier_indices, outlier_columns] *= 5

In [6]:
# Preview the first five rows as the cell's displayed output.
df.head(n=5)

Unnamed: 0,CustomerID,Age,Gender,ContractType,MonthlyCharges,TotalCharges,TechSupport,InternetService,Tenure,PaperlessBilling,PaymentMethod,Churn,AverageMonthlyCharges,CustomerLifetimeValue
0,1,44,Female,Month-to-month,120.769391,5678.486949,No,Fiber optic,47,No,Mailed check,No,120.818871,5676.161357
1,2,38,Male,Month-to-month,62.208365,4265.407596,No,Fiber optic,69,No,Bank transfer,No,61.817501,4292.377176
2,3,46,Female,Two year,71.061143,1626.640349,Yes,DSL,24,Yes,Credit card,No,67.776681,1705.46742
3,4,55,Female,Month-to-month,37.398676,2639.984014,No,DSL,71,Yes,Mailed check,No,37.182873,2655.305972
4,5,37,Female,Month-to-month,46.861917,1541.051431,Yes,Fiber optic,34,Yes,Electronic check,No,45.325042,1593.305195


In [7]:
# Write the dataset to a CSV file in the working directory, omitting the row index.
df.to_csv(path_or_buf='synthetic_customer_data.csv', index=False)