In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

## Data Preprocessing:
* Handle missing values (if any) using appropriate techniques.
* Encode categorical features into numerical format.
* Split the dataset into training, validation, and testing sets.
* Consider techniques to handle imbalanced data (if necessary).


In [2]:
df = pd.read_csv('synthetic_customer_data.csv')
df.head()

Unnamed: 0,CustomerID,Age,Gender,ContractType,MonthlyCharges,TotalCharges,TechSupport,InternetService,Tenure,PaperlessBilling,PaymentMethod,Churn,AverageMonthlyCharges,CustomerLifetimeValue
0,1,44,Female,Month-to-month,120.769391,5678.486949,No,Fiber optic,47,No,Mailed check,No,120.818871,5676.161357
1,2,38,Male,Month-to-month,62.208365,4265.407596,No,Fiber optic,69,No,Bank transfer,No,61.817501,4292.377176
2,3,46,Female,Two year,71.061143,1626.640349,Yes,DSL,24,Yes,Credit card,No,67.776681,1705.46742
3,4,55,Female,Month-to-month,37.398676,2639.984014,No,DSL,71,Yes,Mailed check,No,37.182873,2655.305972
4,5,37,Female,Month-to-month,46.861917,1541.051431,Yes,Fiber optic,34,Yes,Electronic check,No,45.325042,1593.305195


In [3]:
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values)

Missing values per column:
CustomerID               0
Age                      0
Gender                   0
ContractType             0
MonthlyCharges           0
TotalCharges             0
TechSupport              0
InternetService          0
Tenure                   0
PaperlessBilling         0
PaymentMethod            0
Churn                    0
AverageMonthlyCharges    0
CustomerLifetimeValue    0
dtype: int64


In [4]:
df_encoded = pd.get_dummies(df, columns=['Gender', 'ContractType', 'TechSupport', 'InternetService', 
                                          'PaperlessBilling', 'PaymentMethod'], drop_first=True)
df_encoded.head()

Unnamed: 0,CustomerID,Age,MonthlyCharges,TotalCharges,Tenure,Churn,AverageMonthlyCharges,CustomerLifetimeValue,Gender_Male,ContractType_One year,ContractType_Two year,TechSupport_Yes,InternetService_Fiber optic,InternetService_No,PaperlessBilling_Yes,PaymentMethod_Credit card,PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,44,120.769391,5678.486949,47,No,120.818871,5676.161357,False,False,False,False,True,False,False,False,False,True
1,2,38,62.208365,4265.407596,69,No,61.817501,4292.377176,True,False,False,False,True,False,False,False,False,False
2,3,46,71.061143,1626.640349,24,No,67.776681,1705.46742,False,False,True,True,False,False,True,True,False,False
3,4,55,37.398676,2639.984014,71,No,37.182873,2655.305972,False,False,False,False,False,False,True,False,False,True
4,5,37,46.861917,1541.051431,34,No,45.325042,1593.305195,False,False,False,True,True,False,True,False,True,False


In [5]:
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [6]:
scaler = StandardScaler()
numerical_features = ['Age', 'MonthlyCharges', 'TotalCharges', 'Tenure', 
                      'AverageMonthlyCharges', 'CustomerLifetimeValue']

df_encoded[numerical_features] = scaler.fit_transform(df_encoded[numerical_features])

df_encoded.head()

Unnamed: 0,CustomerID,Age,MonthlyCharges,TotalCharges,Tenure,Churn,AverageMonthlyCharges,CustomerLifetimeValue,Gender_Male,ContractType_One year,ContractType_Two year,TechSupport_Yes,InternetService_Fiber optic,InternetService_No,PaperlessBilling_Yes,PaymentMethod_Credit card,PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,0.447589,1.663891,1.599928,0.51505,No,1.691024,1.621125,False,False,False,False,True,False,False,False,False,True
1,2,-0.163788,-0.306977,0.866543,1.574606,No,-0.317108,0.894696,True,False,False,False,True,False,False,False,False,False
2,3,0.651381,-0.009038,-0.502971,-0.592667,No,-0.114285,-0.463324,False,False,True,True,False,False,True,True,False,False
3,4,1.568447,-1.141946,0.022952,1.670929,No,-1.155556,0.035302,False,False,False,False,False,False,True,False,False,True
4,5,-0.265684,-0.823461,-0.547392,-0.111051,No,-0.878434,-0.522204,False,False,False,True,True,False,True,False,True,False


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [8]:
df_encoded['Tenure_MonthlyCharges'] = df_encoded['Tenure'] * df_encoded['MonthlyCharges']

pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

In [9]:
print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Test set shape:", X_test.shape)

Training set shape: (4836, 17)
Validation set shape: (1612, 17)
Test set shape: (1612, 17)


## Feature Engineering

In [10]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

In [11]:
print("Tenure column shape:", df_encoded['Tenure'].shape)
print("First few entries in Tenure:", df_encoded['Tenure'].head())

Tenure column shape: (5000,)
First few entries in Tenure: 0    0.515050
1    1.574606
2   -0.592667
3    1.670929
4   -0.111051
Name: Tenure, dtype: float64


In [12]:
if df_encoded['Tenure'].ndim > 1:
    df_encoded['Tenure'] = df_encoded['Tenure'].apply(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)

if df_encoded['MonthlyCharges'].ndim > 1:
    df_encoded['MonthlyCharges'] = df_encoded['MonthlyCharges'].apply(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)

df_encoded['Tenure'] = pd.to_numeric(df_encoded['Tenure'], errors='coerce')
df_encoded['MonthlyCharges'] = pd.to_numeric(df_encoded['MonthlyCharges'], errors='coerce')


In [15]:
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
poly_features = poly.fit_transform(df_encoded[['Tenure', 'MonthlyCharges', 'TotalCharges']])

poly_df = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(['Tenure', 'MonthlyCharges', 'TotalCharges']))
df_encoded = pd.concat([df_encoded, poly_df], axis=1)

In [16]:
df_encoded['Tenure_Bin'] = pd.cut(df_encoded['Tenure'], 
                                  bins=[0, 12, 24, 36, 48, 60], 
                                  labels=['0-12 months', '13-24 months', '25-36 months', '37-48 months', '49-60 months'],
                                  include_lowest=True)

df_encoded['MonthlyCharges_Bin'] = pd.cut(df_encoded['MonthlyCharges'], 
                                          bins=[0, 30, 60, 90, 120], 
                                          labels=['Low', 'Medium', 'High', 'Very High'],
                                          include_lowest=True)

ValueError: Input array must be 1 dimensional

In [None]:
df_encoded['AvgMonthlyCharges'] = df_encoded['TotalCharges'] / df_encoded['Tenure']
df_encoded['TechSupport_Spent'] = np.where(df_encoded['TechSupport_Yes'] == 1, 
                                           df_encoded['MonthlyCharges'] * df_encoded['Tenure'], 0)


In [None]:
df_encoded['Log_TotalCharges'] = np.log1p(df_encoded['TotalCharges'])
df_encoded['Log_MonthlyCharges'] = np.log1p(df_encoded['MonthlyCharges'])

In [None]:
rare_payments = df_encoded['PaymentMethod'].value_counts(normalize=True) < 0.05
df_encoded['PaymentMethod'] = df_encoded['PaymentMethod'].apply(lambda x: 'Other' if rare_payments.loc[x] else x)

In [None]:
contract_churn_rate = df_encoded.groupby('ContractType_One year')['Churn'].mean()
df_encoded['ContractType_One year_Churn'] = df_encoded['ContractType_One year'].map(contract_churn_rate)

In [None]:
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)

In [None]:
selected_features = X.columns[selector.get_support()]
print("Selected Features:", selected_features)

X_train_final = X[selected_features]
X_val_final = X[selected_features]
X_test_final = X[selected_features]

print("Final dataset shape:", X_train_final.shape)
print(X_train_final.head())