<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [1]</a>'.</span>

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [1]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from .logger import Logger
from datetime import datetime
import os
import seaborn as sns

ImportError: attempted relative import with no known parent package

In [None]:
logger = Logger('../data_transformation/data_transformation_log.log')
today = datetime.today().strftime("%Y\\%m\\%d")

In [None]:
cleanedDataset_path = '..\\5. Data Preparation'
csvName = 'cleaned_churn_dataset.csv'
sources = ['HuggingFace', 'Kaggle']

df_hf = pd.read_csv(os.path.join(cleanedDataset_path, sources[0], today, csvName))
df_kg = pd.read_csv(os.path.join(cleanedDataset_path, sources[1], today, csvName))

print(df_hf.shape, df_kg.shape)

master_df = pd.concat([df_hf, df_kg], ignore_index=True)
os.makedirs(os.path.join('master csv', today), exist_ok=True)
path = os.path.join('master csv', today, "cleaned_churn_dataset_master.csv")
master_df.to_csv(path, index=False)
logger.log(f'created master csv from all the date sources at {path}')

In [None]:
df = master_df

In [None]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Fill NaN with 0 (valid since tenure=0 means no charges yet)
df["TotalCharges"] = df["TotalCharges"].fillna(0)

In [None]:
multiCategories = [
    "InternetService",
    "MultipleLines",
    "OnlineSecurity",
    "OnlineBackup",
    "PaperlessBilling",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaymentMethod"]

In [None]:
# Drop customerID as it is not useful
df = df.drop(columns=["customerID"])

# convert gender to 0's and 1's
df["gender"] = df["gender"].map({"Female": 0, "Male": 1})
df = df.infer_objects(copy=False) 

# Convert binary categorical variables to numeric (Yes=1, No=0)
binary_cols = ["Partner", "Dependents", "PhoneService", "Churn", "PaperlessBilling" ]
for col in binary_cols:
    df[col] = df[col].replace({"Yes": 1, "No": 0})
    df = df.infer_objects(copy=False) 

# One-hot encode multi-category variables
df = pd.get_dummies(df, columns=multiCategories, drop_first=False)

bool_cols = df.select_dtypes(include=bool).columns
df[bool_cols] = df[bool_cols].astype(int)

# Compute correlation matrix
correlation_matrix = df.corr()

# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=False, cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Matrix Heatmap")
plt.show()

logger.log('data transformation completed, data is ready to use for training')


In [None]:
# Get correlation values
correlation_with_churn = df.corr()["Churn"].sort_values(ascending=False)

# Print the correlation values
print(correlation_with_churn)

In [None]:
df["customerID"] = master_df["customerID"]
df.to_csv(os.path.join('master csv', today, "prepared_data.csv"), index=False)