# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the customer-churn dataset from the current working directory.
df = pd.read_csv("customer-churn.csv")

# Preview the first five rows. A bare `df.head()` is only rendered when it is
# the last expression of a notebook cell, so print it explicitly.
print(df.head())

# Column dtypes and non-null counts. `DataFrame.info()` writes its report
# itself and returns None, so wrapping it in print() would emit a stray "None".
df.info()

# Number of missing values per column.
print(df.isnull().sum())

# TotalCharges should be numeric but is read as object; convert it and turn
# any value that cannot be parsed into NaN.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors='coerce')

# Fill the resulting NaNs with the column median. Assign the result back
# instead of calling fillna(inplace=True) on a column selection: that chained
# in-place form is deprecated and leaves `df` unchanged under Copy-on-Write.
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())

# Target distribution: bar chart of the row count for each Churn value.
churn_ax = sns.countplot(x='Churn', data=df)
churn_ax.set_title("Distribusi Churn")
plt.show()

# Share of each Churn value, expressed in percent.
churn_share = df['Churn'].value_counts(normalize=True)
print(churn_share * 100)

# Pairwise correlation between the numeric columns, drawn as an annotated
# heatmap on an 8x6 inch figure.
plt.figure(figsize=(8, 6))
numeric_corr = df.corr(numeric_only=True)
sns.heatmap(numeric_corr, cmap="coolwarm", annot=True)
plt.show()

# Drop the ID column; it identifies a row and carries no predictive signal.
df = df.drop(columns="customerID")

# Remaining text (object-dtype) columns that need a numeric representation.
categorical_cols = df.select_dtypes(include='object').columns.tolist()

# Label-encode only the target. Keeping a dedicated encoder for it means
# `label_enc.classes_` / `label_enc.inverse_transform` can still map the
# model's predictions back to the original labels (refitting one encoder on
# every column would overwrite that mapping on each iteration).
label_enc = LabelEncoder()
if "Churn" in categorical_cols:
    df["Churn"] = label_enc.fit_transform(df["Churn"])
    categorical_cols.remove("Churn")

# One-hot encode the categorical features. Integer label codes would impose an
# arbitrary ordering on nominal categories, which a linear model interprets as
# magnitude. drop_first avoids a redundant (perfectly collinear) indicator per
# feature; dtype=int keeps the frame purely numeric for the scaler.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)

# Separate the feature matrix from the target column.
X = df.drop("Churn", axis=1)
y = df["Churn"]

# Split BEFORE scaling so that the held-out rows never contribute to the
# scaler's mean/variance (fitting the scaler on all rows leaks test-set
# information into training). stratify=y keeps the churn ratio the same in
# both partitions, which matters because the classes are imbalanced.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features: learn the statistics from the training rows only,
# then apply the same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with code that reads X_scaled: the full
# feature matrix transformed with the training-set statistics.
X_scaled = scaler.transform(X)

# Train a logistic-regression classifier on the training partition.
# max_iter is raised to 1000 to give the solver more iterations to converge.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out test partition.
y_pred = model.predict(X_test)

# Evaluation: compute and print each metric in turn, comparing the true test
# labels with the predictions.
evaluation_steps = (
    ("Accuracy:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
)
for heading, metric in evaluation_steps:
    print(heading, metric(y_test, y_pred))