<a href="https://colab.research.google.com/github/weixinluo524/AAI2026/blob/main/coding_exercise_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Part 1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1) Load generated dataset (generated from ChatGPT with more than 500+record)
df = pd.read_csv("housing_price_footage_location.csv")

# 2) Rename data columns to match the code
df = df.rename(columns={"footage": "square_footage"})

# Features and target
X = df[['square_footage', 'location']]
y = df['price']
# Preprocessing: One-hot encode the location column
preprocessor = ColumnTransformer(
transformers=[
('location', OneHotEncoder(sparse_output=False), ['location'])
], remainder='passthrough')
# Create pipeline with preprocessing and model
model = Pipeline(steps=[
('preprocessor', preprocessor),
('regressor', LinearRegression())
])
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)
# Train model
model.fit(X_train, y_train)
# Make prediction for a new house: 2000 sq ft in Downtown
new_house = pd.DataFrame({'square_footage': [2000], 'location': ['Downtown']})
predicted_price = model.predict(new_house)
print(f"Predicted price for a 2000 sq ft house in Downtown: ${predicted_price[0]:,.2f}")
# Display model coefficients
feature_names = (model.named_steps['preprocessor']
.named_transformers_['location']
.get_feature_names_out(['location'])).tolist() + ['square_footage']
coefficients = model.named_steps['regressor'].coef_
print("\nModel Coefficients:")
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef:.2f}")

Predicted price for a 2000 sq ft house in Downtown: $452,598.71

Model Coefficients:
location_Downtown: 1402.26
location_Rural: 7875.61
location_Suburb: -9277.87
square_footage: 203.11


In [2]:
#Part 2
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

# Load Telco churn dataset
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

# --- Cleaning ---
# TotalCharges sometimes has blanks -> coerce to numeric (blanks become NaN)
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Drop identifier
if "customerID" in df.columns:
    df = df.drop(columns=["customerID"])

# Target: Churn Yes/No -> 1/0
df["Churn"] = (df["Churn"].astype(str).str.strip().str.lower() == "yes").astype(int)

# Columns
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
num_cols = [c for c in num_cols if c != "Churn"]
cat_cols = [c for c in df.columns if c not in num_cols + ["Churn"]]

# --- Simple imputation ---
for c in num_cols:
    df[c] = df[c].fillna(df[c].median())

for c in cat_cols:
    mode_val = df[c].mode(dropna=True)
    df[c] = df[c].fillna(mode_val.iloc[0] if not mode_val.empty else "Unknown")

# Features/target
X = df.drop(columns=["Churn"])
y = df["Churn"]

# --- Preprocessing + model pipeline ---
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=2000, random_state=42)),
    ]
)

# --- Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Train ---
model.fit(X_train, y_train)

# --- Evaluate ---
proba = model.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

acc = accuracy_score(y_test, pred)
auc = roc_auc_score(y_test, proba)
cm = confusion_matrix(y_test, pred)
report = classification_report(y_test, pred, digits=3)

print(f"Accuracy: {acc:.3f}")
print(f"ROC AUC:  {auc:.3f}")
print("Confusion matrix [[TN, FP],[FN, TP]]:")
print(cm)
print("\nClassification report:")
print(report)

# --- Predict for a new customer (example: "typical" customer) ---
example = {}
for c in num_cols:
    example[c] = float(df[c].median())
for c in cat_cols:
    mode_val = df[c].mode(dropna=True)
    example[c] = mode_val.iloc[0] if not mode_val.empty else "Unknown"

new_customer = pd.DataFrame([example])

churn_probability = float(model.predict_proba(new_customer)[0, 1])
churn_prediction = int(churn_probability >= 0.5)

print("\nNew customer (example) churn probability:", round(churn_probability, 3))
print("New customer churn prediction (1=churn, 0=no churn):", churn_prediction)

# --- Coefficients with feature names ---
ohe = model.named_steps["preprocessor"].named_transformers_["cat"]
cat_feature_names = ohe.get_feature_names_out(cat_cols).tolist()
feature_names = num_cols + cat_feature_names

coefficients = model.named_steps["classifier"].coef_[0]
coef_df = pd.DataFrame({"feature": feature_names, "coef": coefficients})
coef_df["abs_coef"] = coef_df["coef"].abs()

print("\nTop 10 positive coefficients (more churn):")
print(coef_df.sort_values("coef", ascending=False).head(10)[["feature", "coef"]])

print("\nTop 10 negative coefficients (less churn):")
print(coef_df.sort_values("coef", ascending=True).head(10)[["feature", "coef"]])


Accuracy: 0.806
ROC AUC:  0.842
Confusion matrix [[TN, FP],[FN, TP]]:
[[926 109]
 [165 209]]

Classification report:
              precision    recall  f1-score   support

           0      0.849     0.895     0.871      1035
           1      0.657     0.559     0.604       374

    accuracy                          0.806      1409
   macro avg      0.753     0.727     0.738      1409
weighted avg      0.798     0.806     0.800      1409


New customer (example) churn probability: 0.439
New customer churn prediction (1=churn, 0=no churn): 0

Top 10 positive coefficients (more churn):
                           feature      coef
16     InternetService_Fiber optic  0.640185
36         Contract_Month-to-month  0.579853
3                     TotalCharges  0.516280
35             StreamingMovies_Yes  0.204145
32                 StreamingTV_Yes  0.203591
43  PaymentMethod_Electronic check  0.198382
18               OnlineSecurity_No  0.156831
27                  TechSupport_No  0.132073
14 

In [None]:
#Part3
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load dataset. I used the same Dataset from Part 2
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Clean TotalCharges column (it contains blanks)
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df = df.dropna(subset=["TotalCharges"])

# Select numerical features for clustering
features = ["tenure", "MonthlyCharges", "TotalCharges"]

X = df[features]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow Method (K = 1 to 5)
inertia = []
K = range(1, 6)

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot elbow curve
plt.figure(figsize=(8, 5))
plt.plot(list(K), inertia, marker="o")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia")
plt.title("Elbow Method for Optimal K")
plt.savefig("elbow_plot.png")
plt.close()

# Apply K-Means with K=3
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(X_scaled)

# Analyze cluster characteristics
cluster_summary = df.groupby("cluster")[features].mean().round(2)

print("Cluster Characteristics:")
print(cluster_summary)

# Example targeted strategies
for cluster in range(optimal_k):
    print(f"\nCluster {cluster} Strategy:")

    if cluster_summary.loc[cluster, "MonthlyCharges"] > 70:
        print("High monthly charges customers: Offer loyalty discounts.")
    elif cluster_summary.loc[cluster, "tenure"] > 40:
        print("Long-tenure customers: Provide VIP retention benefits.")
    else:
        print("New or low-value customers: Offer onboarding incentives.")

# Save cluster assignments
df.to_csv("customer_segments.csv", index=False)


Cluster Characteristics:
         tenure  MonthlyCharges  TotalCharges
cluster                                      
0         29.62           26.63        815.26
1         58.57           89.74       5249.43
2         13.26           74.97       1031.88

Cluster 0 Strategy:
New or low-value customers: Offer onboarding incentives.

Cluster 1 Strategy:
High monthly charges customers: Offer loyalty discounts.

Cluster 2 Strategy:
High monthly charges customers: Offer loyalty discounts.
