In [None]:

%pip -q uninstall -y pycaret sktime tsfresh plotnine arviz pytensor umap-learn numba || true



In [None]:
%pip -q install --upgrade pip wheel
%pip -q install flaml==2.3.2 xgboost==2.1.1 lightgbm==4.5.0 catboost==1.2.7
%pip -q install mlxtend==0.23.0 scikit-learn==1.5.2 pandas==2.2.2 statsmodels==0.14.3 pmdarima==2.0.4
import pandas as pd, sklearn, xgboost
print("Ready ✅  pandas", pd.__version__, "| sklearn", sklearn.__version__)


Ready ✅  pandas 2.2.2 | sklearn 1.5.2


A1) Classification — Binary (Telco Churn) with FLAML (AutoML)

In [None]:
# Data
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(url)
df['Churn'] = (df['Churn'].str.strip().map({'Yes':1,'No':0})).astype(int)

# Split
from sklearn.model_selection import train_test_split
X = df.drop(columns=['Churn'])
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# AutoML
from flaml import AutoML
automl = AutoML()
automl.fit(
    X_train, y_train,
    task="classification",
    time_budget=120,          # seconds
    metric="roc_auc",
    estimator_list=["xgboost","lgbm","catboost","extra_tree","rf"],
    eval_method="cv",
    n_splits=5
)

print("Best model:", automl.best_estimator, "| ROC_AUC:", automl.best_loss)
from sklearn.metrics import roc_auc_score, classification_report
pred_proba = automl.predict_proba(X_test)[:,1]
pred = automl.predict(X_test)
print("Test ROC_AUC:", roc_auc_score(y_test, pred_proba))
print(classification_report(y_test, pred))

# Save
import joblib, os
os.makedirs("models", exist_ok=True)
joblib.dump(automl, "models/telco_churn_automl.pkl")


[flaml.automl.logger: 10-27 07:58:31] {1728} INFO - task = classification
[flaml.automl.logger: 10-27 07:58:31] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 10-27 07:58:31] {1838} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 10-27 07:58:31] {1955} INFO - List of ML learners in AutoML Run: ['xgboost', 'lgbm', 'catboost', 'extra_tree', 'rf']
[flaml.automl.logger: 10-27 07:58:31] {2258} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 10-27 07:58:31] {2393} INFO - Estimated sufficient time budget=7435s. Estimated necessary time budget=10s.
[flaml.automl.logger: 10-27 07:58:31] {2442} INFO -  at 0.9s,	estimator xgboost's best error=0.1892,	best estimator xgboost's best error=0.1892
[flaml.automl.logger: 10-27 07:58:31] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 10-27 07:58:33] {2442} INFO -  at 2.1s,	estimator lgbm's best error=0.1877,	best estimator lgbm's best error=0.1877
[flaml.automl.logger: 10-27 07:58:33] {2

['models/telco_churn_automl.pkl']

A2) Classification — Multiclass (Dry Beans)

In [None]:
import pandas as pd, numpy as np

urls = [
    "https://cmustatistics.github.io/data-repository/data/dry-beans.csv",   # CMU mirror
    "https://raw.githubusercontent.com/Filiplindgren/Dry-Bean-Dataset/main/Dry_Bean_Dataset.csv",  # GitHub mirror
]

last_err = None
for u in urls:
    try:
        df = pd.read_csv(u)
        print("Loaded:", u, "shape:", df.shape)
        break
    except Exception as e:
        last_err = e
else:
    raise RuntimeError(f"Could not load dataset from mirrors. Last error: {last_err}")

# Normalize column names to match our script expectations
rename_map = {
    "AspectRation":"AspectRation",  # CMU already uses this spelling
    "roundness":"Roundness",        # some mirrors use lowercase
    "EquivDiameter":"EquivDiameter",
    "MajorAxisLength":"MajorAxisLength",
    "MinorAxisLength":"MinorAxisLength",
}
df = df.rename(columns=rename_map)

# Ensure we have the exact columns we planned to use
cols = ["Area","Perimeter","MajorAxisLength","MinorAxisLength","AspectRation","Eccentricity",
        "ConvexArea","EquivDiameter","Extent","Solidity","Roundness","Compactness",
        "ShapeFactor1","ShapeFactor2","ShapeFactor3","ShapeFactor4","Class"]
missing = [c for c in cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}. Got: {sorted(df.columns.tolist())}")

# --- split + train (same as before) ---
from sklearn.model_selection import train_test_split
X = df.drop(columns=['Class'])
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

from flaml import AutoML
automl = AutoML()
automl.fit(
    X_train, y_train,
    task="classification",
    time_budget=120,
    metric="accuracy",
    estimator_list=["lgbm","xgboost","rf","extra_tree"],
    eval_method="cv",
    n_splits=5
)

from sklearn.metrics import accuracy_score
pred = automl.predict(X_test)
print("Best:", automl.best_estimator)
print("Accuracy:", accuracy_score(y_test, pred))


Loaded: https://cmustatistics.github.io/data-repository/data/dry-beans.csv shape: (13611, 17)
[flaml.automl.logger: 10-27 08:02:21] {1728} INFO - task = classification
[flaml.automl.logger: 10-27 08:02:21] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 10-27 08:02:21] {1838} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 10-27 08:02:21] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'xgboost', 'rf', 'extra_tree']
[flaml.automl.logger: 10-27 08:02:21] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 10-27 08:02:22] {2393} INFO - Estimated sufficient time budget=7661s. Estimated necessary time budget=8s.
[flaml.automl.logger: 10-27 08:02:22] {2442} INFO -  at 0.8s,	estimator lgbm's best error=0.1124,	best estimator lgbm's best error=0.1124
[flaml.automl.logger: 10-27 08:02:22] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 10-27 08:02:22] {2442} INFO -  at 1.6s,	estimator lgbm's best error=0.1124,	best es

A3) Regression — Bike Sharing (day.csv)

In [None]:
import pandas as pd

# Robust mirror list (RAW links)
mirrors = [
    "https://raw.githubusercontent.com/akumarss/UCI-bike-sharing/master/UCI_BikeSharing_day.csv",  # UCI day.csv
    "https://raw.githubusercontent.com/danwild/bike-share-prediction/master/Bike-Sharing-Dataset/day.csv",
    "https://raw.githubusercontent.com/cloudxlab/ml/master/machine_learning/datasets/bike_sharing/day.csv",
]

last_err = None
for url in mirrors:
    try:
        df = pd.read_csv(url)
        print("Loaded:", url, "shape:", df.shape)
        break
    except Exception as e:
        last_err = e
else:
    raise RuntimeError(f"All mirrors failed. Last error: {last_err}")

# Standard columns for the day file:
# ['instant','dteday','season','yr','mnth','holiday','weekday','workingday',
#  'weathersit','temp','atemp','hum','windspeed','casual','registered','cnt']

# --- Train/test split & FLAML regression (as before) ---
from sklearn.model_selection import train_test_split
X = df.drop(columns=['cnt','instant','dteday'], errors='ignore')
y = df['cnt']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

from flaml import AutoML
automl = AutoML()
automl.fit(
    X_train, y_train,
    task="regression",
    time_budget=120,
    metric="r2",
    estimator_list=["xgboost","lgbm","rf","extra_tree"],
    eval_method="cv",
    n_splits=5
)

from sklearn.metrics import r2_score, mean_absolute_error
pred = automl.predict(X_test)
print("Best:", automl.best_estimator)
print("R2:", r2_score(y_test, pred), "MAE:", mean_absolute_error(y_test, pred))


Loaded: https://raw.githubusercontent.com/danwild/bike-share-prediction/master/Bike-Sharing-Dataset/day.csv shape: (731, 16)
[flaml.automl.logger: 10-27 08:06:05] {1728} INFO - task = regression
[flaml.automl.logger: 10-27 08:06:05] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 10-27 08:06:05] {1838} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 10-27 08:06:05] {1955} INFO - List of ML learners in AutoML Run: ['xgboost', 'lgbm', 'rf', 'extra_tree']
[flaml.automl.logger: 10-27 08:06:05] {2258} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 10-27 08:06:05] {2393} INFO - Estimated sufficient time budget=1195s. Estimated necessary time budget=1s.
[flaml.automl.logger: 10-27 08:06:05] {2442} INFO -  at 0.1s,	estimator xgboost's best error=0.5426,	best estimator xgboost's best error=0.5426
[flaml.automl.logger: 10-27 08:06:05] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 10-27 08:06:05] {2442} INFO -  at 0.2s,	estimator lgb

B) Clustering — Mall Customers (KMeans + elbow/silhouette)

In [None]:
import pandas as pd, numpy as np

# --- Try multiple mirrors for Mall_Customers.csv ---
mirrors = [
    "https://raw.githubusercontent.com/vikashmaini/Mall-Customer-Segmentation/master/Mall_Customers.csv",
    "https://raw.githubusercontent.com/sharmaroshan/Mall-Customers-Segmentation/master/Mall_Customers.csv",
    "https://raw.githubusercontent.com/ArinB/MSDS7333_Machine_Learning_1/master/Unit%202/Mall_Customers.csv",
]

df, last_err = None, None
for url in mirrors:
    try:
        tmp = pd.read_csv(url)
        if {"Age","Annual Income (k$)","Spending Score (1-100)"} <= set(tmp.columns):
            df = tmp.copy()
            print("Loaded:", url, "shape:", df.shape)
            break
    except Exception as e:
        last_err = e

# --- Fallback: synthesize a small mall-like dataset if all mirrors fail ---
if df is None:
    print("All mirrors failed; generating a synthetic mall dataset.")
    rng = np.random.default_rng(42)
    n = 200
    df = pd.DataFrame({
        "Gender": rng.choice(["Male","Female"], size=n),
        "Age": rng.integers(18, 65, size=n),
        "Annual Income (k$)": rng.normal(loc=60, scale=20, size=n).clip(10, 150).round(0),
        "Spending Score (1-100)": rng.normal(loc=50, scale=25, size=n).clip(1, 100).round(0),
        "CustomerID": np.arange(1, n+1)
    })
    print("Synthetic shape:", df.shape)

# --- Clustering pipeline (KMeans + silhouette to pick k) ---
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

use_cols = ["Age","Annual Income (k$)","Spending Score (1-100)"]
X = df[use_cols].astype(float).to_numpy()
X = StandardScaler().fit_transform(X)

scores = {}
for k in range(2, 9):
    km = KMeans(n_init=10, random_state=42, n_clusters=k).fit(X)
    scores[k] = silhouette_score(X, km.labels_)
best_k = max(scores, key=scores.get)
print("Silhouette by k:", {k: round(v,4) for k,v in scores.items()}, "=> best_k:", best_k)

kmeans = KMeans(n_init=10, random_state=42, n_clusters=best_k).fit(X)
segmented = df.copy()
segmented["cluster"] = kmeans.labels_
display(segmented.head(10))
print("\nCluster counts:\n", segmented["cluster"].value_counts().sort_index())


All mirrors failed; generating a synthetic mall dataset.
Synthetic shape: (200, 5)
Silhouette by k: {2: 0.2489, 3: 0.242, 4: 0.2632, 5: 0.2596, 6: 0.2491, 7: 0.2529, 8: 0.2555} => best_k: 4


Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100),CustomerID,cluster
0,Male,35,73.0,69.0,1,3
1,Female,60,52.0,56.0,2,2
2,Female,41,60.0,63.0,3,0
3,Male,50,57.0,32.0,4,2
4,Male,39,67.0,46.0,5,1
5,Female,30,88.0,55.0,6,3
6,Male,53,62.0,71.0,7,2
7,Female,63,73.0,40.0,8,2
8,Male,30,19.0,63.0,9,0
9,Male,54,59.0,43.0,10,2



Cluster counts:
 cluster
0    47
1    51
2    61
3    41
Name: count, dtype: int64


C) Anomaly Detection — IsolationForest

In [None]:
import pandas as pd, numpy as np

# --- Try UCI Statlog Shuttle (numeric) ---
urls = {
    "train": "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.trn",
    "test":  "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.tst",
}

df = None
try:
    trn = pd.read_csv(urls["train"], header=None, delim_whitespace=True)
    tst = pd.read_csv(urls["test"],  header=None, delim_whitespace=True)
    df = pd.concat([trn, tst], ignore_index=True)
    # Shuttle has 10 columns: 9 numeric features + 1 class label (we'll ignore the label for unsupervised)
    df.columns = [f"x{i}" for i in range(1,10)] + ["label"]
    X = df.drop(columns=["label"]).astype(float)
    source = "UCI Shuttle"
    print(f"Loaded {source} ->", X.shape)
except Exception as e:
    print("UCI load failed, switching to synthetic fallback. Reason:", repr(e))
    # --- Fallback: synthetic numeric data with outliers ---
    rng = np.random.default_rng(42)
    n, d = 2000, 10
    X_normal = rng.normal(0, 1, size=(int(n*0.97), d))
    X_out = rng.normal(6, 0.5, size=(n - X_normal.shape[0], d))  # separated cluster as anomalies
    X = np.vstack([X_normal, X_out])
    np.random.shuffle(X)
    X = pd.DataFrame(X, columns=[f"x{i}" for i in range(1, d+1)])
    source = "Synthetic"
    print(f"Generated {source} ->", X.shape)

# --- IsolationForest pipeline ---
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

X_std = StandardScaler().fit_transform(X)

iso = IsolationForest(
    n_estimators=300,
    max_samples="auto",
    contamination="auto",
    random_state=42,
    n_jobs=-1
)
pred = iso.fit_predict(X_std)   # -1 anomaly, 1 normal
scores = iso.decision_function(X_std)  # the higher, the more normal

out = pd.DataFrame(X.copy())
out["Anomaly"] = (pred == -1).astype(int)
out["Anomaly_Score"] = scores

print(f"\nSource: {source}")
print("Counts ->", out["Anomaly"].value_counts().rename({0:"normal",1:"anomaly"}).to_dict())
display(out.head(10))


  trn = pd.read_csv(urls["train"], header=None, delim_whitespace=True)


UCI load failed, switching to synthetic fallback. Reason: <HTTPError 404: 'Not Found'>
Generated Synthetic -> (2000, 10)

Source: Synthetic
Counts -> {'normal': 1910, 'anomaly': 90}


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,Anomaly,Anomaly_Score
0,-0.08559,-1.457647,2.325912,0.071196,0.499001,-0.540625,-0.652627,1.912685,-1.916722,0.278647,0,0.031066
1,-0.596089,0.008506,0.794932,0.180364,-0.656055,1.226293,1.579186,0.494557,0.973664,1.24196,0,0.081968
2,1.449812,-2.151563,-0.437027,0.004159,0.463058,-1.727315,-1.743444,1.30679,-0.559119,-1.487406,0,0.027521
3,1.162463,0.704648,-0.960996,1.407759,1.229325,0.599585,0.204348,0.226108,-0.078736,0.640785,0,0.097612
4,1.561187,-0.186758,1.307,0.351931,0.47125,-1.335721,0.937523,-0.334932,0.863092,-0.81417,0,0.080884
5,-0.145376,-1.125955,0.789648,-0.417561,-0.421818,0.282659,-0.128908,0.837477,-0.933423,-1.397387,0,0.101354
6,-0.059283,-0.729287,-0.414473,0.63391,0.002993,0.34021,0.670079,-0.374841,0.756248,0.378843,0,0.127711
7,2.076181,0.308512,-0.584073,0.349457,-0.947674,-0.331201,-0.94094,-1.413274,-2.104985,-1.597657,0,0.030144
8,1.178451,0.199238,2.471809,0.170508,-0.290066,0.004133,0.256492,0.435254,2.076711,-0.84532,0,0.058927
9,0.307866,1.309426,-0.524454,0.188676,-0.099094,0.058479,-0.885199,0.575912,0.417964,1.285224,0,0.105088


D) Association Rules — mlxtend (Apriori/FP-growth)

In [None]:
# One-time tiny deps (fast)
%pip -q install mlxtend==0.23.0 pandas==2.2.2

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# --- Try multiple mirrors of Online Retail-style baskets; fallback to synthetic ---
mirrors = [
    "https://raw.githubusercontent.com/saifullahsajid/OnlineRetailII/master/online_retail_short.csv",
    "https://raw.githubusercontent.com/rishabhmisra/Online-Retail/master/data/online_retail_II.csv",
]
df, last_err = None, None
for u in mirrors:
    try:
        tmp = pd.read_csv(u, low_memory=False)
        if {"InvoiceNo","Description"} <= set(tmp.columns):
            df = tmp[["InvoiceNo","Description"]].dropna().drop_duplicates()
            print("Loaded:", u, "rows:", len(df))
            break
    except Exception as e:
        last_err = e

if df is None:
    # --- Synthetic fallback: 2,000 carts with frequent co-purchase patterns ---
    import numpy as np
    rng = np.random.default_rng(42)
    items = ["Tea","Coffee","Sugar","Milk","Cookies","Chips","Soda","Bread","Butter","Jam"]
    carts = []
    for i in range(2000):
        cart = set(rng.choice(items, size=rng.integers(2,6), replace=False))
        # Inject associations
        if "Tea" in cart: cart.add("Cookies")
        if "Bread" in cart: cart.add("Butter")
        carts.append((i, list(cart)))
    df = pd.DataFrame([(cid, it) for cid, lst in carts for it in lst], columns=["InvoiceNo","Description"])
    print("Using synthetic baskets; rows:", len(df))

# --- Make one-hot basket matrix ---
basket = (df.assign(val=1)
            .pivot_table(index="InvoiceNo", columns="Description", values="val", fill_value=0)
            .astype("uint8"))

# --- Frequent itemsets + rules ---
freq = apriori(basket, min_support=0.02, use_colnames=True)
rules = association_rules(freq, metric="confidence", min_threshold=0.3).sort_values("lift", ascending=False)

print("Frequent itemsets:", freq.shape, "| Rules:", rules.shape)
display(rules.head(15))


Using synthetic baskets; rows: 8015
Frequent itemsets: (250, 2) | Rules: (617, 10)


  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
603,"(Butter, Soda, Cookies)","(Bread, Tea)",0.0985,0.11,0.0365,0.370558,3.368713,0.025665,1.413952,0.779979
609,"(Bread, Tea)","(Butter, Soda, Cookies)",0.11,0.0985,0.0365,0.331818,3.368713,0.025665,1.349184,0.790057
604,"(Soda, Cookies, Bread)","(Butter, Tea)",0.0585,0.189,0.0365,0.623932,3.301226,0.025443,2.156523,0.740395
583,"(Butter, Jam, Cookies)","(Bread, Tea)",0.0935,0.11,0.033,0.352941,3.208556,0.022715,1.375455,0.759331
585,"(Bread, Tea)","(Butter, Jam, Cookies)",0.11,0.0935,0.033,0.3,3.208556,0.022715,1.295,0.773408
599,"(Butter, Soda, Tea)","(Bread, Cookies)",0.0595,0.195,0.0365,0.613445,3.145874,0.024897,2.0825,0.725277
571,"(Coffee, Bread, Tea)","(Butter, Cookies)",0.0325,0.32,0.0325,1.0,3.125,0.0221,inf,0.702842
563,"(Bread, Tea, Chips)","(Butter, Cookies)",0.03,0.32,0.03,1.0,3.125,0.0204,inf,0.701031
390,"(Bread, Tea)","(Butter, Cookies)",0.11,0.32,0.11,1.0,3.125,0.0748,inf,0.764045
389,"(Butter, Cookies)","(Bread, Tea)",0.32,0.11,0.11,0.34375,3.125,0.0748,1.35619,1.0


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


E1) Time Series (Univariate) — Airline Passengers (ARIMA)

In [None]:
%pip -q install pmdarima==2.0.4 pandas==2.2.2

import pandas as pd
from pmdarima import auto_arima

# Mirrors + fallback synthetic seasonal series
mirrors = [
    "https://raw.githubusercontent.com/jbrownlee/Datasets/master/airline-passengers.csv",
    "https://raw.githubusercontent.com/jbrownlee/Datasets/master/airline-passengers.csv?raw=1",
]
ts = None
for u in mirrors:
    try:
        t = pd.read_csv(u, parse_dates=['Month'])
        if {'Month','Passengers'} <= set(t.columns):
            ts = t.rename(columns={'Month':'ds','Passengers':'y'}).set_index('ds')['y'].asfreq('MS')
            print("Loaded:", u, "len:", len(ts))
            break
    except Exception:
        pass

if ts is None:
    import numpy as np
    idx = pd.date_range('1949-01-01', periods=144, freq='MS')
    y = 100 + 0.8*np.arange(len(idx)) + 20*np.sin(2*np.pi*idx.month/12) + np.random.normal(0,5,len(idx))
    ts = pd.Series(y, index=idx, name='y')
    print("Using synthetic airline-like series; len:", len(ts))

m = auto_arima(ts, seasonal=True, m=12, stepwise=True, suppress_warnings=True)
print(m.summary())

# 12-step forecast
fc_idx = pd.date_range(ts.index[-1] + pd.offsets.MonthBegin(), periods=12, freq='MS')
fc = pd.Series(m.predict(n_periods=12), index=fc_idx, name='y_pred')
out = pd.concat([ts.rename('y'), fc]).to_frame()
display(out.tail(24))


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

Loaded: https://raw.githubusercontent.com/jbrownlee/Datasets/master/airline-passengers.csv len: 144
                                      SARIMAX Results                                      
Dep. Variable:                                   y   No. Observations:                  144
Model:             SARIMAX(2, 1, 1)x(0, 1, [], 12)   Log Likelihood                -504.923
Date:                             Mon, 27 Oct 2025   AIC                           1017.847
Time:                                     08:18:46   BIC                           1029.348
Sample:                                 01-01-1949   HQIC                          1022.520
                                      - 12-01-1960                                         
Covariance Type:                               opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1 

  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0,0
1960-01-01,417.0
1960-02-01,391.0
1960-03-01,419.0
1960-04-01,461.0
1960-05-01,472.0
1960-06-01,535.0
1960-07-01,622.0
1960-08-01,606.0
1960-09-01,508.0
1960-10-01,461.0


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


E2) Time Series (Univariate + Exogenous) — toy retail with promos

In [None]:
%pip -q install pmdarima==2.0.4 pandas==2.2.2

import numpy as np, pandas as pd
from pmdarima import auto_arima

# Generate robust toy retail series with two exogenous signals
rng = pd.date_range('2017-01-01', periods=1000, freq='D')
y = 100 + 0.05*np.arange(len(rng)) + 8*np.sin(2*np.pi*rng.dayofyear/365) + np.random.normal(0,2,len(rng))
promo = (rng.dayofweek>=4).astype(int)          # weekends
holiday = ((rng.month==12) & (rng.day<=31)).astype(int)
df = pd.DataFrame({'y':y, 'promo':promo, 'holiday':holiday}, index=rng)

train, test = df.iloc[:-28], df.iloc[-28:]
m = auto_arima(train['y'], X=train[['promo','holiday']], seasonal=True, m=7, stepwise=True, suppress_warnings=True)
print(m.summary())

pred = m.predict(n_periods=len(test), X=test[['promo','holiday']])
res = pd.DataFrame({"y_true": test['y'], "y_pred": pred}, index=test.index)
display(res.head(10))
print("MAPE ~", (np.abs(res['y_true']-res['y_pred'])/res['y_true']).mean())


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                  972
Model:               SARIMAX(1, 1, 2)   Log Likelihood               -2107.534
Date:                Mon, 27 Oct 2025   AIC                           4227.067
Time:                        08:19:57   BIC                           4256.337
Sample:                    01-01-2017   HQIC                          4238.208
                         - 08-30-2019                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
promo          0.0415      0.132      0.315      0.752      -0.216       0.299
holiday        0.1434      0.310      0.462      0.644      -0.465       0.752
ar.L1          0.9927      0.004    252.837      0.0

  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0,y_true,y_pred
2019-08-31,141.230453,141.58533
2019-09-01,144.504918,141.546568
2019-09-02,141.739454,141.466619
2019-09-03,140.504524,141.428422
2019-09-04,141.296843,141.390505
2019-09-05,143.467139,141.352866
2019-09-06,141.729985,141.356973
2019-09-07,138.899423,141.319884
2019-09-08,141.405436,141.283067
2019-09-09,141.336866,141.205049


MAPE ~ 0.012158279414912513


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


In [6]:
# Minimal deps (no GPU requirement)
%pip -q install gradio==4.44.0 xgboost==2.1.1 scikit-learn==1.5.2 pandas==2.2.2

import gradio as gr, pandas as pd, numpy as np, xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# ---- Load + prep ----
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(url)
df['Churn'] = (df['Churn'].str.strip().map({'Yes':1,'No':0})).astype(int)

feat_cols = ['Contract','PaymentMethod','MonthlyCharges','TotalCharges']
X_raw = df[feat_cols].copy()
X_raw['TotalCharges'] = pd.to_numeric(X_raw['TotalCharges'], errors='coerce')
X_raw['TotalCharges'] = X_raw['TotalCharges'].fillna(X_raw['TotalCharges'].median())
y = df['Churn']

# one-hot encode categoricals
X = pd.get_dummies(X_raw, columns=['Contract','PaymentMethod'], drop_first=False)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# ---- Train XGBoost, auto-fallback to CPU ----
common_params = dict(
    n_estimators=300,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.08,
    eval_metric="auc",
)

try:
    # Try CUDA-enabled path (some wheels don’t have it)
    clf = xgb.XGBClassifier(**common_params, tree_method="gpu_hist", predictor="gpu_predictor")
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
except Exception as e:
    print("GPU not available in this XGBoost build; falling back to CPU 'hist'.")
    clf = xgb.XGBClassifier(**common_params, tree_method="hist", predictor="auto")
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
print(f"Test ROC-AUC: {auc:.4f}")

FEATURES = X.columns.tolist()

def preprocess(contract, payment, monthly, total):
    row = pd.DataFrame([{
        "Contract": contract,
        "PaymentMethod": payment,
        "MonthlyCharges": monthly,
        "TotalCharges": total
    }])
    row['TotalCharges'] = pd.to_numeric(row['TotalCharges'], errors='coerce').fillna(X_raw['TotalCharges'].median())
    row = pd.get_dummies(row, columns=['Contract','PaymentMethod'], drop_first=False)
    return row.reindex(columns=FEATURES, fill_value=0)

def predict(contract, payment, monthly, total):
    Xrow = preprocess(contract, payment, monthly, total)
    proba = float(clf.predict_proba(Xrow)[:,1])
    return {"Churn_Probability": round(proba, 4), "Churn_Label": int(proba >= 0.5)}

demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Dropdown(["Month-to-month","One year","Two year"], label="Contract"),
        gr.Dropdown(["Electronic check","Mailed check","Bank transfer (automatic)","Credit card (automatic)"], label="PaymentMethod"),
        gr.Number(label="MonthlyCharges"),
        gr.Number(label="TotalCharges")
    ],
    outputs="json",
    title="Lightweight Churn Predictor (XGBoost)",
    description="Runs with CPU 'hist' if GPU build is unavailable."
)
demo.launch(share=True)


GPU not available in this XGBoost build; falling back to CPU 'hist'.


Parameters: { "predictor" } are not used.

  self, dtrain: DMatrix, dtest: DMatrix, param: Optional[Union[Dict, List]]


Test ROC-AUC: 0.8133
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()


--------


Running on public URL: https://fec0fa603c64d4f6f3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [8]:
# Clean deps (no pmdarima)
%pip -q install gradio==4.44.0 statsmodels==0.14.3 pandas==2.2.2 plotly==5.24.1

import gradio as gr, pandas as pd, numpy as np, plotly.express as px
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings; warnings.filterwarnings("ignore")

# --- Load airline (with fallback synthetic) ---
try:
    t = pd.read_csv(
        "https://raw.githubusercontent.com/jbrownlee/Datasets/master/airline-passengers.csv",
        parse_dates=['Month']
    )
    ts = t.rename(columns={'Month':'ds','Passengers':'y'}).set_index('ds')['y'].asfreq('MS')
    print("Loaded airline dataset:", ts.shape)
except Exception as e:
    print("Remote load failed, using synthetic series. Reason:", repr(e))
    idx = pd.date_range('1949-01-01', periods=144, freq='MS')
    ts = pd.Series(100 + 0.8*np.arange(144) + 20*np.sin(2*np.pi*idx.month/12) + np.random.normal(0,5,144),
                   index=idx, name='y')

# --- Fit a seasonal ARIMA (SARIMAX) model ---
# Reasonable default: (p,d,q)=(1,1,1), seasonal (P,D,Q, m)=(1,1,1,12)
model = SARIMAX(ts, order=(1,1,1), seasonal_order=(1,1,1,12),
                enforce_stationarity=False, enforce_invertibility=False)
res = model.fit(disp=False)

def forecast(h):
    """Return a Plotly figure with history + h-step forecast."""
    h = int(h)
    fc_res = res.get_forecast(steps=h)
    fc_idx = pd.date_range(ts.index[-1] + pd.offsets.MonthBegin(), periods=h, freq='MS')
    y_pred = pd.Series(fc_res.predicted_mean.values, index=fc_idx, name='y_pred')

    df = pd.DataFrame({"y": ts}).join(y_pred.to_frame(), how="outer")
    fig = px.line(df.reset_index().rename(columns={'index':'date'}),
                  x='date', y=['y','y_pred'], title=f"{h}-month Forecast")
    return fig

demo = gr.Interface(
    fn=forecast,
    inputs=gr.Slider(6, 36, value=12, step=1, label="Horizon (months)"),
    outputs=gr.Plot(),
    title="Airline Forecast (SARIMAX, statsmodels)",
    description="Seasonal ARIMA (1,1,1)x(1,1,1,12). No pmdarima needed."
)
demo.launch(share=True)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25hLoaded airline dataset: (144,)
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://ffc02532d47e642d37.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


