In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv("data/processed/most_crowded.csv")

## 1. Feature Extraction

1.1 Binning<br>
1.2. Lagged Features<br>
1.3. Rolling Mean Features<br>


##### Binning


In [3]:
mean_crowd = df["crowd"].mean()
std_crowd = df["crowd"].std()

# Define bins based on mean and standard deviation
bins = [
    float("-inf"),
    mean_crowd - 1.0 * std_crowd,
    mean_crowd - 0.5 * std_crowd,
    mean_crowd + 0.5 * std_crowd,
    mean_crowd + 1.0 * std_crowd,
    float("inf"),
]

# Define labels for the bins
labels = list(range(len(bins) - 1))

In [4]:
for i in range(len(bins) - 1):
    print(
        f"No. of values in bin {i + 1}: {df['crowd'].between(bins[i], bins[i + 1]).sum()}"
    )

No. of values in bin 1: 124
No. of values in bin 2: 178
No. of values in bin 3: 362
No. of values in bin 4: 120
No. of values in bin 5: 135


In [5]:
df["c_lvl"] = pd.cut(
    df["crowd"],
    bins=bins,
    labels=labels,
    include_lowest=True,
    ordered=True,
)

In [6]:
df["hour"] = pd.to_datetime(df["timestamp"], unit="s").dt.hour

#### Lagged and Rolling Mean Features


In [7]:
lagged_df = pd.concat(
    [
        df["timestamp"],
        df["hour"],
        df["crowd"],
        df["c_lvl"],
        *[df["c_lvl"].shift(i).rename(f"lag{i}") for i in range(1, 6)],
        df["c_lvl"].shift(10).rename("lag_10"),
        df["crowd"].rolling(5).mean().rename("crowd_mean_5"),
        df["crowd"].rolling(10).mean().rename("crowd_mean_10"),
        df["crowd"].rolling(30).mean().rename("crowd_mean_30"),
    ],
    axis="columns",
)

In [8]:
lagged_df = lagged_df.set_index("timestamp").dropna()
lagged_df

Unnamed: 0_level_0,hour,crowd,c_lvl,lag1,lag2,lag3,lag4,lag5,lag_10,crowd_mean_5,crowd_mean_10,crowd_mean_30
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1.713177e+09,10,197,2,3,4,4,4,3,4,221.2,211.8,199.933333
1.713177e+09,10,196,2,2,3,4,4,4,4,217.0,209.9,198.233333
1.713177e+09,10,189,2,2,2,3,4,4,2,207.8,210.9,197.000000
1.713177e+09,10,206,3,2,2,2,3,4,3,199.8,211.0,196.833333
1.713177e+09,10,205,3,3,2,2,2,3,3,198.6,210.6,197.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
1.713257e+09,8,226,4,4,3,4,4,4,4,218.6,218.8,219.166667
1.713257e+09,8,260,4,4,4,3,4,4,4,227.8,220.9,221.233333
1.713257e+09,8,251,4,4,4,4,3,4,4,234.8,223.1,222.600000
1.713257e+09,8,233,4,4,4,4,4,3,4,241.0,225.0,223.266667


## 4. Model Building

1. Time Series Cross Validation
2. Pipeline Construction


In [9]:
X = lagged_df.drop(columns=["c_lvl"])
y = lagged_df["c_lvl"]

In [10]:
cat_features = X.select_dtypes(include="category").columns.tolist()
num_features = X.select_dtypes(include="int").columns.tolist()

### Under Construction

1. Pipeline Construction
2. Time Series Cross Validation


In [11]:
feature_cats = [sorted(X[feature].unique().tolist()) for feature in cat_features]
target_cats = sorted(y.unique().tolist())

In [12]:
preprocessor = ColumnTransformer(
    [
        ("num", MinMaxScaler(), num_features),
    ],
    remainder="passthrough",
    verbose=False,
)

pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "estimator",
            XGBClassifier(
                objective="multi:softmax",
                tree_method="hist",
                max_depth=2,
                n_estimators=50,
                learning_rate=0.01,
                reg_lambda=0.5,
                n_jobs=-1,
            ),
        ),
    ],
    verbose=False,
)

pipeline

In [13]:
param_grid = {
    "estimator__n_estimators": [50, 100, 200],
    "estimator__max_depth": [2, 3, 5],
    "estimator__learning_rate": [0.01, 0.05, 0.1],
    "estimator__subsample": [0.5, 0.7, 0.8],
    "estimator__colsample_bytree": [0.5, 0.7, 0.8],
}

In [14]:
ts_cv = TimeSeriesSplit(n_splits=5)

##### Grid Search


In [21]:
scoring = {
    "Accuracy": make_scorer(accuracy_score),
    "F1_micro": make_scorer(f1_score, average="micro"),
}

In [16]:
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=ts_cv,
    scoring=scoring,
    refit="f1_micro",
    n_jobs=1,
    verbose=1,
)
grid_search.fit(X, y)

model = grid_search.best_estimator_

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [17]:
print("Best parameters:", grid_search.best_params_)
print("Best CV scores:")
results = grid_search.cv_results_
for scorer in scoring:
    print(f"{scorer}: {np.mean(results[f'mean_test_{scorer}'])}")

Best parameters: {'estimator__colsample_bytree': 0.5, 'estimator__learning_rate': 0.01, 'estimator__max_depth': 2, 'estimator__n_estimators': 100, 'estimator__subsample': 0.5}
Best CV scores:
accuracy: 0.9945834723612501
f1_micro: 0.9945834723612501


In [19]:
cv_results = cross_validate(
    pipeline,
    X,
    y,
    cv=ts_cv,
    scoring=scoring,
    n_jobs=1,
)

In [26]:
for key, value in grid_search.cv_results_.items():
    if key.startswith("mean_test_"):
        metric = key.split("mean_test_")[1]
        print(f"{metric.title()}: {value.mean() * 100:.2f}%")

Accuracy: 99.46%
F1_Micro: 99.46%
