In [None]:
# Random Forest Reggression

In [9]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score


In [10]:
# show Dataset

In [11]:
FILE_PATH = "./Turn_ratio.xlsx"

In [12]:
# Some important data related to PCU

In [13]:
PCU_FACTORS = {
"L": 1.5, # Left turn
"T": 1.0, # Through
"R": 1.0 # Right turn
}


SAT_PER_LANE = 1800 # saturation flow in PCU/hr/lane
DEFAULT_LANES = 2 # assumed lanes per approach
DEFAULT_LOST_TIME = 12 # seconds lost per cycle (yellow + all-red + startup)

In [7]:
# dataset 

In [8]:
pip install openpyxl

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
data = pd.read_excel(FILE_PATH)

In [15]:
rename_map = {
    "Northbound Left": "NBL",
    "Northbound Thru": "NBT",
    "Northbound Right": "NBR",
    "Southbound Left": "SBL",
    "Southbound Thru": "SBT",
    "Southbound Right": "SBR",
    "Eastbound Left": "EBL",
    "Eastbound Thru": "EBT",
    "Eastbound Right": "EBR",
    "Westbound Left": "WBL",
    "Westbound Thru": "WBT",
    "Westbound Right": "WBR",
}

data = data.rename(columns=rename_map)


In [16]:
def compute_pcus(row):
    pcu_N = row["NB_L"] + row["NB_T"] + row["NB_R"]
    pcu_S = row["SB_L"] + row["SB_T"] + row["SB_R"]
    pcu_E = row["EB_L"] + row["EB_T"] + row["EB_R"]
    pcu_W = row["WB_L"] + row["WB_T"] + row["WB_R"]

    return pd.Series({
        "pcu_N": pcu_N,
        "pcu_S": pcu_S,
        "pcu_E": pcu_E,
        "pcu_W": pcu_W,
        "pcu_NS": pcu_N + pcu_S,  # North–South combined
        "pcu_EW": pcu_E + pcu_W   # East–West combined
    })

pcu_data = data.apply(compute_pcus, axis=1)
data = pd.concat([data, pcu_data], axis=1)


In [17]:
print(data.columns.tolist())

['Int#', 'NB_L', 'NB_T', 'NB_R', 'SB_L', 'SB_T', 'SB_R', 'EB_L', 'EB_T', 'EB_R', 'WB_L', 'WB_T', 'WB_R', 'pcu_N', 'pcu_S', 'pcu_E', 'pcu_W', 'pcu_NS', 'pcu_EW']


In [18]:
# websters formula

In [19]:
print(data.columns.tolist())

['Int#', 'NB_L', 'NB_T', 'NB_R', 'SB_L', 'SB_T', 'SB_R', 'EB_L', 'EB_T', 'EB_R', 'WB_L', 'WB_T', 'WB_R', 'pcu_N', 'pcu_S', 'pcu_E', 'pcu_W', 'pcu_NS', 'pcu_EW']


In [20]:
def webster_cycle_and_splits(row):
    y_NS = row["pcu_NS"] / (DEFAULT_LANES * SAT_PER_LANE)
    y_EW = row["pcu_EW"] / (DEFAULT_LANES * SAT_PER_LANE)
    Y = y_NS + y_EW

    if Y >= 1:  # oversaturated, formula invalid
        return pd.Series({
            "cycle_length": None,
            "green_NS": None,
            "green_EW": None
        })

    # Webster’s optimal cycle length
    C = (1.5 * DEFAULT_LOST_TIME + 5) / (1 - Y)

    # Green time splits proportional to flow ratios
    green_NS = (y_NS / Y) * (C - DEFAULT_LOST_TIME)
    green_EW = (y_EW / Y) * (C - DEFAULT_LOST_TIME)

    return pd.Series({
        "cycle_length": C,
        "green_NS": green_NS,
        "green_EW": green_EW
    })

webster_results = data.apply(webster_cycle_and_splits, axis=1)
data = pd.concat([data, webster_results], axis=1)

# Drop invalid rows (Y >= 1)
data = data.dropna()

print("\nSample results with Webster timings:")
print(data[["pcu_NS", "pcu_EW", "cycle_length", "green_NS", "green_EW"]].head())



Sample results with Webster timings:
   pcu_NS  pcu_EW  cycle_length   green_NS   green_EW
0     401     601     31.870670   7.952234  11.918436
1     195     998     34.399668   3.661304  18.738364
2     941     914     47.449857  17.982919  17.466937
3     291    1365     42.592593   5.375872  25.216720
4    1652    1206    111.590296  57.565840  42.024457


In [21]:
# training the model

In [22]:
X = data[["pcu_NS", "pcu_EW"]]   # Features
y = data["cycle_length"]         # Target = Webster's cycle length

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [23]:
# predict the optimised timing

In [24]:
preds = model.predict(X_test)


# Evaluate
mse = mean_squared_error(y_test, preds)
print("Random Forest MSE:", mse)

print("ML Model Performance:")
print("R2 Score:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


Random Forest MSE: 5195.2019240198615
ML Model Performance:
R2 Score: 0.5610966943457762
RMSE: 72.07774916033284


In [25]:
# testing on some example

In [28]:
example = pd.DataFrame({"pcu_NS": [900], "pcu_EW": [700]})
print("\nPredicted cycle length for new input:")
print(model.predict(example))


Predicted cycle length for new input:
[25.61416683]
