In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn.preprocessing as skl_pre
import sklearn.linear_model as skl_lm
import sklearn.discriminant_analysis as skl_da
import sklearn.neighbors as skl_nb
import sklearn.model_selection as skl_ms

# Read the training set. The ID column is parsed as text, and the row index is
# replaced by a fresh default index (the previous one is discarded).
data = pd.read_csv('training_data_VT2026.csv', dtype={'ID': str})
data = data.reset_index(drop=True)

In [4]:
# Define custom features

# Encode the target as 0/1. The mapping is applied only while the column still
# holds the original text labels: mapping an already-encoded column would turn
# every value into NaN, which made this cell unsafe to run twice.
if not pd.api.types.is_numeric_dtype(data["increase_stock"]):
    data["increase_stock"] = data["increase_stock"].map({
        "low_bike_demand": 0,
        "high_bike_demand": 1
    })

# 1 for hours 6..21 (both ends included), 0 otherwise.
data["day"] = data["hour_of_day"].between(6, 21).astype(int)

# 1 when the recorded snow depth is positive, 0 otherwise.
data["snow"] = (data['snowdepth'] > 0).astype(int)

# Number of favourable weather conditions that hold for the row (0-6).
# The last term counts the ABSENCE of snow; the previous version added
# data["snow"] itself, which rewarded snow on the ground as "good weather".
data["good_weather"] = (
        (data["humidity"] <= 60).astype(int) +
        (data["precip"] == 0).astype(int) +
        (data["windspeed"] <= 20).astype(int) +
        (data["cloudcover"] <= 50).astype(int) +
        (data["visibility"] >= 10).astype(int) +
        (data["snowdepth"] == 0).astype(int)
    )

# Interaction term: grows with temperature and with dryness (100 - humidity).
data["dry_warm_index"] = (
        data['temp'] * (100 - data["humidity"])
    )

In [6]:
# Split into training and testing data

# Fixed seed so that the split, and every statistic or model fitted on it,
# is identical after Restart Kernel -> Run All.
SEED = 42
rng = np.random.default_rng(SEED)

N = len(data)

# Half of the rows (rounded down) go to the test set
M = N // 2

# Draw M distinct row positions for the test set (sampling without replacement)
test_indices = rng.choice(N, size=M, replace=False)

# Get the actual index labels of the DataFrame
all_indices = data.index

# Create a boolean mask for the test rows. Returns a boolean array of the
# same shape as all_indices that is True where an element of all_indices
# is in all_indices[test_indices] and False otherwise.
test_mask = np.isin(all_indices, all_indices[test_indices])

# Select the train and test dataframes using the boolean mask. The explicit
# .copy() makes them independent of `data`, so columns added to them later are
# written to the frames themselves and not to a temporary view of `data`.
test = data[test_mask].copy()
train = data[~test_mask].copy()

In [7]:
# Calculate rush hour and rush hour Gaussian

# Half-width of the rush-hour window, in standard deviations of the hour
# distribution of high-demand rows.
GLOBAL_RUSH_WIDTH = 1.45    # window built from all training rows
PER_DAY_RUSH_WIDTH = 1.5    # window built separately for each day of the week


def _rush_hour_features(hours, mean, std, width):
    """Return the rush-hour indicator and Gaussian score for a Series of hours.

    Parameters
    ----------
    hours : pandas Series of hour-of-day values.
    mean, std : centre and spread of the rush-hour period.
    width : half-width of the indicator window, in multiples of ``std``.

    Returns
    -------
    (indicator, gaussian) : two Series indexed like ``hours``.
        ``indicator`` is 1 where mean - width*std <= hour <= mean + width*std
        and 0 elsewhere; ``gaussian`` is exp(-(hour - mean)**2 / (2*std**2)).
    """
    lower = mean - width * std
    upper = mean + width * std
    indicator = ((hours >= lower) & (hours <= upper)).astype(int)
    gaussian = np.exp(-(hours - mean) ** 2 / (2 * std ** 2))
    return indicator, gaussian


# Work on independent copies so the column assignments below are written to
# train/test themselves, whether or not the split cell already copied them.
train = train.copy()
test = test.copy()

# Important: mean_time and std_time are estimated from the training data only,
# and the condition is built from `train` itself (the previous version indexed
# `train` with a mask taken from the full `data` frame).
high_demand_hours = train.loc[train["increase_stock"] == 1, "hour_of_day"]
mean_time = high_demand_hours.mean()
std_time = high_demand_hours.std()

# Global definition: one window / one Gaussian shared by all days.
for frame in (train, test):
    indicator, gaussian = _rush_hour_features(
        frame["hour_of_day"], mean_time, std_time, GLOBAL_RUSH_WIDTH
    )
    frame["rush_hour"] = indicator
    frame["rush_hour_gaussian"] = gaussian

# rush_hour_gaussian expresses rush hour as a numerical feature instead of a
# categorical one. It gives a smoother scale and gave better results, at least
# with logistic regression.
# An alternative is to define rush hour separately for each day of the week.
# In principle this is more accurate, but it risks overfitting when little data
# is available for a day.
# Which variant to use should be decided jointly by testing which one works
# best across all models.
for d in range(7):
    high_demand_hours_this_day = train.loc[
        (train["increase_stock"] == 1) & (train["day_of_week"] == d), "hour_of_day"
    ]
    mean_time_this_day = high_demand_hours_this_day.mean()
    std_time_this_day = high_demand_hours_this_day.std()
    print(f"Rush hour at day {d}: {mean_time_this_day}")

    # With fewer than two high-demand rows (std is NaN) or no spread (std == 0)
    # the per-day window is undefined; keep the global values for that day.
    if not std_time_this_day > 0:
        continue

    for frame in (train, test):
        # The mask comes from the frame being updated, and the features are
        # computed only for that day's rows. The previous version combined the
        # day mask as `mask & a | b`, which parses as `(mask & a) | b` and reset
        # rush_hour to 0 for late-hour rows of every other day as well.
        day_mask = frame["day_of_week"] == d
        indicator, gaussian = _rush_hour_features(
            frame.loc[day_mask, "hour_of_day"],
            mean_time_this_day,
            std_time_this_day,
            PER_DAY_RUSH_WIDTH,
        )
        frame.loc[day_mask, "rush_hour"] = indicator
        frame.loc[day_mask, "rush_hour_gaussian"] = gaussian


Rush hour at day 0: 15.384615384615385
Rush hour at day 1: 15.846153846153847
Rush hour at day 2: 15.31578947368421
Rush hour at day 3: 15.11111111111111
Rush hour at day 4: 14.7
Rush hour at day 5: 14.18918918918919
Rush hour at day 6: 14.647058823529411


In [16]:
# Choose which features to use

# Candidate features, grouped by how the preprocessing treats them.
numerical_features = [
    'temp', 'humidity', 'precip', 'snowdepth', 'windspeed',
    'cloudcover', 'visibility', 'good_weather', 'dry_warm_index',
    'rush_hour_gaussian',
]
cat_features = [
    'hour_of_day', 'day_of_week', 'month', 'holiday',
    'weekday', 'summertime', 'day', 'rush_hour',
]

# Correlation of each numerical candidate with the target on the training
# split, ordered from the largest to the smallest absolute value.
corr_numerical = (
    train[[*numerical_features, "increase_stock"]]
    .corr()["increase_stock"]
    .drop(index="increase_stock")
    .sort_values(key=lambda s: s.abs(), ascending=False)
)

# Same ranking for the categorical candidates.
corr_cat = (
    train[[*cat_features, "increase_stock"]]
    .corr()["increase_stock"]
    .drop(index="increase_stock")
    .sort_values(key=lambda s: s.abs(), ascending=False)
)

# Replace the candidate lists by their ranked versions and show them.
numerical_features = [str(name) for name in corr_numerical.index]
cat_features = [str(name) for name in corr_cat.index]

print(numerical_features)
print(cat_features)

# Hand-picked subsets used by the models below.
best_numerical_features = ['dry_warm_index', 'rush_hour_gaussian'] # k=2
best_cat_features = ['day', 'hour_of_day', 'summertime', 'weekday', 'day_of_week', 'month'] # k=6

selected_columns = best_numerical_features + best_cat_features

X_train, y_train = train[selected_columns], train['increase_stock']
X_test, y_test = test[selected_columns], test['increase_stock']

['dry_warm_index', 'rush_hour_gaussian', 'temp', 'humidity', 'good_weather', 'windspeed', 'visibility', 'precip', 'snowdepth', 'cloudcover']
['day', 'rush_hour', 'hour_of_day', 'summertime', 'weekday', 'day_of_week', 'holiday', 'month']


In [9]:
# Log-reg

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import FunctionTransformer


# Numerical columns: fill missing values with the column median, then standardise.
# (A SelectKBest(f_classif) step was tried here and is currently disabled.)
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories not seen during fit are ignored at predict time.
# (A SelectKBest(f_classif) step was tried here and is currently disabled.)
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each selected column group through its own pipeline.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, best_numerical_features),
    ('cat', cat_pipeline, best_cat_features),
])

# Team note on the penalty: use l1 when optimising accuracy, l2 when optimising ROC-AUC.
model_lr = Pipeline([
    ('preprocess', preprocessor),
    ('clf', LogisticRegression(solver='liblinear', penalty='l1', C=1.0, max_iter=1000)),
])

model_lr.fit(X_train, y_train)

# Evaluate on the held-out split: accuracy and misclassification rate.
preds = model_lr.predict(X_test)
score = model_lr.score(X_test, y_test)
error = (preds != y_test).mean()

print("Test accuracy:", score)
print("Test error:", error)


NameError: name 'best_numerical_features' is not defined

In [None]:
# Decision tree

In [None]:
#LDA & QDA

In [None]:
# Boosting