In this notebook, I will compare between difference forcasting models, starting from simple linear to arima models.

In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [None]:
from statsforecast import StatsForecast
from statsforecast.models import AutoRegressive

In [None]:
from statsmodels.tsa.ar_model import AutoReg

In [None]:
activities = pd.read_parquet('../data-raw/activities.parquet')

In [None]:
activities

# Naive model

We forecast involves using the previous observations (same date, same time) to predict the next time step (one week for train and w=one for test)

In [None]:
activities_by_time = (
    # extract data just for 2 weeks
    activities[(activities['date'] >= '2023-02-01 00:00:00+00:00') & (activities['date'] < '2023-02-15 00:00:00+00:00')]
    .assign(datetime=activities['date'].dt.strftime('%Y-%m-%d %H:00:00'))
    .groupby(['contributor', 'category', 'datetime'])
    .activity.count()
    .unstack(fill_value=0).stack()
    .reset_index(name='n_activities')
)

In [None]:
activities[(activities['date'] >= '2022-02-01 00:00:00+00:00') & (activities['date'] < '2022-02-15 00:00:00+00:00')]

In [None]:
activities_by_time.head(48)

In [None]:
train_activities = activities_by_time[activities_by_time['datetime'] < '2023-02-08 00:00:00'].reset_index(drop=True)
test_activities = activities_by_time[activities_by_time['datetime'] >= '2023-02-08 00:00:00'].reset_index(drop=True)

In [None]:
len(train_activities), len(test_activities)

In [None]:
test_activities['p_activities'] = train_activities['n_activities']

In [None]:
test_activities.sample(10)

In [None]:
def calculate_metrics(actual, predicted):
    r2 = r2_score(actual, predicted)
    mae = mean_absolute_error(actual, predicted) 
    mse = mean_squared_error(actual, predicted)
    sum_n_activities = actual.sum()
    return pd.Series({'r2': r2, 'mae': mae, 'mse': mse, 'n_activities':sum_n_activities})

# Group by contributor and calculate metrics
result = test_activities.groupby(['contributor', 'category']).apply(lambda x: calculate_metrics(x['n_activities'], x['p_activities'])).reset_index()

In [None]:
result.sort_values(by='r2', ascending=False).head(40)

In [None]:
result.to_csv('../eval/naive_model_metrics.csv', index=False)

# Regression Model

In [None]:
activities_by_time = (
    activities
    .assign(datetime=activities['date'].dt.strftime('%Y-%m-%d %H:00:00'))
    .groupby(['contributor', 'category', 'datetime'])
    .activity.count()
    .unstack(fill_value=0).stack()
    .reset_index(name='n_activities')
)

In [None]:
activities_by_time

In [None]:
#  create lag features
def create_lag_features(group):
    n_previous_times = 168
    for i in range(1, n_previous_times + 1):
        group[f'n_activities_lag_{i}'] = group['n_activities'].shift(i)
    return group.iloc[n_previous_times:].reset_index(drop=True)

In [None]:
# apply the function to each group
laged_activities = activities_by_time.groupby(['contributor', 'category']).apply(create_lag_features).reset_index(drop=True)

In [None]:
laged_activities[['contributor','category','datetime','n_activities','n_activities_lag_1','n_activities_lag_2','n_activities_lag_3']].head(20)

In [None]:
# Function to evaluate the model and return metrics for each contributor
def evaluate_metrics(group_data):
    train_data = group_data[group_data['datetime'] < '2023-04-01 00:00:00'].reset_index(drop=True)
    test_data = group_data[group_data['datetime'] >= '2023-04-01 00:00:00'].reset_index(drop=True)

    # Separate features and target variable
    X_train = train_data.drop(['contributor', 'category', 'datetime', 'n_activities'], axis=1)
    y_train = train_data['n_activities']

    X_test = test_data.drop(['contributor', 'category', 'datetime', 'n_activities'], axis=1)
    y_test = test_data['n_activities']

    # Train the time series forecasting model with multiple linear regression
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions on the test set
    predictions = model.predict(X_test)

    # Evaluate the model
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)

    # Create a DataFrame for evaluation metrics and sum of activities
    evaluation_metrics = pd.Series({
        'contributor': group_data['contributor'].iloc[0],  # Use the first value since it's the same for the group
        'category': group_data['category'].iloc[0],  # Use the first value since it's the same for the group
        'r2': r2,
        'mae': mae,
        'mse': mse,
        'n_activities': test_data['n_activities'].sum()
    })

    return evaluation_metrics


In [None]:
# Apply the function to each group and concatenate the results
result = laged_activities.groupby(['contributor', 'category']).apply(evaluate_metrics).reset_index(drop=True)

In [None]:
result.sort_values(by='r2', ascending=False).head(40)

In [None]:
result.to_csv('../eval/reg_model_metrics.csv', index=False)

# Autoregression Model

In [None]:
activities_by_time = (
    activities
    .assign(datetime=activities['date'].dt.strftime('%Y-%m-%d %H:00:00'))
    .groupby(['contributor', 'category', 'datetime'])
    .activity.count()
    .unstack(fill_value=0).stack()
    .reset_index(name='n_activities')
)

In [None]:
activities_by_time.head(15)

In [None]:
# Get the data of the top contributor 'sourcegraph-bot, codeclimate[bot]' to test time series decomposition method
temp = (
    activities_by_time.groupby(['contributor', 'category']).get_group(('codeclimate[bot]','bot'))
    .drop(['contributor', 'category'], axis=1)
    .reset_index(drop=True)
    .fillna(0)
)
temp["datetime"] = pd.to_datetime(temp["datetime"])

# Statsforecast specifications
temp["unique_id"]="1"
temp.columns=["ds", "y", "unique_id"]
#temp.tail(10)

temp

In [None]:
train_data = temp[temp['ds'] < '2023-04-01 00:00:00'].reset_index(drop=True)
test_data = temp[temp['ds'] >= '2023-04-01 00:00:00'].reset_index(drop=True)

In [None]:
sf = StatsForecast(df=train_data,
                   models=[AutoRegressive(lags=[168], include_mean=True)],
                   freq='H', 
                   n_jobs=-1)

In [None]:
predictions = sf.forecast(h=len(test_data), level=[95])

In [None]:
predictions

# Autoregression model (statsmodels)

In [106]:
activities_by_time = (
    activities
    .assign(datetime=activities['date'].dt.strftime('%Y-%m-%d %H:00:00'))
    .groupby(['contributor', 'category', 'datetime'])
    .activity.count()
    .unstack(fill_value=0).stack()
    .reset_index(name='n_activities')
)

In [122]:
# Get the data of the top contributor 'sourcegraph-bot' to test time series decomposition method
temp = (
    activities_by_time.groupby(['contributor', 'category']).get_group(('codeclimate[bot]','bot'))
    .drop(['contributor', 'category'], axis=1)
    .reset_index(drop=True)
    .set_index('datetime', drop=True)
    .fillna(0)
)
temp.index.name = None

train = temp.loc[:'2023-03-31 23:00:00']
test = temp.loc['2023-04-01 00:00:00':]

model = AutoReg(train['n_activities'], lags=24*7, seasonal=True, period=24*7)
result = model.fit()

predictions = result.predict(start=len(train), end=len(train) + len(test) - 1)

# Evaluate the model on the test set
actual_values = test['n_activities']
mae = mean_absolute_error(actual_values, predictions)
mse = mean_squared_error(actual_values, predictions)
r2 = r2_score(actual_values, predictions)

# Display the results
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R2: {r2}')

  self._init_dates(dates, freq)


MAE: 6.000939191936709
MSE: 59.76924752042606
R2: 0.36775160785070204


  fcast_index = self._extend_index(index, steps, forecast_index)
  fcast_index = self._extend_index(index, steps, forecast_index)


In [None]:
# train, predict and evaluate
def auro_reg_model(group_data):
    train_data = group_data[group_data['datetime'] < '2023-04-01 00:00:00'].reset_index(drop=True)
    test_data = group_data[group_data['datetime'] >= '2023-04-01 00:00:00'].reset_index(drop=True)

    # Separate features and target variable
    X_train = train_data.drop(['contributor', 'category', 'datetime', 'n_activities'], axis=1)
    y_train = train_data['n_activities']

    X_test = test_data.drop(['contributor', 'category', 'datetime', 'n_activities'], axis=1)
    y_test = test_data['n_activities']

    # Train the time series forecasting model with multiple linear regression
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions on the test set
    predictions = model.predict(X_test)

    # Evaluate the model
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)

    # Create a DataFrame for evaluation metrics and sum of activities
    evaluation_metrics = pd.Series({
        'contributor': group_data['contributor'].iloc[0],  # Use the first value since it's the same for the group
        'category': group_data['category'].iloc[0],  # Use the first value since it's the same for the group
        'r2': r2,
        'mae': mae,
        'mse': mse,
        'n_activities': test_data['n_activities'].sum()
    })

    return evaluation_metrics

In [None]:
# Apply the function to each group and concatenate the results
result = laged_activities.groupby(['contributor', 'category']).apply(auro_reg_model).reset_index(drop=True)