# Libraries and data importing

Importing packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.statespace.structural import UnobservedComponents
from statsmodels.tsa.exponential_smoothing.ets import ETSModel

In [3]:
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

Importing data

In [4]:
# Load the raw activity log. The preview below shows the columns
# contributor, category, repository, activity and date.
activities = pd.read_parquet('../data-raw/activities.parquet')
activities

Unnamed: 0,contributor,category,repository,activity,date
0,AmplabJenkins,bot,apache/spark,Commenting pull request,2022-11-25 09:55:19+00:00
1,AmplabJenkins,bot,apache/spark,Commenting pull request,2022-11-25 09:55:23+00:00
2,AmplabJenkins,bot,apache/spark,Commenting pull request,2022-11-25 09:55:26+00:00
3,analysis-bot,bot,facebook/react-native,Commenting pull request,2022-11-25 09:55:27+00:00
4,neos-bot,bot,neos/neos-ui-compiled,Pushing commits,2022-11-25 09:55:47+00:00
...,...,...,...,...,...
1015418,798388,human,879434,Reviewing code,2023-04-15 16:06:15+00:00
1015419,798388,human,879434,Reviewing code,2023-04-15 16:07:26+00:00
1015420,784775,human,643744,Creating branch,2023-04-15 16:07:33+00:00
1015421,784775,human,888378,Opening pull request,2023-04-15 16:08:07+00:00


Keep only the last 600 events of each contributor, and only the contributors that have at least 600 events

In [5]:
# Number of most-recent events kept per contributor; contributors with fewer
# events than this are dropped entirely.
MAX_EVENTS = 600

data = (
    activities
    # keep the last MAX_EVENTS events for each contributor
    # (assumes `activities` is already ordered by date -- TODO confirm)
    .groupby('contributor')
    .tail(MAX_EVENTS)
    # keep only the contributors who have at least MAX_EVENTS events
    .groupby('contributor')
    .filter(lambda events: len(events) >= MAX_EVENTS)
)
data

Unnamed: 0,contributor,category,repository,activity,date
365,nodebb-misty,bot,julianlam/nodebb-plugin-email-helper,Closing pull request,2022-11-25 11:08:46+00:00
409,nodebb-misty,bot,NodeBB/nodebb-plugin-topic-redirect,Closing pull request,2022-11-25 11:20:10+00:00
417,nodebb-misty,bot,NodeBB/nodebb-plugin-write-api,Closing pull request,2022-11-25 11:21:23+00:00
523,nodebb-misty,bot,julianlam/nodebb-plugin-sso-oauth,Closing pull request,2022-11-25 11:47:30+00:00
5518,jenkins-x-bot-test,bot,jenkins-x-charts/jxboot-helmfile-resources,Creating branch,2022-11-26 19:57:10+00:00
...,...,...,...,...,...
1015418,798388,human,879434,Reviewing code,2023-04-15 16:06:15+00:00
1015419,798388,human,879434,Reviewing code,2023-04-15 16:07:26+00:00
1015420,784775,human,643744,Creating branch,2023-04-15 16:07:33+00:00
1015421,784775,human,888378,Opening pull request,2023-04-15 16:08:07+00:00


In [6]:
def gap_activities(train, test):
    """Prepend zero-activity hours to ``test`` so that it starts right after ``train``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Hourly activity counts with the columns ``category``, ``date``,
        ``contributor`` and ``n_activities``. The last row of ``train`` and the
        first row of ``test`` are used as the boundaries, so both frames must
        contain at least one row.

    Returns
    -------
    pandas.DataFrame
        ``test`` itself when no full hour is missing between the two frames;
        otherwise a new frame (with a fresh 0..n-1 index) in which every
        missing hour is added in front of ``test`` with ``n_activities = 0``.
    """
    one_hour = pd.Timedelta(hours=1)

    # first and last hour covered by neither the train nor the test data
    start_time = train['date'].iloc[-1] + one_hour
    end_time = test['date'].iloc[0] - one_hour

    # check if there is a time gap between the train and test data
    if end_time >= start_time:

        # fill the gap with a date range and zeros for n_activities;
        # an offset object is used instead of the 'H' alias, which is
        # deprecated since pandas 2.2
        gap_data = pd.DataFrame({
            'category': train['category'].iloc[0],
            'date': pd.date_range(start=start_time, end=end_time, freq=pd.offsets.Hour()),
            'contributor': train['contributor'].iloc[0],
            'n_activities': 0
        })

        test = pd.concat([gap_data, test]).reset_index(drop=True)

    return test

In [7]:
def _hourly_activity_counts(events):
    """Count the events in ``events`` per (category, hour, contributor)."""
    return (
        events
        .groupby(['category', pd.Grouper(key='date', freq=pd.offsets.Hour()), 'contributor'])['activity']
        .count()
        .reset_index(name='n_activities')
    )


def _fill_missing_hours(counts):
    """Put hourly counts on a gap-free hourly index, with 0 activities for empty hours."""
    return (
        counts
        .set_index('date')
        .resample(pd.offsets.Hour())
        .sum()
        .rename_axis(None)
        # replace the 0 placeholders in the label columns and carry the
        # previous label forward instead
        .replace({'category': 0, 'contributor': 0}, None)
        .ffill()
    )


def split_activities(contributor, n_train=300):
    """Split one contributor's events into hourly train and test series.

    Parameters
    ----------
    contributor : pandas.DataFrame
        Events of a single contributor with (at least) the columns
        ``category``, ``date``, ``contributor`` and ``activity``.
    n_train : int, default 300
        Number of leading events used for training; the remaining events form
        the test set. With 600 events per contributor the default is a
        time-based split with split size = 0.5.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Hourly frames indexed by time with an ``n_activities`` column. ``train``
        is limited to its last 3 months; ``test`` is padded by
        ``gap_activities`` so it starts right after ``train``.
    """
    # positional row slices (equivalent to the previous column-wise
    # `apply(lambda x: x[:300])`, without rebuilding the frame column by column)
    train = _hourly_activity_counts(contributor.iloc[:n_train])
    test = _hourly_activity_counts(contributor.iloc[n_train:])

    # when the split falls inside one hour, that hour appears in both sets:
    # move its train count into the first test hour and drop it from train
    if train['date'].iloc[-1] == test['date'].iloc[0]:
        test.loc[0, 'n_activities'] += train['n_activities'].iloc[-1]
        train = train.iloc[:-1]

    test = gap_activities(train, test)

    # for the train set we only keep the last 3 months
    recent_train = train[train['date'] >= train['date'].max() - pd.DateOffset(months=3)]

    # filling n_activities with zeros for the empty hours between the minimum and maximum date
    train, test = _fill_missing_hours(recent_train), _fill_missing_hours(test)

    # make the hourly frequency explicit on the train index (offset object
    # instead of the 'H' alias, which is deprecated since pandas 2.2)
    train.index.freq = pd.offsets.Hour()

    return train, test

# New evaluation metrics: PGA & CTD

PGA is a new evaluation metric: the proportion (a value between 0 and 1) of predicted values that are greater than or equal to the corresponding actual values. We define it as follows:

$$PGA = \frac{\sum_{i=1}^{n} [y_i \leq \hat{y}_i]}{n}$$

In [8]:
def pga_score(y_true, y_pred):
    """Return the proportion of predictions that are >= the actual values.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of the same shape (lists, NumPy arrays or
        pandas Series). Values are compared by position, like the sklearn
        metrics used alongside this score.

    Returns
    -------
    float
        Fraction of positions where ``y_pred >= y_true``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # convert first so that plain lists are compared element-wise
    # (list >= list is a single lexicographic comparison)
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    return (y_pred >= y_true).mean()

CTD (cumulative time difference) is a novel evaluation metric that quantifies the difference between the times at which the cumulative sums of the true and of the predicted values reach a specified target value.

$$ \text{CTD} = \text{argmax}(C_t \geq T) - \text{argmax}(C_p \geq T) $$

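This formula represents the time difference between the cumulative sums of the true values $C_t$ and of the predicted values $C_p$ in reaching a specified target value $T$ (here 100, 200 or 300), where $\text{argmax}$ returns the time of the first occurrence where the condition is satisfied. In the implementation below, the sign of the result is flipped when the total of either series is below $T$.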

In [187]:
def ctd_score(y_true, y_pred, target_value):
    """Cumulative time difference between the true and the predicted values.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values, in time order.
    target_value : number
        Cumulative total to be reached.

    Returns
    -------
    int
        Position at which the cumulative sum of ``y_true`` first reaches
        ``target_value`` minus the same position for ``y_pred``. The sign is
        flipped when the total of either input is below ``target_value``.
    """
    # NOTE(review): when a cumulative sum never reaches the target, np.argmax
    # returns 0, which cannot be told apart from "reached at the first step";
    # confirm that the sign flip below is the intended handling of that case.
    below_target = (sum(y_true) < target_value) | (sum(y_pred) < target_value)
    sign = -1 if below_target else 1

    def first_hit(values):
        # np.argmax on a boolean array gives the position of the first True
        return np.argmax(np.cumsum(values) >= target_value)

    return sign * (first_hit(y_true) - first_hit(y_pred))

In [188]:
# Worked example: the cumulative sum of the true values first reaches 300 at
# position 9 and that of the predicted values at position 11, so the expected
# score is 9 - 11 = -2.
true_values = [2, 11, 84, 57, 0, 38, 15, 80, 4, 30, 90, 0, 0, 600]
pred_values = [52, 22, 95, 9, 11, 1, 73, 0, 3, 5, 10, 70, 50, 0]

example_ctd = ctd_score(true_values, pred_values, 300)
# fail loudly on "Restart & Run All" if the metric stops matching the example
assert example_ctd == -2, "ctd_score no longer reproduces the worked example"

print("Cumulative Time Difference:", example_ctd)

Cumulative Time Difference: -2


# 1. Autoregressive model

In [234]:
def ar_model(contributor):
    """Fit an autoregressive model for one contributor and evaluate its forecast.

    Parameters
    ----------
    contributor : pandas.DataFrame
        Events of a single contributor, as expected by ``split_activities``.

    Returns
    -------
    pandas.Series
        Contributor name and category, the evaluation metrics (r2, mae, rmse,
        pga, ctd_100/200/300), the number of training activities, the lags
        that were used, and the true and predicted hourly values.
    """
    print(contributor['contributor'].iloc[0])

    # Spliting the data into training and testing sets
    train, test = split_activities(contributor)

    def forecast_with(lags):
        # fit on the training series and forecast exactly the test horizon
        fitted = AutoReg(train['n_activities'], lags=lags).fit()
        return (
            fitted
            .get_prediction(start=len(train), end=len(train) + len(test) - 1)
            .summary_frame(alpha=0.05)
        )

    # Fit the model, falling back to fewer lags when the preferred
    # specification cannot be fitted
    try:
        lags = [1, 12, 24, 168]
        predictions = forecast_with(lags)
    except IndexError:
        lags = [1, 12, 24]
        predictions = forecast_with(lags)
    except Exception:
        # `Exception` rather than a bare `except`, so that KeyboardInterrupt
        # and SystemExit still stop a long run
        lags = int(len(train)/2)-1
        predictions = forecast_with(lags)

    # Create a series for evaluation metrics and sum of activities
    metrics = pd.Series({
        'contributor': contributor['contributor'].iloc[0],
        'category': contributor['category'].iloc[0],
        'r2': r2_score(test['n_activities'], predictions['mean']),
        'mae': mean_absolute_error(test['n_activities'], predictions['mean']),
        'rmse': root_mean_squared_error(test['n_activities'], predictions['mean']),
        'pga': pga_score(test['n_activities'], predictions['mean']),
        'ctd_100': ctd_score(test['n_activities'], predictions['mean'], 100),
        'ctd_200': ctd_score(test['n_activities'], predictions['mean'], 200),
        'ctd_300': ctd_score(test['n_activities'], predictions['mean'], 300),
        'n_activities': train['n_activities'].sum(),
        'lags': lags,
        'true_values': test['n_activities'].values,
        'predicted_values': predictions['mean'].values,
    })

    return metrics

In [235]:
# Apply the function to each contributor.
# The groups are iterated explicitly instead of using `groupby(...).apply`:
# `apply` on the grouping columns is deprecated since pandas 2.2, and
# `ar_model` reads those columns ('category', 'contributor') from each group.
ar_results = pd.DataFrame(
    [ar_model(events) for _, events in data.groupby(['category', 'contributor'])]
)

0crat
AppVeyorBot
Code-Inside-Bot
CrowdinBot
DrahtBot
PJBot
addonsbot
adobe-bot
agones-bot
alluxio-bot
analysis-bot
angular-automatic-lock-bot[bot]
ansibot
apmmachine
aporeto-bot
aws-cdk-automation
backportbot-nextcloud[bot]
ballerina-bot
bedevere-bot
bits-bot
blathers-crl[bot]
bluespice-github-bot
boring-cyborg[bot]
bors-servo
bot-gradle
carsonbot
cf-gitbot
cheminfo-bot
cla-bot[bot]
cmsdmwmbot
codeclimate[bot]
codesandbox[bot]
confs-tech-bot
cozy-bot
cypress-bot[bot]
delete-merged-branch[bot]
devOpsHazelcast
dlang-bot
docker-library-bot
dontcallmedom-bot
dotnet-bot
dotnet-issue-labeler[bot]
dotnet-maestro-bot
edx-requirements-bot
edx-transifex-bot
elife-bot
engine-flutter-autoroll
fire-bot
flinkbot
fluttergithubbot
forking-renovate[bot]
fossabot
garybot2
gatsby-cloud[bot]
getsentry-bot
gitguardian[bot]
gitpod-io[bot]
google-ml-butler[bot]
google-oss-bot
graalvmbot
graviteeio
guardrails[bot]
hashicorp-cla
hasura-bot
hft-team-city
ibmdotcom-bot
ionitron-bot[bot]
istio-policy-bot
jbosstm

In [236]:
# Preview the first rows of the AR metrics table (one row per contributor).
ar_results.head()

Unnamed: 0,contributor,category,r2,mae,rmse,pga,ctd_100,ctd_200,ctd_300,n_activities,lags,true_values,predicted_values
0,0crat,bot,-0.667437,0.87824,1.105066,0.964117,-25,562,2170,299,"[1, 12, 24, 168]","[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.2923226999138115, 0.8512204488572581, 0.972..."
1,AppVeyorBot,bot,-0.014177,0.69351,0.818039,0.633333,31,86,95,300,"[1, 12, 24, 168]","[2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 2, 1, ...","[0.8093615164531129, 0.7579525565061894, 0.682..."
2,Code-Inside-Bot,bot,0.997601,0.008013,0.105915,0.839744,0,0,0,293,"[1, 12, 24, 168]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.3200161450011016e-16, 4.046242507715902e-16..."
3,CrowdinBot,bot,-0.001443,1.070054,4.638953,0.966387,-108,-311,-475,292,"[1, 12, 24, 168]","[0, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[-0.01033771945395856, 0.47303370894842045, 0...."
4,DrahtBot,bot,0.001107,0.515547,0.910024,0.800647,0,10,-927,299,"[1, 12, 24, 168]","[0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, ...","[0.3721921599465237, 0.28226559281655816, 0.41..."


In [237]:
# Persist the AR metrics.
# NOTE(review): the "Models comparing" section reads
# '../models-evaluation/ar_model_metrics_ci.csv', not this file -- confirm
# which of the two paths is the intended one.
ar_results.to_csv('../models-evaluation-v2/ar_model_metrics.csv', index=False)

# 2. Seasonal Autoregressive integrated Moving-average model

In [None]:
def sarima_model(contributor):
    """Fit a SARIMA(1,0,1)x(1,0,1,24) model for one contributor and evaluate its forecast.

    Parameters
    ----------
    contributor : pandas.DataFrame
        Events of a single contributor, as expected by ``split_activities``.

    Returns
    -------
    pandas.Series
        Contributor name and category, r2, mae, rmse, pga, the pga of the
        upper 95% confidence bound, and the number of training activities.
    """
    print(contributor['contributor'].iloc[0])

    # Spliting the data into hourly training and testing sets. The raw events
    # have no 'n_activities' column or hourly index, so the same split as in
    # `ar_model` is used; it also sets the hourly frequency on the train index.
    train, test = split_activities(contributor)

    # Fit the model
    model = SARIMAX(
        train['n_activities'],
        order=(1, 0, 1),
        seasonal_order=(1, 0, 1, 24),
        enforce_invertibility=False,
        enforce_stationarity=False,
    ).fit(disp=False, method='lbfgs')

    # Forecast the test set using confidence interval with 95%
    predictions = model.get_prediction(start=len(train), end=len(train)+len(test)-1).summary_frame(alpha=0.05)

    # Create a series for evaluation metrics and sum of activities
    metrics = pd.Series({
        'contributor': contributor['contributor'].iloc[0],
        'category': contributor['category'].iloc[0],
        'r2': r2_score(test['n_activities'], predictions['mean']),
        'mae': mean_absolute_error(test['n_activities'], predictions['mean']),
        'rmse': root_mean_squared_error(test['n_activities'], predictions['mean']),
        'pga': pga_score(test['n_activities'], predictions['mean']),
        'pga_ci_upper': pga_score(test['n_activities'], predictions['mean_ci_upper']),
        # activities in the training window, as reported by `ar_model`
        'n_activities': train['n_activities'].sum(),
    })

    return metrics

In [None]:
# Apply the function to each contributor.
# The groups are iterated explicitly instead of using `groupby(...).apply`:
# `apply` on the grouping columns is deprecated since pandas 2.2, and
# `sarima_model` reads those columns ('category', 'contributor') from each group.
sarima_results = pd.DataFrame(
    [sarima_model(events) for _, events in data.groupby(['category', 'contributor'])]
)

In [None]:
# Preview the first rows of the SARIMA metrics table (one row per contributor).
sarima_results.head()

In [None]:
# Summary statistics of the per-contributor PGA scores of the SARIMA model.
sarima_results['pga'].describe()

In [None]:
# Persist the SARIMA metrics; this is the file read back in the "Models comparing" section.
sarima_results.to_csv('../models-evaluation/sarima_model_metrics_ci.csv', index=False)

# 3. Unobserved components model

In [None]:
def uc_model(contributor):
    """Fit an unobserved-components model for one contributor and evaluate its forecast.

    The model has a level component and a seasonal component of period 24.

    Parameters
    ----------
    contributor : pandas.DataFrame
        Events of a single contributor, as expected by ``split_activities``.

    Returns
    -------
    pandas.Series
        Contributor name and category, r2, mae, rmse, pga, the pga of the
        upper 95% confidence bound, and the number of training activities.
    """
    print(contributor['contributor'].iloc[0])

    # Spliting the data into hourly training and testing sets. The raw events
    # have no 'n_activities' column or hourly index, so the same split as in
    # `ar_model` is used; it also sets the hourly frequency on the train index.
    train, test = split_activities(contributor)

    # Fit the model
    model = UnobservedComponents(train['n_activities'], level=True, seasonal=24).fit(disp=False, method='lbfgs')

    # Forecast the test set using confidence interval with 95%
    predictions = model.get_prediction(start=len(train), end=len(train)+len(test)-1).summary_frame(alpha=0.05)

    # Create a series for evaluation metrics and sum of activities
    metrics = pd.Series({
        'contributor': contributor['contributor'].iloc[0],
        'category': contributor['category'].iloc[0],
        'r2': r2_score(test['n_activities'], predictions['mean']),
        'mae': mean_absolute_error(test['n_activities'], predictions['mean']),
        'rmse': root_mean_squared_error(test['n_activities'], predictions['mean']),
        'pga': pga_score(test['n_activities'], predictions['mean']),
        'pga_ci_upper': pga_score(test['n_activities'], predictions['mean_ci_upper']),
        # activities in the training window, as reported by `ar_model`
        'n_activities': train['n_activities'].sum(),
    })

    return metrics

In [None]:
# Apply the function to each contributor.
# The groups are iterated explicitly instead of using `groupby(...).apply`:
# `apply` on the grouping columns is deprecated since pandas 2.2, and
# `uc_model` reads those columns ('category', 'contributor') from each group.
uc_results = pd.DataFrame(
    [uc_model(events) for _, events in data.groupby(['category', 'contributor'])]
)

In [None]:
# Preview the first rows of the UC metrics table (one row per contributor).
uc_results.head()

In [None]:
# Summary statistics of the per-contributor RMSE of the UC model.
uc_results['rmse'].describe()

In [None]:
# Persist the UC metrics; this is the file read back in the "Models comparing" section.
uc_results.to_csv('../models-evaluation/uc_model_metrics_ci.csv', index=False)

# 4. Holt-Winters (triple) exponential smoothing model

In [None]:
def tes_model(contributor):
    """Fit a Holt-Winters (ETS) model for one contributor and evaluate its forecast.

    An additive error/trend/seasonal model with a 24-hour season is tried
    first; if it raises ``ValueError`` the non-seasonal additive model is used.

    Parameters
    ----------
    contributor : pandas.DataFrame
        Events of a single contributor, as expected by ``split_activities``.

    Returns
    -------
    pandas.Series
        Contributor name and category, r2, mae, rmse, pga, the pga of the
        upper 95% prediction bound, and the number of training activities.
    """
    print(contributor['contributor'].iloc[0])

    # Spliting the data into hourly training and testing sets. The raw events
    # have no 'n_activities' column or hourly index, so the same split as in
    # `ar_model` is used; it also sets the hourly frequency on the train index.
    train, test = split_activities(contributor)

    # Fit the model. Any error other than ValueError now propagates with its
    # own traceback; previously it was swallowed by a bare `except` and
    # resurfaced below as an unrelated "model is not defined" error.
    try:
        model = ETSModel(train['n_activities'], error='add', trend='add', seasonal='add', seasonal_periods=24).fit(disp=False)
    except ValueError:
        model = ETSModel(train['n_activities'], error='add', trend='add').fit(disp=False)

    # Forecast the test set using prediction interval with 95%
    predictions = model.get_prediction(start=len(train), end=len(train)+len(test)-1).summary_frame(alpha=0.05)

    # Create a series for evaluation metrics and sum of activities
    metrics = pd.Series({
        'contributor': contributor['contributor'].iloc[0],
        'category': contributor['category'].iloc[0],
        'r2': r2_score(test['n_activities'], predictions['mean']),
        'mae': mean_absolute_error(test['n_activities'], predictions['mean']),
        'rmse': root_mean_squared_error(test['n_activities'], predictions['mean']),
        'pga': pga_score(test['n_activities'], predictions['mean']),
        'pga_pi_upper': pga_score(test['n_activities'], predictions['pi_upper']),
        # activities in the training window, as reported by `ar_model`
        'n_activities': train['n_activities'].sum()
    })

    return metrics

In [None]:
# Apply the function to each contributor.
# The groups are iterated explicitly instead of using `groupby(...).apply`:
# `apply` on the grouping columns is deprecated since pandas 2.2, and
# `tes_model` reads those columns ('category', 'contributor') from each group.
tes_results = pd.DataFrame(
    [tes_model(events) for _, events in data.groupby(['category', 'contributor'])]
)

In [None]:
# Preview the first rows of the TES metrics table (one row per contributor).
tes_results.head()

In [None]:
# Summary statistics of the per-contributor RMSE of the TES model.
tes_results['rmse'].describe()

In [None]:
# Persist the TES metrics; this is the file read back in the "Models comparing" section.
tes_results.to_csv('../models-evaluation/tes_model_metrics_pi.csv', index=False)

# Model comparison

In [None]:
# Load the saved metric tables of the four models, in the order
# AR, SARIMA, UC, TES.
# NOTE(review): the AR cell above writes
# '../models-evaluation-v2/ar_model_metrics.csv', while this cell reads
# '../models-evaluation/ar_model_metrics_ci.csv' -- confirm which file the
# comparison is meant to use.
ar_results, sarima_results, uc_results, tes_results = (
    pd.read_csv(metrics_path)
    for metrics_path in (
        '../models-evaluation/ar_model_metrics_ci.csv',
        '../models-evaluation/sarima_model_metrics_ci.csv',
        '../models-evaluation/uc_model_metrics_ci.csv',
        '../models-evaluation/tes_model_metrics_pi.csv',
    )
)

In [None]:
# One (metrics table, panel title) pair per model, in subplot order
# (left to right, top to bottom).
error_panels = [
    (ar_results, 'AutoReg'),
    (sarima_results, 'SARIMA'),
    (uc_results, 'UC'),
    (tes_results, 'TES'),
]

# create subplots
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))

for ax, (results, panel_title) in zip(axs.flat, error_panels):
    # long format: one row per (contributor, metric) with its category
    long_metrics = pd.melt(
        results[['category', 'mae', 'rmse']],
        id_vars=['category'],
        var_name='metric',
        value_name='value',
    )

    # boxenplot of both error metrics, split by contributor category
    sns.boxenplot(ax=ax, x='metric', y='value', hue='category', data=long_metrics, showfliers=False)
    ax.set_title(panel_title)
    ax.set_ylabel('Median')

# set the plot title
plt.suptitle('Boxenplot of the median of MAE and RMSE')

# show the plot
plt.show()


In [None]:
# One (metrics table, PGA columns, panel title) triple per model, in subplot
# order (left to right, top to bottom). The TES table stores the upper bound
# under 'pga_pi_upper', the other tables under 'pga_ci_upper'.
pga_panels = [
    (ar_results, ['pga', 'pga_ci_upper'], 'AutoReg'),
    (sarima_results, ['pga', 'pga_ci_upper'], 'SARIMA'),
    (uc_results, ['pga', 'pga_ci_upper'], 'UC'),
    (tes_results, ['pga', 'pga_pi_upper'], 'TES'),
]

# create subplots
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))

for ax, (results, pga_columns, panel_title) in zip(axs.flat, pga_panels):
    # long format: one row per (contributor, metric) with its category
    long_metrics = pd.melt(
        results[['category'] + pga_columns],
        id_vars=['category'],
        var_name='metric',
        value_name='value',
    )

    # boxenplot of both PGA variants, split by contributor category
    sns.boxenplot(ax=ax, x='metric', y='value', hue='category', data=long_metrics, showfliers=False)
    ax.set_title(panel_title)
    ax.set_ylabel('Median')

# set the plot title
plt.suptitle('Boxenplot of the median of PGA')

# show the plot
plt.show()


In [None]:
# One (metrics table, panel title) pair per model, in subplot order
# (left to right, top to bottom).
r2_panels = [
    (ar_results, 'AutoReg'),
    (sarima_results, 'SARIMA'),
    (uc_results, 'UC'),
    (tes_results, 'TES'),
]

# create subplots
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))

for ax, (results, panel_title) in zip(axs.flat, r2_panels):
    # long format: one row per contributor with its category and R2 value
    long_metrics = pd.melt(
        results[['category', 'r2']],
        id_vars=['category'],
        var_name='metric',
        value_name='value',
    )

    # boxenplot of R2, split by contributor category
    sns.boxenplot(ax=ax, x='metric', y='value', hue='category', data=long_metrics, showfliers=False)
    ax.set_title(panel_title)
    ax.set_ylabel('Median')

# set the plot title
plt.suptitle('Boxenplot of the median of R2')

# show the plot
plt.show()
