In this notebook, I compare different forecasting models, starting from simple linear models and moving on to ARIMA models.

In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [2]:
# Raw activity log; the path is relative to the notebook's working directory.
ACTIVITIES_PATH = '../data-raw/activities.parquet'
activities = pd.read_parquet(ACTIVITIES_PATH)

In [None]:
# Preview only the first rows instead of rendering the whole raw table.
activities.head()

# Naive model

The naive forecast uses the previous observation at the same weekday and hour to predict the next time step (one week for training and one week for testing).

In [13]:
activities_by_time = (
    # keep only the two weeks used by the naive model: [2023-02-01, 2023-02-15)
    activities[(activities['date'] >= '2023-02-01 00:00:00+00:00') & (activities['date'] < '2023-02-15 00:00:00+00:00')]
    # bucket every event into its hour; the callable is evaluated on the filtered
    # rows only, so the full table is not formatted just to be thrown away
    .assign(datetime=lambda window: window['date'].dt.strftime('%Y-%m-%d %H:00:00'))
    .groupby(['contributor', 'category', 'datetime'])
    .activity.count()
    # pivot the hours into columns and back so that every (contributor, category)
    # pair gets a row with count 0 for each hour present in the window.
    # NOTE: an hour in which nobody at all was active produces no column, so it
    # would be missing for every contributor.
    .unstack(fill_value=0).stack()
    .reset_index(name='n_activities')
)

In [14]:
# NOTE(review): this filter selects February 2022, whereas the aggregation cell
# selects February 2023, and the recorded output of this cell is empty.
# The year looks like a typo -- confirm before relying on this check.
activities[(activities['date'] >= '2022-02-01 00:00:00+00:00') & (activities['date'] < '2022-02-15 00:00:00+00:00')]

Unnamed: 0,contributor,category,repository,activity,date


In [15]:
# Show the first 48 hourly rows of the aggregated counts.
activities_by_time.head(48)

Unnamed: 0,contributor,category,datetime,n_activities
0,0crat,bot,2023-02-01 00:00:00,0
1,0crat,bot,2023-02-01 01:00:00,0
2,0crat,bot,2023-02-01 02:00:00,0
3,0crat,bot,2023-02-01 03:00:00,0
4,0crat,bot,2023-02-01 04:00:00,0
5,0crat,bot,2023-02-01 05:00:00,0
6,0crat,bot,2023-02-01 06:00:00,0
7,0crat,bot,2023-02-01 07:00:00,0
8,0crat,bot,2023-02-01 08:00:00,0
9,0crat,bot,2023-02-01 09:00:00,0


In [16]:
# Split at the start of the second week: earlier hours train, later hours test.
# 'datetime' holds zero-padded strings, so plain string comparison orders them.
split_at = '2023-02-08 00:00:00'
before_split = activities_by_time['datetime'] < split_at
from_split_on = activities_by_time['datetime'] >= split_at

train_activities = activities_by_time.loc[before_split].reset_index(drop=True)
test_activities = activities_by_time.loc[from_split_on].reset_index(drop=True)

In [17]:
# Row counts of the two splits, shown as (train, test).
len(train_activities), len(test_activities)

(128856, 128856)

In [18]:
# Naive forecast: the prediction for a test row is the count in the training row
# at the same position. That pairing is purely positional, so fail loudly if the
# two frames do not line up instead of silently predicting from the wrong row.
assert len(train_activities) == len(test_activities), 'train/test weeks differ in length'
assert (
    train_activities[['contributor', 'category']].to_numpy()
    == test_activities[['contributor', 'category']].to_numpy()
).all(), 'train/test rows are not ordered by the same contributors'
assert (
    pd.to_datetime(train_activities['datetime']) + pd.Timedelta(days=7)
    == pd.to_datetime(test_activities['datetime'])
).all(), 'test rows are not exactly one week after their train rows'

# Positional assignment (no index alignment) now that the pairing is verified.
test_activities['p_activities'] = train_activities['n_activities'].to_numpy()

In [19]:
# Spot-check a few predictions; the fixed seed makes the same rows appear on every run.
test_activities.sample(10, random_state=42)

Unnamed: 0,contributor,category,datetime,n_activities,p_activities
121780,status-im-auto,bot,2023-02-14 04:00:00,1,0
30573,579369,human,2023-02-14 21:00:00,0,0
63484,869378,human,2023-02-14 04:00:00,0,0
91196,cla-bot-2021[bot],bot,2023-02-13 20:00:00,0,0
100455,gitter-badger,bot,2023-02-14 15:00:00,0,0
103605,jbosstm-bot,bot,2023-02-12 21:00:00,0,0
127609,wet-boew-bot,bot,2023-02-12 01:00:00,0,0
55160,783656,human,2023-02-10 08:00:00,0,0
59554,839884,human,2023-02-11 10:00:00,0,0
100361,gitter-badger,bot,2023-02-10 17:00:00,0,0


In [20]:
def calculate_metrics(actual, predicted):
    """Score one series of predictions against the observed values.

    Parameters
    ----------
    actual : observed values; must support ``.sum()``.
    predicted : forecast values paired with ``actual``.

    Returns
    -------
    pd.Series
        Entries ``r2``, ``mae``, ``mse`` and ``n_activities`` (the sum of
        ``actual``), in that order.
    """
    scorers = {
        'r2': r2_score,
        'mae': mean_absolute_error,
        'mse': mean_squared_error,
    }
    metrics = {label: scorer(actual, predicted) for label, scorer in scorers.items()}
    metrics['n_activities'] = actual.sum()
    return pd.Series(metrics)

# Score the naive forecast separately for every (contributor, category) pair;
# reset_index turns the group keys back into ordinary columns.
result = (
    test_activities
    .groupby(['contributor', 'category'])
    .apply(lambda rows: calculate_metrics(rows['n_activities'], rows['p_activities']))
    .reset_index()
)

In [21]:
# Rank contributors by r2, best first, and show the top 40.
result.sort_values(by='r2', ascending=False).head(40)

Unnamed: 0,contributor,category,r2,mae,mse,n_activities
750,vscode-issue-tracker-bot,bot,1.0,0.0,0.0,168.0
521,aws-sdk-go-automation,bot,1.0,0.0,0.0,25.0
663,octokit-fixture-user-a,bot,1.0,0.0,0.0,259.0
485,Code-Inside-Bot,bot,1.0,0.0,0.0,84.0
658,ninjadotorg-bot,bot,1.0,0.0,0.0,168.0
683,pxw-bot,bot,1.0,0.0,0.0,14.0
739,translatewiki,bot,0.974459,0.083333,1.011905,116.0
572,edx-transifex-bot,bot,0.950579,0.113095,0.39881,77.0
571,edx-requirements-bot,bot,0.85671,1.357143,17.214286,467.0
685,pytorchbot,bot,0.850541,0.25,0.440476,118.0


In [22]:
# Save the naive-model metrics as CSV, without writing the row index.
result.to_csv('../eval/naive_model_metrics.csv', index=False)

# Regression Model

In [3]:
# Hourly activity counts over the whole history: one row per contributor,
# category and hour, with 0 for hours in which that contributor was idle.
hourly_counts = (
    activities
    .assign(datetime=lambda frame: frame['date'].dt.strftime('%Y-%m-%d %H:00:00'))
    .groupby(['contributor', 'category', 'datetime'])['activity']
    .count()
)
activities_by_time = (
    hourly_counts
    .unstack(fill_value=0)  # hours become columns; absent combinations get 0
    .stack()                # back to long format, now with the zero rows
    .reset_index(name='n_activities')
)

In [4]:
# Display the hourly counts (pandas truncates the rendering of long frames).
activities_by_time

Unnamed: 0,contributor,category,datetime,n_activities
0,0crat,bot,2022-11-25 09:00:00,0
1,0crat,bot,2022-11-25 10:00:00,0
2,0crat,bot,2022-11-25 11:00:00,1
3,0crat,bot,2022-11-25 12:00:00,1
4,0crat,bot,2022-11-25 13:00:00,6
...,...,...,...,...
3290235,zorro-bot[bot],bot,2023-04-15 12:00:00,0
3290236,zorro-bot[bot],bot,2023-04-15 13:00:00,0
3290237,zorro-bot[bot],bot,2023-04-15 14:00:00,0
3290238,zorro-bot[bot],bot,2023-04-15 15:00:00,0


In [5]:
#  create lag features
def create_lag_features(group, n_previous_times=168):
    """Append lagged copies of ``n_activities`` to one contributor's rows.

    For every ``i`` in ``1..n_previous_times`` a column ``n_activities_lag_{i}``
    is added that holds ``n_activities`` shifted down by ``i`` rows. Lags are
    row-based, so ``group`` is expected to be in chronological order already.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single series; must contain an ``n_activities`` column.
        The frame is not modified.
    n_previous_times : int, default 168
        Number of lag columns to create (168 rows = one week of hourly rows).

    Returns
    -------
    pd.DataFrame
        ``group`` plus the lag columns, without the first ``n_previous_times``
        rows (whose lags are incomplete), re-indexed from 0.

    Raises
    ------
    ValueError
        If ``n_previous_times`` is negative.
    """
    if n_previous_times < 0:
        raise ValueError('n_previous_times must be non-negative')

    counts = group['n_activities']
    # Build every lag column first and attach them in a single concat: inserting
    # them one at a time fragments the frame and mutates the caller's group.
    lag_columns = pd.DataFrame(
        {f'n_activities_lag_{i}': counts.shift(i) for i in range(1, n_previous_times + 1)},
        index=group.index,
    )
    lagged = pd.concat([group, lag_columns], axis=1)
    return lagged.iloc[n_previous_times:].reset_index(drop=True)

In [None]:
# Build the lag columns independently within each (contributor, category)
# series, then flatten the per-group results into one frame.
laged_activities = (
    activities_by_time
    .groupby(['contributor', 'category'])
    .apply(create_lag_features)
    .reset_index(drop=True)
)

In [7]:
# Preview the keys, the target and the three most recent lags.
preview_columns = ['contributor', 'category', 'datetime', 'n_activities']
preview_columns += [f'n_activities_lag_{lag}' for lag in (1, 2, 3)]
laged_activities[preview_columns].head(20)

Unnamed: 0,contributor,category,datetime,n_activities,n_activities_lag_1,n_activities_lag_2,n_activities_lag_3
0,0crat,bot,2022-12-02 09:00:00,0,5.0,4.0,4.0
1,0crat,bot,2022-12-02 10:00:00,2,0.0,5.0,4.0
2,0crat,bot,2022-12-02 11:00:00,0,2.0,0.0,5.0
3,0crat,bot,2022-12-02 12:00:00,0,0.0,2.0,0.0
4,0crat,bot,2022-12-02 13:00:00,0,0.0,0.0,2.0
5,0crat,bot,2022-12-02 14:00:00,0,0.0,0.0,0.0
6,0crat,bot,2022-12-02 15:00:00,0,0.0,0.0,0.0
7,0crat,bot,2022-12-02 16:00:00,0,0.0,0.0,0.0
8,0crat,bot,2022-12-02 17:00:00,0,0.0,0.0,0.0
9,0crat,bot,2022-12-02 18:00:00,0,0.0,0.0,0.0


In [8]:
# Function to evaluate the model and return metrics for each contributor
def evaluate_metrics(group_data, split_datetime='2023-04-01 00:00:00'):
    """Fit a linear regression on one contributor's lagged counts and score it.

    Rows whose ``datetime`` string compares below ``split_datetime`` form the
    training set; the remaining rows form the test set. Every column other than
    ``contributor``, ``category``, ``datetime`` and ``n_activities`` is used as
    a feature, and ``n_activities`` is the target.

    Parameters
    ----------
    group_data : pd.DataFrame
        Rows of a single (contributor, category) group, including the feature
        columns.
    split_datetime : str, default '2023-04-01 00:00:00'
        First timestamp of the test period, in the same string format as the
        ``datetime`` column (the comparison is a string comparison).

    Returns
    -------
    pd.Series
        ``contributor``, ``category``, ``r2``, ``mae``, ``mse`` and
        ``n_activities`` (sum of the target over the test rows). The three
        scores are NaN when the group has no training rows or no test rows.
    """
    non_feature_columns = ['contributor', 'category', 'datetime', 'n_activities']

    train_data = group_data[group_data['datetime'] < split_datetime].reset_index(drop=True)
    test_data = group_data[group_data['datetime'] >= split_datetime].reset_index(drop=True)

    if train_data.empty or test_data.empty:
        # Nothing to fit or nothing to score: report missing scores for this
        # group rather than aborting the evaluation of every other group.
        r2 = mae = mse = float('nan')
    else:
        # Separate features and target variable
        X_train = train_data.drop(non_feature_columns, axis=1)
        y_train = train_data['n_activities']

        X_test = test_data.drop(non_feature_columns, axis=1)
        y_test = test_data['n_activities']

        # Train the time series forecasting model with multiple linear regression
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Make predictions on the test set
        predictions = model.predict(X_test)

        # Evaluate the model
        r2 = r2_score(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        mse = mean_squared_error(y_test, predictions)

    # Collect the evaluation metrics and the sum of activities in the test period
    evaluation_metrics = pd.Series({
        'contributor': group_data['contributor'].iloc[0],  # Use the first value since it's the same for the group
        'category': group_data['category'].iloc[0],  # Use the first value since it's the same for the group
        'r2': r2,
        'mae': mae,
        'mse': mse,
        'n_activities': test_data['n_activities'].sum()
    })

    return evaluation_metrics


In [9]:
# Fit and score one regression per (contributor, category) group; each call
# yields one row of metrics, and the group-key index is dropped afterwards.
result = (
    laged_activities
    .groupby(['contributor', 'category'])
    .apply(evaluate_metrics)
    .reset_index(drop=True)
)

In [10]:
# Rank contributors by r2, best first, and show the top 40.
result.sort_values(by='r2', ascending=False).head(40)

Unnamed: 0,contributor,category,r2,mae,mse,n_activities
727,doorkeeper-bot,bot,1.0,0.0,0.0,0
917,staticman-net[bot],bot,1.0,0.0,0.0,0
487,875556,human,1.0,0.0,0.0,0
704,close-issue-app[bot],bot,1.0,0.0,0.0,0
218,575339,human,1.0,0.0,0.0,0
155,483968,human,1.0,0.0,0.0,0
642,LottieSnapshotBot,bot,1.0,0.0,0.0,0
869,pxw-bot,bot,0.999995,0.00087,7.76349e-07,30
630,Code-Inside-Bot,bot,0.996156,0.042385,0.01808059,176
843,octokit-fixture-user-a,bot,0.909236,0.1292,0.3219482,130


In [12]:
# Save the regression-model metrics as CSV, without writing the row index.
result.to_csv('../eval/reg_model_metrics.csv', index=False)