In this notebook, I compare different forecasting models, starting from simple linear models and moving on to ARIMA models.

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Load the raw activity log from the parquet file (relative path).
activities = pd.read_parquet('../data-raw/activities.parquet')

In [3]:
# Preview the raw frame to check its columns and the range of dates it covers.
activities

Unnamed: 0,contributor,category,repository,activity,date
0,AmplabJenkins,bot,apache/spark,Commenting pull request,2022-11-25 09:55:19+00:00
1,AmplabJenkins,bot,apache/spark,Commenting pull request,2022-11-25 09:55:23+00:00
2,AmplabJenkins,bot,apache/spark,Commenting pull request,2022-11-25 09:55:26+00:00
3,analysis-bot,bot,facebook/react-native,Commenting pull request,2022-11-25 09:55:27+00:00
4,neos-bot,bot,neos/neos-ui-compiled,Pushing commits,2022-11-25 09:55:47+00:00
...,...,...,...,...,...
1015418,798388,human,879434,Reviewing code,2023-04-15 16:06:15+00:00
1015419,798388,human,879434,Reviewing code,2023-04-15 16:07:26+00:00
1015420,784775,human,643744,Creating branch,2023-04-15 16:07:33+00:00
1015421,784775,human,888378,Opening pull request,2023-04-15 16:08:07+00:00


# Naive model

The naive forecast uses the observation from the same weekday and hour of the previous week to predict the next time step (one week of data is used for training and the following week for testing).

In [4]:
# Hourly activity counts per contributor over a two-week window.
activities_by_time = (
    activities
    # keep only the two weeks [2022-12-01, 2022-12-15)
    .loc[lambda events: (events['date'] >= '2022-12-01 00:00:00+00:00')
                        & (events['date'] < '2022-12-15 00:00:00+00:00')]
    # truncate every timestamp to its hour; the callable formats only the
    # filtered rows instead of the whole raw frame
    .assign(datetime=lambda events: events['date'].dt.strftime('%Y-%m-%d %H:00:00'))
    .groupby(['contributor', 'category', 'datetime'])['activity']
    .count()
    # the unstack/stack round-trip adds an explicit 0 for each hour in which a
    # contributor had no activity (only hours that occur somewhere in the data)
    .unstack(fill_value=0)
    .stack()
    .reset_index(name='n_activities')
)

In [5]:
# Show the first 48 rows (two days of hourly counts, if every hour is present).
activities_by_time.head(48)

Unnamed: 0,contributor,category,datetime,n_activities
0,0crat,bot,2022-12-01 00:00:00,5
1,0crat,bot,2022-12-01 01:00:00,0
2,0crat,bot,2022-12-01 02:00:00,5
3,0crat,bot,2022-12-01 03:00:00,8
4,0crat,bot,2022-12-01 04:00:00,0
5,0crat,bot,2022-12-01 05:00:00,0
6,0crat,bot,2022-12-01 06:00:00,0
7,0crat,bot,2022-12-01 07:00:00,0
8,0crat,bot,2022-12-01 08:00:00,0
9,0crat,bot,2022-12-01 09:00:00,2


In [27]:
# First week -> training set, second week -> test set. The hour labels are
# zero-padded strings, so comparing them as text follows chronological order.
train_activities = (
    activities_by_time
    .loc[lambda hourly: hourly['datetime'] < '2022-12-08 00:00:00']
    .reset_index(drop=True)
)
test_activities = (
    activities_by_time
    .loc[lambda hourly: hourly['datetime'] >= '2022-12-08 00:00:00']
    .reset_index(drop=True)
)

In [29]:
# Sanity check: each split covers one week, so the two row counts should match.
len(train_activities), len(test_activities)

(132048, 132048)

In [30]:
# Naive forecast: the prediction for each (contributor, category, hour) is the
# count observed exactly one week earlier. The values are joined on their keys
# rather than on row position, so a missing or re-ordered row cannot silently
# shift every following prediction; an unmatched test row gets NaN instead.
naive_forecast = (
    train_activities
    .assign(
        # move every training hour forward by one week so it lines up with the
        # test hour it is meant to predict
        datetime=lambda week: (
            pd.to_datetime(week['datetime']) + pd.Timedelta(weeks=1)
        ).dt.strftime('%Y-%m-%d %H:00:00')
    )
    .rename(columns={'n_activities': 'p_activities'})
    .loc[:, ['contributor', 'category', 'datetime', 'p_activities']]
)
test_activities = (
    test_activities
    # drop a previous prediction so the cell can be re-run safely
    .drop(columns='p_activities', errors='ignore')
    .merge(
        naive_forecast,
        on=['contributor', 'category', 'datetime'],
        how='left',             # keep every test row, in its original order
        validate='one_to_one',  # fail loudly if a key is duplicated
    )
)

In [62]:
# Spot-check a few rows; the fixed seed makes the displayed sample reproducible.
test_activities.sample(10, random_state=42)

Unnamed: 0,contributor,category,datetime,n_activities,p_activities
101020,google-oss-robot,bot,2022-12-10 04:00:00,0,0
108035,mesosphere-ci,bot,2022-12-08 11:00:00,0,0
126514,thewca-bot,bot,2022-12-08 10:00:00,0,0
90246,chainer-ci,bot,2022-12-09 06:00:00,0,0
42130,689788,human,2022-12-13 10:00:00,0,0
123021,sourcegraph-bot,bot,2022-12-09 21:00:00,14,9
77165,987894,human,2022-12-10 05:00:00,0,0
119894,rultor,bot,2022-12-12 14:00:00,8,10
46278,744399,human,2022-12-11 06:00:00,0,0
126589,thewca-bot,bot,2022-12-11 13:00:00,0,0


In [74]:
def calculate_metrics(actual, predicted):
    """Summarise how well ``predicted`` matches ``actual``.

    Parameters
    ----------
    actual : observed values; must be accepted by the scikit-learn metric
        functions and provide a ``.sum()`` method (e.g. a pandas Series).
    predicted : forecast values, paired element-wise with ``actual``.

    Returns
    -------
    pd.Series
        Entries ``r2``, ``mae`` and ``mse`` (R² score, mean absolute error and
        mean squared error) plus ``n_activities``, the sum of ``actual``.
    """
    scores = {
        'r2': r2_score(actual, predicted),
        'mae': mean_absolute_error(actual, predicted),
        'mse': mean_squared_error(actual, predicted),
        'n_activities': actual.sum(),
    }
    return pd.Series(scores)

# Score the forecast separately for every (contributor, category) pair.
# Selecting the two value columns before .apply keeps the grouping keys out of
# the frame passed to the lambda; applying over the grouping columns is
# deprecated in recent pandas versions.
result = (
    test_activities
    .groupby(['contributor', 'category'])[['n_activities', 'p_activities']]
    .apply(lambda group: calculate_metrics(group['n_activities'], group['p_activities']))
    .reset_index()
)

In [82]:
# Inspect a random subset of the per-contributor metrics; the fixed seed keeps
# the displayed rows stable across runs.
result.sample(40, random_state=42)

Unnamed: 0,contributor,category,r2,mae,mse,n_activities
579,explosion-bot,bot,-0.091049,0.160714,0.60119,24.0
612,hsimbot,bot,-1.090762,0.595238,6.988095,51.0
758,tykbot[bot],bot,-1.598466,0.869048,5.892857,71.0
2,333363,human,-0.212996,0.02381,0.035714,3.0
433,968694,human,-0.005988,0.005952,0.005952,1.0
219,658863,human,-0.492007,0.095238,0.119048,10.0
164,567396,human,0.0,0.166667,0.583333,0.0
25,363447,human,-0.140569,0.386905,1.755952,54.0
247,679979,human,-6.008589,0.083333,0.202381,5.0
348,849576,human,-4.094211,0.14881,1.113095,7.0
