# Libraries and data importing

Importing packages

In [1]:
import pandas as pd

In [None]:
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing

Importing data

In [2]:
# Load the raw activity log (one row per contributor activity) from a path
# relative to the notebook's working directory.
activities = pd.read_parquet('../data-raw/activities.parquet')
# Bare last expression so the notebook renders a preview of the frame.
activities

Unnamed: 0,contributor,category,repository,activity,date
0,AmplabJenkins,bot,apache/spark,Commenting pull request,2022-11-25 09:55:19+00:00
1,AmplabJenkins,bot,apache/spark,Commenting pull request,2022-11-25 09:55:23+00:00
2,AmplabJenkins,bot,apache/spark,Commenting pull request,2022-11-25 09:55:26+00:00
3,analysis-bot,bot,facebook/react-native,Commenting pull request,2022-11-25 09:55:27+00:00
4,neos-bot,bot,neos/neos-ui-compiled,Pushing commits,2022-11-25 09:55:47+00:00
...,...,...,...,...,...
1015418,798388,human,879434,Reviewing code,2023-04-15 16:06:15+00:00
1015419,798388,human,879434,Reviewing code,2023-04-15 16:07:26+00:00
1015420,784775,human,643744,Creating branch,2023-04-15 16:07:33+00:00
1015421,784775,human,888378,Opening pull request,2023-04-15 16:08:07+00:00


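Keep only the activities from the last 3 months of the data (counted back from the most recent date), retain at most the last 300 activities of each contributor, count the activities per contributor and hour, and keep only the contributors that were active in at least 10 different hours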

In [3]:
# Hourly activity counts per contributor, restricted to recent and
# sufficiently active contributors.
thresholded_activities = (
    activities
    # keep the window that starts 3 calendar months before the latest activity
    .loc[lambda df: df['date'] >= df['date'].max() - pd.DateOffset(months=3)]
    # cap every contributor at their last 300 rows, in the frame's row order
    # NOTE(review): "last" means "most recent" only if rows are sorted by date - confirm
    .groupby('contributor')
    .tail(300)
    # number of activities per (category, hour, contributor)
    .groupby(['category', pd.Grouper(key='date', freq='H'), 'contributor'])['activity']
    .count()
    .reset_index(name='n_activities')
    # keep contributors that were active in at least 10 different hours
    .loc[lambda counts: counts.groupby('contributor')['contributor'].transform('size') >= 10]
)
thresholded_activities

Unnamed: 0,category,date,contributor,n_activities
0,bot,2023-01-15 16:00:00+00:00,AmplabJenkins,1
1,bot,2023-01-15 17:00:00+00:00,apollo-cla,1
2,bot,2023-01-15 17:00:00+00:00,stickler-ci[bot],1
3,bot,2023-01-15 18:00:00+00:00,AmplabJenkins,2
5,bot,2023-01-15 18:00:00+00:00,mysql-oca-bot,1
...,...,...,...,...
53322,human,2023-04-15 15:00:00+00:00,947579,6
53323,human,2023-04-15 16:00:00+00:00,387854,2
53324,human,2023-04-15 16:00:00+00:00,668373,1
53325,human,2023-04-15 16:00:00+00:00,784775,4


Fill `n_activities` with zeros for the hours without activity between the first and the last active hour of each contributor, so that every contributor has a gap-free hourly series, and keep the contributor's own `category` and `contributor` values on every row

In [4]:
# Build a gap-free hourly series per contributor: every hour between a
# contributor's first and last active hour gets a row, with n_activities = 0
# for the hours that had no activity.
#
# Only the numeric 'n_activities' column is resampled. The previous version
# summed the whole frame (string columns included) inside groupby.apply and
# then patched 'category'/'contributor' back from the first row; that relied on
# string concatenation in sum() and on the grouping columns being passed to the
# applied function. Here the group keys are taken from the resulting index.
data = (
    thresholded_activities
    .set_index('date')
    .groupby(['category', 'contributor'])['n_activities']
    # hourly bins per contributor; sum() yields 0 for empty bins, so no fillna is needed
    .resample('H')
    .sum()
    # move the group keys back into columns, leaving 'date' as the index
    .reset_index(['category', 'contributor'])
    .rename_axis(None)
)
data

Unnamed: 0,category,contributor,n_activities
2023-01-18 10:00:00+00:00,bot,0crat,1
2023-01-18 11:00:00+00:00,bot,0crat,12
2023-01-18 12:00:00+00:00,bot,0crat,3
2023-01-18 13:00:00+00:00,bot,0crat,6
2023-01-18 14:00:00+00:00,bot,0crat,2
...,...,...,...
2023-04-06 05:00:00+00:00,human,999769,0
2023-04-06 06:00:00+00:00,human,999769,0
2023-04-06 07:00:00+00:00,human,999769,2
2023-04-06 08:00:00+00:00,human,999769,0


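Splitting the data into training and testing sets for time series forecasting, using a time-based split per contributor: the first 90% of each contributor's hourly series is used for training and the remaining 10% for testing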

In [8]:
# Fraction of each contributor's hourly series (earliest rows first) used for training.
TRAIN_FRACTION = 0.9

# Time-based split per (category, contributor) series: the first
# int(TRAIN_FRACTION * len(series)) rows go to `train`, the rest to `test`.
# A single vectorised mask replaces the two groupby.apply passes, so the split
# point is defined once and no grouping columns are passed through apply.
by_contributor = data.groupby(['category', 'contributor'])
# 0-based position of every row inside its own contributor's series
position_in_series = by_contributor.cumcount().to_numpy()
# number of leading rows of each series that belong to the training set
train_length = (
    (by_contributor['n_activities'].transform('size') * TRAIN_FRACTION)
    .astype(int)  # truncates like int(), matching the original cutoff
    .to_numpy()
)
is_train = position_in_series < train_length
# Rows keep the order they have in `data` (already grouped by category/contributor).
train, test = data[is_train], data[~is_train]

In [16]:
# Preview the training partition (the leading 90% of each contributor's series).
train

Unnamed: 0,category,contributor,n_activities
2023-01-18 10:00:00+00:00,bot,0crat,1
2023-01-18 11:00:00+00:00,bot,0crat,12
2023-01-18 12:00:00+00:00,bot,0crat,3
2023-01-18 13:00:00+00:00,bot,0crat,6
2023-01-18 14:00:00+00:00,bot,0crat,2
...,...,...,...
2023-03-29 14:00:00+00:00,human,999769,0
2023-03-29 15:00:00+00:00,human,999769,0
2023-03-29 16:00:00+00:00,human,999769,0
2023-03-29 17:00:00+00:00,human,999769,0


In [17]:
# Preview the testing partition (the trailing part of each contributor's series).
test

Unnamed: 0,category,contributor,n_activities
2023-03-24 10:00:00+00:00,bot,0crat,0
2023-03-24 11:00:00+00:00,bot,0crat,0
2023-03-24 12:00:00+00:00,bot,0crat,0
2023-03-24 13:00:00+00:00,bot,0crat,0
2023-03-24 14:00:00+00:00,bot,0crat,0
...,...,...,...
2023-04-06 05:00:00+00:00,human,999769,0
2023-04-06 06:00:00+00:00,human,999769,0
2023-04-06 07:00:00+00:00,human,999769,2
2023-04-06 08:00:00+00:00,human,999769,0


# 1. Autoregressive model

# 2. Moving-average model

# 3. Autoregressive Moving-average model

# 4. Autoregressive integrated Moving-average model

# 5. Seasonal Autoregressive integrated Moving-average with eXogenous regressors model

# 6. Vector Autoregressive model

# 7. Vector Autoregressive Moving-average with eXogenous regressors model

# 8. Unobserved components model

# 9. Dynamic factor models

# 10. Simple exponential smoothing

# 11. Holt’s linear exponential smoothing

# 12. Holt-Winters exponential smoothing