In this notebook, I will compare between difference forcasting models, starting from simple linear to arima models.

In [145]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [91]:
activities = pd.read_parquet('../data-raw/activities.parquet')

In [92]:
activities

Unnamed: 0,contributor,category,repository,activity,date
0,AmplabJenkins,bot,apache/spark,Commenting pull request,2022-11-25 09:55:19+00:00
1,AmplabJenkins,bot,apache/spark,Commenting pull request,2022-11-25 09:55:23+00:00
2,AmplabJenkins,bot,apache/spark,Commenting pull request,2022-11-25 09:55:26+00:00
3,analysis-bot,bot,facebook/react-native,Commenting pull request,2022-11-25 09:55:27+00:00
4,neos-bot,bot,neos/neos-ui-compiled,Pushing commits,2022-11-25 09:55:47+00:00
...,...,...,...,...,...
1015418,798388,human,879434,Reviewing code,2023-04-15 16:06:15+00:00
1015419,798388,human,879434,Reviewing code,2023-04-15 16:07:26+00:00
1015420,784775,human,643744,Creating branch,2023-04-15 16:07:33+00:00
1015421,784775,human,888378,Opening pull request,2023-04-15 16:08:07+00:00


# Naive model

We forecast involves using the previous observations (same date, same time) to predict the next time step (one week for train and w=one for test)

In [94]:
activities_by_time = (
    # extract data just for 2 weeks
    activities[(activities['date'] >= '2023-02-01 00:00:00+00:00') & (activities['date'] < '2023-02-15 00:00:00+00:00')]
    .assign(datetime=activities['date'].dt.strftime('%Y-%m-%d %H:00:00'))
    .groupby(['contributor', 'category', 'datetime'])
    .activity.count()
    .unstack(fill_value=0).stack()
    .reset_index(name='n_activities')
)

In [93]:
activities[(activities['date'] >= '2022-02-01 00:00:00+00:00') & (activities['date'] < '2022-02-15 00:00:00+00:00')]

Unnamed: 0,contributor,category,repository,activity,date


In [95]:
activities_by_time.head(48)

Unnamed: 0,contributor,category,datetime,n_activities
0,0crat,bot,2023-02-01 00:00:00,0
1,0crat,bot,2023-02-01 01:00:00,0
2,0crat,bot,2023-02-01 02:00:00,0
3,0crat,bot,2023-02-01 03:00:00,0
4,0crat,bot,2023-02-01 04:00:00,0
5,0crat,bot,2023-02-01 05:00:00,0
6,0crat,bot,2023-02-01 06:00:00,0
7,0crat,bot,2023-02-01 07:00:00,0
8,0crat,bot,2023-02-01 08:00:00,0
9,0crat,bot,2023-02-01 09:00:00,0


In [96]:
train_activities = activities_by_time[activities_by_time['datetime'] < '2023-02-08 00:00:00'].reset_index(drop=True)
test_activities = activities_by_time[activities_by_time['datetime'] >= '2023-02-08 00:00:00'].reset_index(drop=True)

In [97]:
len(train_activities), len(test_activities)

(128856, 128856)

In [98]:
test_activities['p_activities'] = train_activities['n_activities']

In [104]:
test_activities.sample(10)

Unnamed: 0,contributor,category,datetime,n_activities,p_activities
5618,366635,human,2023-02-11 02:00:00,0,0
82753,PJBot,bot,2023-02-12 01:00:00,0,0
40712,669693,human,2023-02-10 08:00:00,0,0
100637,google-oss-bot,bot,2023-02-08 05:00:00,21,10
85701,angular-automatic-lock-bot[bot],bot,2023-02-08 21:00:00,0,0
97918,flinkbot,bot,2023-02-13 22:00:00,1,1
89553,bot-gradle,bot,2023-02-08 09:00:00,1,7
4864,364454,human,2023-02-14 16:00:00,0,0
56470,788349,human,2023-02-08 22:00:00,0,0
75003,974364,human,2023-02-11 03:00:00,0,0


In [105]:
def calculate_metrics(actual, predicted):
    r2 = r2_score(actual, predicted)
    mae = mean_absolute_error(actual, predicted) 
    mse = mean_squared_error(actual, predicted)
    sum_n_activities = actual.sum()
    return pd.Series({'r2': r2, 'mae': mae, 'mse': mse, 'n_activities':sum_n_activities})

# Group by contributor and calculate metrics
result = test_activities.groupby(['contributor', 'category']).apply(lambda x: calculate_metrics(x['n_activities'], x['p_activities'])).reset_index()

In [109]:
result.sort_values(by='r2', ascending=False).head(40)

Unnamed: 0,contributor,category,r2,mae,mse,n_activities
750,vscode-issue-tracker-bot,bot,1.0,0.0,0.0,168.0
521,aws-sdk-go-automation,bot,1.0,0.0,0.0,25.0
663,octokit-fixture-user-a,bot,1.0,0.0,0.0,259.0
485,Code-Inside-Bot,bot,1.0,0.0,0.0,84.0
658,ninjadotorg-bot,bot,1.0,0.0,0.0,168.0
683,pxw-bot,bot,1.0,0.0,0.0,14.0
739,translatewiki,bot,0.974459,0.083333,1.011905,116.0
572,edx-transifex-bot,bot,0.950579,0.113095,0.39881,77.0
571,edx-requirements-bot,bot,0.85671,1.357143,17.214286,467.0
685,pytorchbot,bot,0.850541,0.25,0.440476,118.0


# Regression Model

In [113]:
activities_by_time = (
    activities
    .assign(datetime=activities['date'].dt.strftime('%Y-%m-%d %H:00:00'))
    .groupby(['contributor', 'category', 'datetime'])
    .activity.count()
    .unstack(fill_value=0).stack()
    .reset_index(name='n_activities')
)

In [119]:
activities_by_time

Unnamed: 0,contributor,category,datetime,n_activities
0,0crat,bot,2022-11-25 09:00:00,0
1,0crat,bot,2022-11-25 10:00:00,0
2,0crat,bot,2022-11-25 11:00:00,1
3,0crat,bot,2022-11-25 12:00:00,1
4,0crat,bot,2022-11-25 13:00:00,6
...,...,...,...,...
3290235,zorro-bot[bot],bot,2023-04-15 12:00:00,0
3290236,zorro-bot[bot],bot,2023-04-15 13:00:00,0
3290237,zorro-bot[bot],bot,2023-04-15 14:00:00,0
3290238,zorro-bot[bot],bot,2023-04-15 15:00:00,0


In [None]:
# Get the data of the top contributor 'codeclimate[bot]' to test time series forcasting method
temp = (
    activities_by_day[activities_by_day['contributor'] == 'codeclimate[bot]']
    .drop(['contributor'], axis=1)
    .reset_index(drop=True)
)

# Feature Engineering: create lag features based on the n previous dates
n_previous_dates = 13
for i in range(1, n_previous_dates+1):
    temp[f'n_activities_lag_{i}'] = temp['n_activities'].shift(i)

# Delete the n first rows to avoid NaN values
temp = temp.iloc[n_previous_dates:].reset_index(drop=True)

In [114]:
#  create lag features
def create_lag_features(group):
    n_previous_times = 168
    for i in range(1, n_previous_times + 1):
        group[f'n_activities_lag_{i}'] = group['n_activities'].shift(i)
    return group.iloc[n_previous_times:].reset_index(drop=True)

In [None]:
# apply the function to each group
laged_activities = activities_by_time.groupby(['contributor', 'category']).apply(create_lag_features).reset_index(drop=True)

In [125]:
laged_activities.head()

Unnamed: 0,contributor,category,datetime,n_activities,n_activities_lag_1,n_activities_lag_2,n_activities_lag_3,n_activities_lag_4,n_activities_lag_5,n_activities_lag_6,...,n_activities_lag_159,n_activities_lag_160,n_activities_lag_161,n_activities_lag_162,n_activities_lag_163,n_activities_lag_164,n_activities_lag_165,n_activities_lag_166,n_activities_lag_167,n_activities_lag_168
0,0crat,bot,2022-12-02 09:00:00,0,5.0,4.0,4.0,7.0,3.0,6.0,...,0.0,2.0,8.0,10.0,12.0,6.0,1.0,1.0,0.0,0.0
1,0crat,bot,2022-12-02 10:00:00,2,0.0,5.0,4.0,4.0,7.0,3.0,...,0.0,0.0,2.0,8.0,10.0,12.0,6.0,1.0,1.0,0.0
2,0crat,bot,2022-12-02 11:00:00,0,2.0,0.0,5.0,4.0,4.0,7.0,...,0.0,0.0,0.0,2.0,8.0,10.0,12.0,6.0,1.0,1.0
3,0crat,bot,2022-12-02 12:00:00,0,0.0,2.0,0.0,5.0,4.0,4.0,...,0.0,0.0,0.0,0.0,2.0,8.0,10.0,12.0,6.0,1.0
4,0crat,bot,2022-12-02 13:00:00,0,0.0,0.0,2.0,0.0,5.0,4.0,...,0.0,0.0,0.0,0.0,0.0,2.0,8.0,10.0,12.0,6.0


In [181]:
laged_activities[['contributor','category','datetime','n_activities','n_activities_lag_1','n_activities_lag_2','n_activities_lag_3']].sample(10)

Unnamed: 0,contributor,category,datetime,n_activities,n_activities_lag_1,n_activities_lag_2,n_activities_lag_3
1033086,683945,human,2023-01-29 23:00:00,0,0.0,0.0,0.0
2997936,taichi-gardener,bot,2023-03-30 17:00:00,0,1.0,0.0,0.0
713423,576539,human,2023-01-09 16:00:00,0,11.0,0.0,0.0
2038833,DrahtBot,bot,2023-01-24 02:00:00,1,0.0,1.0,0.0
2500288,hacs-bot[bot],bot,2023-02-10 17:00:00,0,0.0,0.0,0.0
1989748,997737,human,2022-12-24 21:00:00,0,0.0,0.0,0.0
2823669,rails-bot[bot],bot,2023-03-23 14:00:00,0,0.0,0.0,0.0
2788259,poggit-bot,bot,2023-03-25 20:00:00,9,0.0,0.0,0.0
2851049,request-info[bot],bot,2023-01-14 10:00:00,0,0.0,0.0,0.0
2745308,openmrs-bot,bot,2023-02-10 13:00:00,0,0.0,0.0,4.0


In [127]:
train_activities = laged_activities[laged_activities['datetime'] < '2023-04-01 00:00:00'].reset_index(drop=True)
test_activities = laged_activities[laged_activities['datetime'] >= '2023-04-01 00:00:00'].reset_index(drop=True)

In [148]:
train_activities

Unnamed: 0,contributor,category,datetime,n_activities,n_activities_lag_1,n_activities_lag_2,n_activities_lag_3,n_activities_lag_4,n_activities_lag_5,n_activities_lag_6,...,n_activities_lag_159,n_activities_lag_160,n_activities_lag_161,n_activities_lag_162,n_activities_lag_163,n_activities_lag_164,n_activities_lag_165,n_activities_lag_166,n_activities_lag_167,n_activities_lag_168
0,0crat,bot,2022-12-02 09:00:00,0,5.0,4.0,4.0,7.0,3.0,6.0,...,0.0,2.0,8.0,10.0,12.0,6.0,1.0,1.0,0.0,0.0
1,0crat,bot,2022-12-02 10:00:00,2,0.0,5.0,4.0,4.0,7.0,3.0,...,0.0,0.0,2.0,8.0,10.0,12.0,6.0,1.0,1.0,0.0
2,0crat,bot,2022-12-02 11:00:00,0,2.0,0.0,5.0,4.0,4.0,7.0,...,0.0,0.0,0.0,2.0,8.0,10.0,12.0,6.0,1.0,1.0
3,0crat,bot,2022-12-02 12:00:00,0,0.0,2.0,0.0,5.0,4.0,4.0,...,0.0,0.0,0.0,0.0,2.0,8.0,10.0,12.0,6.0,1.0
4,0crat,bot,2022-12-02 13:00:00,0,0.0,0.0,2.0,0.0,5.0,4.0,...,0.0,0.0,0.0,0.0,0.0,2.0,8.0,10.0,12.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2784865,zorro-bot[bot],bot,2023-03-31 19:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2784866,zorro-bot[bot],bot,2023-03-31 20:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2784867,zorro-bot[bot],bot,2023-03-31 21:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2784868,zorro-bot[bot],bot,2023-03-31 22:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [149]:
test_activities

Unnamed: 0,contributor,category,datetime,n_activities,n_activities_lag_1,n_activities_lag_2,n_activities_lag_3,n_activities_lag_4,n_activities_lag_5,n_activities_lag_6,...,n_activities_lag_159,n_activities_lag_160,n_activities_lag_161,n_activities_lag_162,n_activities_lag_163,n_activities_lag_164,n_activities_lag_165,n_activities_lag_166,n_activities_lag_167,n_activities_lag_168
0,0crat,bot,2023-04-01 00:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0crat,bot,2023-04-01 01:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0crat,bot,2023-04-01 02:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0crat,bot,2023-04-01 03:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0crat,bot,2023-04-01 04:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342405,zorro-bot[bot],bot,2023-04-15 12:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
342406,zorro-bot[bot],bot,2023-04-15 13:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
342407,zorro-bot[bot],bot,2023-04-15 14:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
342408,zorro-bot[bot],bot,2023-04-15 15:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [197]:
# Function to evaluate the model and return metrics for each contributor
def evaluate_metrics(group_data):
    train_data = group_data[group_data['datetime'] < '2023-04-01 00:00:00'].reset_index(drop=True)
    test_data = group_data[group_data['datetime'] >= '2023-04-01 00:00:00'].reset_index(drop=True)

    # Separate features and target variable
    X_train = train_data.drop(['contributor', 'category', 'datetime', 'n_activities'], axis=1)
    y_train = train_data['n_activities']

    X_test = test_data.drop(['contributor', 'category', 'datetime', 'n_activities'], axis=1)
    y_test = test_data['n_activities']

    # Train the time series forecasting model with multiple linear regression
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions on the test set
    predictions = model.predict(X_test)

    # Evaluate the model
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)

    # Create a DataFrame for evaluation metrics and sum of activities
    evaluation_metrics_df = pd.DataFrame({
        'contributor': group_data['contributor'].iloc[0],  # Use the first value since it's the same for the group
        'category': group_data['category'].iloc[0],  # Use the first value since it's the same for the group
        'r2': r2,
        'mae': mae,
        'mse': mse,
        'n_activities': test_data['n_activities'].sum()
    })

    return evaluation_metrics_df


In [None]:
# Apply the function to each group and concatenate the results
results_df = laged_activities.groupby(['contributor', 'category']).apply(evaluate_metrics).reset_index(drop=True)

# Save the combined DataFrame to a CSV file
#results_df.to_csv('reg_model_metrics.csv', index=False)

In [None]:
# Split the data into training and test sets (last 20% of the data)
train_size = int(len(temp) * 0.8)
train_data, test_data = temp[:train_size], temp[train_size:]

# Separate features and target variable for training
X_train = train_data.drop(['date', 'n_activities'], axis=1)
y_train = train_data['n_activities']

# Separate features and target variable for testing
X_test = test_data.drop(['date', 'n_activities'], axis=1)
y_test = test_data['n_activities']

# Train the time series forecasting model with multiple linear regression
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

In [None]:
results = []

# train the time series forecasting model and calculate metrics
def train_and_evaluate(group):
    train_size = int(len(group) * 0.8)
    train_data, test_data = group[:train_size], group[train_size:]

    # Separate features and target variable for training
    X_train = train_data.drop(['date', 'n_activities'], axis=1)
    y_train = train_data['n_activities']

    # Separate features and target variable for testing
    X_test = test_data.drop(['date', 'n_activities'], axis=1)
    y_test = test_data['n_activities']

    # Train the time series forecasting model with multiple linear regression
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions on the test set
    predictions = model.predict(X_test)

    # Calculate metrics
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)

    # Append results to the list
    results.append({
        'contributor': group['contributor'].iloc[0],  # Assuming 'contributor' is the contributor column
        'category': group['category'].iloc[0],  # Assuming 'category' is the category column
        'r2': r2,
        'mae': mae,
        'mse': mse,
        'n_activities': len(group),  # Number of activities in the group
    })

In [None]:

# Apply the function to each group
train_activities.groupby(['contributor', 'category']).apply(train_and_evaluate)

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

# Save the results to a CSV file
results_df.to_csv('time_series_metrics.csv', index=False)