In this notebook, I present some time series techniques applied to the activity sequences of human and bot users, in order to predict their number of activities and thereby use the GitHub API more efficiently.

# Package imports & visualisation

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score

In [None]:
import warnings

# Silence FutureWarning for the rest of the session
warnings.simplefilter("ignore", FutureWarning)
# Show every column when a DataFrame is rendered (no column truncation)
pd.options.display.max_columns = None

In [None]:
# Load the activities table; the cells below read its 'contributor', 'date' and 'activity' columns
activities = pd.read_parquet('../data-raw/activities.parquet')

In [None]:
# Truncate each activity timestamp to its calendar day
_activities_with_day = activities.assign(date=pd.to_datetime(activities['date']).dt.date)

# One row per (contributor, day) holding the number of activities recorded that day.
# Days on which a contributor has no activity produce no row at all.
_daily_counts = _activities_with_day.groupby(['contributor', 'date'])['activity'].count()
activities_by_day = _daily_counts.reset_index(name='n_activities')
activities_by_day

In [None]:
# The three contributors with the highest median number of activities per day
top_contributors = (
    activities_by_day.groupby('contributor')['n_activities'].median().nlargest(3).index
)
is_top_contributor = activities_by_day['contributor'].isin(top_contributors)
temp = activities_by_day[is_top_contributor]

# Draw one line per selected contributor
fig, ax = plt.subplots(figsize=(18, 8))
sns.lineplot(data=temp, x='date', y='n_activities', hue='contributor', ax=ax)
ax.set_title('Time Series Plot of Activity Count for Top 3 Contributors')
ax.set_xlabel('Date')
ax.set_ylabel('# Activities')
plt.show()


# Multiple linear regression model

In [None]:
# Daily series of 'codeclimate[bot]', used to try out the time series forecasting method
is_codeclimate = activities_by_day['contributor'] == 'codeclimate[bot]'
temp = (
    activities_by_day.loc[is_codeclimate]
    .drop(columns='contributor')
    .reset_index(drop=True)
)

# Feature engineering: column n_activities_lag_k holds the value observed k rows earlier
n_previous_dates = 13
lag_features = {
    f'n_activities_lag_{lag}': temp['n_activities'].shift(lag)
    for lag in range(1, n_previous_dates + 1)
}
temp = temp.assign(**lag_features)

# The first n rows lack a complete lag history (NaN), so discard them
temp = temp.iloc[n_previous_dates:].reset_index(drop=True)

In [None]:
# Chronological hold-out: the first 80% of the rows train the model, the last 20% test it
train_size = int(0.8 * len(temp))
train_data = temp.iloc[:train_size]
test_data = temp.iloc[train_size:]

# Every column except the date and the target itself is a (lag) feature
target_column = 'n_activities'
non_feature_columns = ['date', target_column]

X_train, y_train = train_data.drop(columns=non_feature_columns), train_data[target_column]
X_test, y_test = test_data.drop(columns=non_feature_columns), test_data[target_column]

# Fit the multiple linear regression on the lag features, then forecast the test period
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
# Score the forecast on the test set: R2 and explained variance
r2 = r2_score(y_test, predictions)
evs = explained_variance_score(y_test, predictions)
print(f'R2 Score: {r2}')
print(f'Explained variance score: {evs}')

In [None]:
# Search for the number of lag features (1..99) that gives the best linear-regression
# forecast for 'codeclimate[bot]', tracking the best R2 and the best explained variance.
# NOTE: the lag count is chosen on the same test split it is scored on, so the best
# scores reported here are optimistic.
# Only scores strictly above 0 are recorded.
max_r2 = 0
max_evs = 0
# Lag counts that produced max_r2 / max_evs; they stay None if no lag count beats 0
i_r2 = None
i_evs = None

# The contributor's daily series does not depend on the lag count: build it once
series = (
    activities_by_day[activities_by_day['contributor'] == 'codeclimate[bot]']
    .drop(['contributor'], axis=1)
    .reset_index(drop=True)
)

for n_previous_dates in range(1, 100):
    # Rows left once the first n (incomplete lag history) are dropped, and their 80/20 split
    n_rows = len(series) - n_previous_dates
    n_train = int(n_rows * 0.8)
    if n_train < 1 or n_rows - n_train < 2:
        # Larger lag counts leave even fewer rows, so there is nothing more to evaluate
        print(f'Stopping at n = {n_previous_dates}: not enough rows left to train and test')
        break

    # Feature engineering: column n_activities_lag_k holds the value observed k rows earlier.
    # All lag columns are attached in one concat instead of being inserted one by one.
    lag_features = [
        series['n_activities'].shift(lag).rename(f'n_activities_lag_{lag}')
        for lag in range(1, n_previous_dates + 1)
    ]
    temp = (
        pd.concat([series, *lag_features], axis=1)
        # Delete the n first rows to avoid NaN values
        .iloc[n_previous_dates:]
        .reset_index(drop=True)
    )

    # Split the data into training and test sets (last 20% of the data)
    train_size = int(len(temp) * 0.8)
    train_data, test_data = temp[:train_size], temp[train_size:]

    # Separate features and target variable for training
    X_train = train_data.drop(['date', 'n_activities'], axis=1)
    y_train = train_data['n_activities']

    # Separate features and target variable for testing
    X_test = test_data.drop(['date', 'n_activities'], axis=1)
    y_test = test_data['n_activities']

    # Train the time series forecasting model with multiple linear regression
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions on the test set
    predictions = model.predict(X_test)

    # Compute each score once per lag count
    r2 = r2_score(y_test, predictions)
    evs = explained_variance_score(y_test, predictions)

    if max_r2 < r2:
        max_r2 = r2
        i_r2 = n_previous_dates
        print(f'[R2] New score found with n = {n_previous_dates}, R2 Score: {r2}, Explained variance score: {evs}')

    if max_evs < evs:
        max_evs = evs
        i_evs = n_previous_dates
        print(f'[EVS] New score found with n = {n_previous_dates}, R2 Score: {r2}, Explained variance score: {evs}')

# NOTE: temp, test_data and predictions now belong to the last lag count evaluated,
# which is not necessarily the best-scoring one (see i_r2 / i_evs).

In [None]:
# Best R2 and best explained variance score recorded by the lag-count search
max_r2, max_evs

In [None]:
# Plot the observed series against the predictions made for the test period.
# NOTE: temp, test_data and predictions are whatever the most recently run cell left behind;
# right after the lag-count search that is its final iteration, not necessarily the best lag count.
fig, ax = plt.subplots(figsize=(18, 10))
sns.lineplot(data=temp, x='date', y='n_activities', label='Real Values', ax=ax)
sns.lineplot(x=test_data['date'], y=predictions, label='Predicted Values', ax=ax)
ax.set_title('Time Series Forecasting - Real vs Predicted Values')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Activities')
plt.xticks(rotation=45)
ax.legend()
plt.show()