# Install packages

In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hUsing cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.4/345.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m[31m7.1 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: tzd

# Read file

In [2]:
import pandas as pd
import re

# Define a function to parse each line of the log file
def parse_log_line(line):
    """
    Pull the bracketed timestamp and the trailing integer out of one log line.

    The timestamp is the text inside the first ``[...]`` pair; the byte count
    is the run of digits that ends the line. Returns ``(timestamp, bytes)``
    with the byte count converted to ``int``, or ``(None, None)`` when the
    line does not have both parts (for example when it does not end in digits).
    """
    found = re.search(r'\[(.*?)\].*?(\d+)$', line)
    if found is None:
        # No bracketed field, or the line does not end in digits
        return None, None
    stamp, size_text = found.groups()
    return stamp, int(size_text)

In [3]:
# Location of the raw log file, given relative to the current working directory.
# Only the path is set here; the file itself is opened and parsed in a later cell.
file_path = './datasets/calgary_HTTP'

In [4]:
# Read the log as ISO-8859-1, which is more permissive than UTF-8 when decoding,
# and keep only the lines that yield both a timestamp and a byte count.
data = []
with open(file_path, 'r', encoding='ISO-8859-1') as file:
    for line in file:
        ts, bytes_sent = parse_log_line(line)
        # Skip lines the parser could not match, or whose timestamp is empty
        if not ts or bytes_sent is None:
            continue
        data.append((ts, bytes_sent))

# Build the frame, then parse the timestamp strings (day/month-name/year:time offset)
# into timezone-aware datetimes normalised to UTC.
df = pd.DataFrame(data, columns=['Timestamp', 'Bytes']).assign(
    Timestamp=lambda frame: pd.to_datetime(
        frame['Timestamp'], format='%d/%b/%Y:%H:%M:%S %z', utc=True
    )
)

df.head()

Unnamed: 0,Timestamp,Bytes
0,1994-10-24 19:41:41+00:00,150
1,1994-10-24 19:41:41+00:00,1210
2,1994-10-24 19:43:13+00:00,3185
3,1994-10-24 19:43:14+00:00,2555
4,1994-10-24 19:43:15+00:00,36403


# Mean Absolute Error

In [9]:
def mean_absolute_error(actual, predicted):
    return sum(abs(a - p) for a, p in zip(actual, predicted)) / len(actual)

# Sanity check of mean_absolute_error on placeholder values: every prediction is
# off by exactly 10, so the expected result is 10.0.
actual = [100, 200, 300, 400, 500]  # Placeholder observations; replace with your actual data points
predicted = [110, 190, 310, 390, 510]  # Placeholder forecasts; replace with your predicted data points

mae = mean_absolute_error(actual, predicted)
print("Mean Absolute Error:", mae)


Mean Absolute Error: 10.0


# Check for Stationarity

References:

- https://www.analyticsvidhya.com/blog/2021/08/vector-autoregressive-model-in-python/
- https://gist.github.com/kevincdurand1/e8307dfb3e370ca15bdbb97300037c71

In [13]:
!pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.2 kB)
Collecting scipy!=1.9.2,>=1.8 (from statsmodels)
  Using cached scipy-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading statsmodels-0.14.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading patsy-0.5.6-py2.py3-none-any.whl (233 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.9/233.9 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached scipy-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.2 MB)
Installing collected packages: scipy, patsy,

In [5]:
from statsmodels.tsa.stattools import adfuller

In [6]:
def adf_test(series,title=''):
    """
    Run an Augmented Dickey-Fuller test on a series and print a report.

    Missing values are dropped before testing and the lag length is chosen
    with autolag='AIC'. The report lists the test statistic, p-value, lags
    used, number of observations and the critical values, followed by a
    verdict based on a 0.05 p-value threshold. Nothing is returned; the
    report is only printed. `title` is an optional label for the header line.
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    result = adfuller(series.dropna(),autolag='AIC') # dropna() copes with the NaNs that differencing introduces

    # The first four entries of the result are scalars; entry 4 maps level -> critical value
    stat_labels = ['ADF test statistic','p-value','# lags used','# observations']
    report = pd.Series(result[0:4],index=stat_labels)
    for level,critical in result[4].items():
        report[f'critical value ({level})']=critical
    print(report.to_string())          # to_string() leaves out the trailing "dtype: float64" line

    # A p-value at or below 0.05 rejects the unit-root null hypothesis
    if result[1] <= 0.05:
        verdict = (
            "Strong evidence against the null hypothesis",
            "Reject the null hypothesis",
            "Data has no unit root and is stationary",
        )
    else:
        verdict = (
            "Weak evidence against the null hypothesis",
            "Fail to reject the null hypothesis",
            "Data has a unit root and is non-stationary",
        )
    for sentence in verdict:
        print(sentence)

In [None]:
# Run the stationarity test on the 'Bytes' column (one value per parsed log line).
# NOTE(review): this cell has no execution count and the saved output stops after the
# header line, so the run may not have completed — confirm. If it is too slow on the
# full series, testing an aggregated (e.g. resampled) series may be an option.
adf_test(df['Bytes'])

Augmented Dickey-Fuller Test: 
