In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Seed NumPy's global random state so the np.random draws made later in this
# notebook produce the same numbers on every fresh Restart & Run All.
np.random.seed(26)

In [6]:
# Daily timestamps from 2019-01-01 through 2024-12-31: six calendar years
# (2019-2024), i.e. 2192 days with both endpoints included.
start_date, end_date = datetime(2019, 1, 1), datetime(2024, 12, 31)
date_series = pd.date_range(start_date, end_date, freq='D')

In [9]:
# Build the frame skeleton: one row per day, every metric starting at integer zero.
num_days = len(date_series)
zeroed_metrics = (
    'SUBSCRIBERS_GAINED',
    'SUBSCRIBERS_LOST',
    'VIEWS',
    'WATCH_HOURS',
    'LIKES',
    'SHARES',
    'COMMENTS',
)
data = {'DATE': date_series}
# Each column gets its own zero array; dict insertion order fixes the column order.
data.update({column: np.zeros(num_days, dtype=int) for column in zeroed_metrics})
df = pd.DataFrame(data)
df.head()

Unnamed: 0,DATE,SUBSCRIBERS_GAINED,SUBSCRIBERS_LOST,VIEWS,WATCH_HOURS,LIKES,SHARES,COMMENTS
0,2019-01-01,0,0,0,0,0,0,0
1,2019-01-02,0,0,0,0,0,0,0
2,2019-01-03,0,0,0,0,0,0,0
3,2019-01-04,0,0,0,0,0,0,0
4,2019-01-05,0,0,0,0,0,0,0


In [10]:
# Function to generate linear growth trend
def generate_linear_growth(start_value, end_value, num_days):
    """Return a straight-line ramp from ``start_value`` to ``end_value``.

    The result is a NumPy array holding ``num_days`` evenly spaced values;
    the first element is ``start_value`` and the last is ``end_value``.

    Example:
        generate_linear_growth(start_value=100, end_value=1000, num_days=10)
        => [100., 200., 300., 400., 500., 600., 700., 800., 900., 1000.]
    """
    trend = np.linspace(start=start_value, stop=end_value, num=num_days)
    return trend

In [11]:
# Simulate growth patterns for different metrics
# Each metric ramps linearly from its first-day level to its last-day level
# across the whole date range; the pairs below are (first day, last day).
growth_endpoints = [
    (1, 200),     # subscribers gained
    (0, 50),      # subscribers lost
    (10, 10000),  # views
    (1, 1000),    # watch hours
    (0, 500),     # likes
    (0, 100),     # shares
    (0, 50),      # comments
]
(
    subscribers_gained,
    subscribers_lost,
    views,
    watch_hours,
    likes,
    shares,
    comments,
) = [generate_linear_growth(first, last, num_days) for first, last in growth_endpoints]

In [12]:
# Apply randomness and ensure integer values
metrics = ['SUBSCRIBERS_GAINED', 'SUBSCRIBERS_LOST', 'VIEWS', 'WATCH_HOURS', 'LIKES', 'SHARES', 'COMMENTS']
growth_trends = [subscribers_gained, subscribers_lost, views, watch_hours, likes, shares, comments]
for column, trend in zip(metrics, growth_trends):
    # One multiplicative jitter per day, centred on 1 with a standard deviation of 0.1.
    jitter = np.random.normal(loc=1, scale=0.1, size=num_days)
    # Truncate to whole counts, then floor at zero so no metric goes negative.
    noisy_counts = (trend * jitter).astype(int)
    df[column] = np.maximum(0, noisy_counts)

In [14]:
# Apply weekend boost (increase views, watch hours, and likes on weekends)
weekend_boosted_columns = ['VIEWS', 'WATCH_HOURS', 'LIKES']
is_weekend = df['DATE'].dt.dayofweek >= 5  # Saturday (5) and Sunday (6)
# Scale the weekend rows by 1.5 and truncate back to whole numbers before writing them in.
boosted_weekend_values = df.loc[is_weekend, weekend_boosted_columns] * 1.5
df.loc[is_weekend, weekend_boosted_columns] = boosted_weekend_values.astype(int)

In [None]:
# Introduce seasonal variation (higher activity in Northern Hemisphere summer)
# `seasonal_boost` is a lookup table in [-1, 1] indexed by (day of year - 1).
# It is built from -cos so the curve bottoms out on Jan 1 and peaks mid-year
# (around the start of July). A plain sine starting on Jan 1 would instead peak
# around April 1 and dip around October 1, which is not a summer peak.
days_in_year = 366  # Account for leap years
seasonal_boost = -np.cos(np.linspace(0, 2 * np.pi, days_in_year))  # Simulates annual trends
day_of_year_index = df['DATE'].dt.dayofyear.to_numpy() - 1  # dayofyear is 1-based
# Scale VIEWS by up to +/-20% and truncate back to integers: multiplying by a
# float array would otherwise turn the integer VIEWS column into floats.
# NOTE: this rescales the current VIEWS values, so re-running the cell compounds the effect.
df['VIEWS'] = (df['VIEWS'] * (1 + 0.2 * seasonal_boost[day_of_year_index])).astype(int)