# **Generate hypothetical YouTube channel growth metrics**

Chanin Nantasenamat, Ph.D.

[*Data Professor YouTube channel*](https://youtube.com/dataprofessor)

In [18]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Set random seed for reproducibility
np.random.seed(42)

# Generate dates for 2 years
start_date = datetime(2019, 8, 19)
end_date = datetime(2024, 9, 15)
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Initialize data with zeros
n_days = len(date_range)
data = {
    'DATE': date_range,
    'SUBSCRIBERS_GAINED': np.zeros(n_days, dtype=int),
    'SUBSCRIBERS_LOST': np.zeros(n_days, dtype=int),
    'VIEWS': np.zeros(n_days, dtype=int),
    'WATCH_HOURS': np.zeros(n_days, dtype=int),
    'LIKES': np.zeros(n_days, dtype=int),
    'SHARES': np.zeros(n_days, dtype=int),
    'COMMENTS': np.zeros(n_days, dtype=int)
}

# Create DataFrame
df = pd.DataFrame(data)

# Function to generate growth
def generate_growth(start, end, days):
    return np.linspace(start, end, days)

# Generate growth patterns
subscribers_gained = generate_growth(1, 200, n_days)
subscribers_lost = generate_growth(0, 50, n_days)
views = generate_growth(10, 10000, n_days)
watch_hours = generate_growth(1, 1000, n_days)
likes = generate_growth(0, 500, n_days)
shares = generate_growth(0, 100, n_days)
comments = generate_growth(0, 50, n_days)

# Add randomness and ensure integer values
for i, col in enumerate(['SUBSCRIBERS_GAINED', 'SUBSCRIBERS_LOST', 'VIEWS', 'WATCH_HOURS', 'LIKES', 'SHARES', 'COMMENTS']):
    random_factor = np.random.normal(1, 0.1, n_days)  # Mean of 1, standard deviation of 0.1
    df[col] = np.maximum(0, (eval(col.lower()) * random_factor).astype(int))

# Weekend boost
weekend_mask = (df['DATE'].dt.dayofweek >= 5)
df.loc[weekend_mask, ['VIEWS', 'WATCH_HOURS', 'LIKES']] = df.loc[weekend_mask, ['VIEWS', 'WATCH_HOURS', 'LIKES']] * 1.5

# Seasonal variation (higher in summer)
days_in_year = 366  # Account for leap year
summer_boost = np.sin(np.linspace(0, 2*np.pi, days_in_year))
df['VIEWS'] = df['VIEWS'] * (1 + 0.2 * summer_boost[df['DATE'].dt.dayofyear - 1])

# Occasional viral videos (once every 2 months on average, starting from the second month)
viral_days = np.random.choice(range(30, n_days), size=11, replace=False)
df.loc[viral_days, ['VIEWS', 'LIKES', 'SHARES', 'COMMENTS']] = df.loc[viral_days, ['VIEWS', 'LIKES', 'SHARES', 'COMMENTS']] * 5

# Ensure integer values
for col in df.columns:
    if col != 'DATE':
        df[col] = df[col].astype(int)

# Calculate cumulative subscribers
df['TOTAL_SUBSCRIBERS'] = (df['SUBSCRIBERS_GAINED'] - df['SUBSCRIBERS_LOST']).cumsum()

# Ensure no negative values
df[df.select_dtypes(include=[np.number]).columns] = df.select_dtypes(include=[np.number]).clip(lower=0)

# Save to CSV
df.to_csv('youtube_channel_data.csv', index=False)

   259.5   318.    357.    475.5   414.    460.5   459.    481.5   475.5
   447.    567.    586.5   648.    694.5   669.    658.5   639.    793.5
   640.5   855.    747.    867.    936.   1006.5  1083.    918.    984.
   997.5  1186.5  1113.    975.   1168.5  1027.5  1242.   1224.   1429.5
  1125.   1485.   1396.5  1287.   1411.5  1570.5  1563.   1633.5  1599.
  1837.5  1539.   1674.   1813.5  1857.   1693.5  1693.5  1929.   1969.5
  1954.5  1720.5  1944.   2025.   2013.   2091.   1870.5  1885.5  2430.
  1696.5  2523.   2224.5  2197.5  2566.5  2910.   2697.   2094.   2325.
  1842.   2379.   2494.5  2584.5  2445.   2368.5  2670.   2743.5  2379.
  2824.5  2239.5  2598.   2659.5  2967.   2608.5  2820.   2907.   3099.
  2767.5  2701.5  2946.   2488.5  2905.5  2905.5  3195.   3316.5  3085.5
  3178.5  3502.5  2691.   2926.5  3660.   3166.5  3814.5  3165.   3291.
  3664.5  3144.   3964.5  3315.   3438.   3639.   3624.   3649.5  3549.
  3810.   3454.5  3000.   3525.   3156.   3679.5  3904.5  4

In [19]:
# Display DataFrame
df

Unnamed: 0,DATE,SUBSCRIBERS_GAINED,SUBSCRIBERS_LOST,VIEWS,WATCH_HOURS,LIKES,SHARES,COMMENTS,TOTAL_SUBSCRIBERS
0,2019-08-19,1,0,6,1,0,0,0,1
1,2019-08-20,1,0,11,1,0,0,0,2
2,2019-08-21,1,0,17,2,0,0,0,3
3,2019-08-22,1,0,16,2,0,0,0,4
4,2019-08-23,1,0,28,2,0,0,0,5
...,...,...,...,...,...,...,...,...,...
1850,2024-09-11,186,51,7250,821,573,108,43,140393
1851,2024-09-12,235,47,8068,1076,563,89,49,140581
1852,2024-09-13,175,56,7578,907,449,89,45,140700
1853,2024-09-14,218,52,13022,1495,826,92,53,140866
