# YouTube Statistics Analysis & Prediction

This notebook performs exploratory data analysis (EDA), feature engineering, and machine-learning modeling on the Global YouTube Statistics dataset.

In [None]:
# --- Imports ---
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# --- Machine Learning ---
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score

# --- Settings ---
# Suppress all warnings: the filter is installed with no category or module
# restriction, so it applies to every library used below.
warnings.filterwarnings('ignore')
# Seaborn theme: "whitegrid" style with "viridis" as the default palette.
sns.set_theme(style="whitegrid", palette="viridis")
# NOTE(review): this style is applied after set_theme, so it presumably
# overrides overlapping settings such as the background colour - confirm
# which of the two looks is actually intended.
plt.style.use('dark_background')

In [None]:
def load_and_clean_data(filepath):
    """
    Read the CSV at ``filepath`` and return a cleaned DataFrame.

    The file is read as UTF-8 first, with a latin-1 retry if decoding fails.
    Cleaning then drops a fixed list of unused columns, fills missing values
    ("Unknown" for object columns, the column median for int64/float
    columns), casts the count columns to int64, keeps only rows with
    positive views created in 2005 or later, and orders the rows by
    subscriber count (descending) with a fresh index.

    Raises:
        FileNotFoundError: if ``filepath`` does not exist.
    """
    try:
        frame = pd.read_csv(filepath, encoding='utf-8')
    except UnicodeDecodeError:
        print("UTF-8 encoding failed, using latin-1.")
        frame = pd.read_csv(filepath, encoding='latin-1')
    except FileNotFoundError:
        raise FileNotFoundError(f"File named '{filepath}' not found. Please ensure it is in the same directory.")

    # Columns that are not used anywhere downstream.
    unwanted = (
        'rank', 'Abbreviation', 'country_rank', 'created_month',
        'created_date', 'Gross tertiary education enrollment (%)',
        'Unemployment rate', 'Urban_population', 'Latitude', 'Longitude',
    )
    present = [name for name in unwanted if name in frame.columns]
    frame = frame.drop(columns=present, errors='ignore')

    # Missing values: a placeholder label for text, the median for numbers.
    text_columns = frame.select_dtypes(include=['object']).columns
    for name in text_columns:
        frame[name] = frame[name].fillna("Unknown")

    numeric_columns = frame.select_dtypes(include=['int64', 'float']).columns
    for name in numeric_columns:
        column_median = frame[name].median()
        frame[name] = frame[name].fillna(column_median)

    # Count columns must be integers; unparseable entries become 0.
    for name in ('video views', 'uploads', 'subscribers'):
        if name not in frame.columns:
            continue
        as_number = pd.to_numeric(frame[name], errors='coerce')
        frame[name] = as_number.fillna(0).astype('int64')

    # Discard rows that cannot be real: no views, or a creation year
    # earlier than 2005.
    if {'video views', 'created_year'}.issubset(frame.columns):
        has_views = frame['video views'] > 0
        plausible_year = frame['created_year'] >= 2005
        frame = frame[has_views & plausible_year]

    # Largest channels first, with a clean 0..n-1 index.
    if 'subscribers' in frame.columns:
        frame = frame.sort_values(by='subscribers', ascending=False)
        frame = frame.reset_index(drop=True)

    print(f"Data cleaning complete. Shape: {frame.shape}")
    return frame


In [None]:
def engineer_features(df, current_year=None):
    """
    Return a copy of ``df`` with derived columns added.

    Each feature is only created when its source columns are present:

    * ``channel_age`` - ``current_year - created_year``, floored at 0.
    * ``views_per_subscriber`` - ``video views / subscribers``.
    * ``uploads_per_day`` - ``uploads / (channel_age * 365.25)``.
    * ``subscriber_tier`` - 'Mega' / 'Macro' / 'Mid' / 'Micro' / 'Nano'
      label derived from ``subscribers``.

    Both ratio features replace NaN and +/-inf (produced by zero
    denominators) with 0 so that no non-finite value reaches later steps.

    Args:
        df: Input DataFrame; it is not modified.
        current_year: Reference year used to compute ``channel_age``.
            Defaults to the current calendar year; pass an explicit value
            for reproducible results.

    Returns:
        A new DataFrame containing the original and the derived columns.
    """
    df = df.copy()
    if current_year is None:
        current_year = datetime.datetime.now().year

    if 'created_year' in df.columns:
        age = current_year - df['created_year']
        # Anything that is not a non-negative age (a creation year in the
        # future, or a missing year, which fails the comparison) becomes 0.
        df['channel_age'] = age.where(age >= 0, 0)

    if 'video views' in df.columns and 'subscribers' in df.columns:
        ratio = df['video views'] / df['subscribers']
        # Zero subscribers gives inf (or NaN for 0/0); neutralise both,
        # exactly as is done for uploads_per_day below.
        df['views_per_subscriber'] = ratio.replace([np.inf, -np.inf], 0).fillna(0)

    if 'uploads' in df.columns and 'channel_age' in df.columns:
        per_day = df['uploads'] / (df['channel_age'] * 365.25)
        # channel_age == 0 makes the denominator zero.
        df['uploads_per_day'] = per_day.fillna(0).replace([np.inf, -np.inf], 0)

    if 'subscribers' in df.columns:
        # (inclusive lower bound, label), checked from largest to smallest.
        tiers = ((1e8, 'Mega'), (1e7, 'Macro'), (1e6, 'Mid'), (1e5, 'Micro'))

        def get_tier(subs):
            for floor, label in tiers:
                if subs >= floor:
                    return label
            return 'Nano'

        df['subscriber_tier'] = df['subscribers'].apply(get_tier)

    print("Feature engineering complete.")
    return df

In [None]:
def remove_outliers_iqr(df, columns):
    """
    Drop rows lying outside the 1.5 * IQR fences of the given columns.

    The columns are processed in order on a copy of ``df``; each column's
    quartiles are computed on the rows that survived the previous columns.
    Names in ``columns`` that are not in the frame are skipped.
    """
    kept = df.copy()
    for name in columns:
        if name not in kept.columns:
            continue
        first_q, third_q = kept[name].quantile([0.25, 0.75])
        spread = third_q - first_q
        low_fence = first_q - 1.5 * spread
        high_fence = third_q + 1.5 * spread
        # between() is inclusive on both ends, matching >= / <=.
        inside = kept[name].between(low_fence, high_fence)
        kept = kept[inside]

    print(f"Rows after outlier removal: {kept.shape[0]} (Original: {df.shape[0]})")
    return kept

In [None]:
def build_preprocessor(categorical_cols, numerical_cols):
    """
    Build the ColumnTransformer used in front of every model.

    Numerical columns go through a StandardScaler; categorical columns go
    through a OneHotEncoder constructed with handle_unknown='ignore'.
    """
    numeric_step = ('num', StandardScaler(), numerical_cols)
    categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    return ColumnTransformer(transformers=[numeric_step, categorical_step])


In [None]:
def perform_eda(df):
    """
    Display the exploratory plots for ``df``.

    Draws a count plot of ``channel_type`` (when that column exists),
    ordered by frequency, followed by an annotated correlation heatmap of
    all numeric columns (when there are any).
    """
    # 1. Distribution of Channel Types
    if 'channel_type' in df.columns:
        plt.figure(figsize=(10, 6))
        by_frequency = df['channel_type'].value_counts().index
        sns.countplot(data=df, y='channel_type', order=by_frequency, palette='viridis')
        plt.title('Distribution of Channel Types')
        plt.show()

    # 2. Correlation Matrix
    numeric_part = df.select_dtypes(include=[np.number])
    if numeric_part.empty:
        return

    plt.figure(figsize=(12, 10))
    correlations = numeric_part.corr()
    sns.heatmap(correlations, annot=True, fmt='.2f', cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()


In [None]:
# --- Main Execution ---
# End-to-end run: load -> EDA -> feature engineering -> outlier removal ->
# train/test split -> Linear Regression and Random Forest, each reporting R2
# on the held-out 20 % of the rows.

# 1. Load Data
filename = 'Global YouTube Statistics.csv'
try:
    df = load_and_clean_data(filename)

    # 2. EDA
    perform_eda(df)

    # 3. Feature Engineering
    df = engineer_features(df)

    # 4. Outlier Removal
    numerical_cols_for_outliers = [
        'subscribers', 'video views', 'uploads', 'channel_age',
        'views_per_subscriber', 'uploads_per_day'
    ]
    df_ml = remove_outliers_iqr(df, numerical_cols_for_outliers)

    # 5. Prepare ML Data
    target = 'video views'
    if target in df_ml.columns:
        # 'views_per_subscriber' is computed from the target itself
        # (video views / subscribers). Leaving it in X next to 'subscribers'
        # lets a model reconstruct the target and inflates R2, so it is
        # removed from the feature set.
        # NOTE(review): any other CSV column derived from view counts would
        # leak in the same way - verify against the dataset schema.
        leaky_features = ['views_per_subscriber']
        non_features = ['Title', 'Youtuber', target] + leaky_features
        X = df_ml.drop(non_features, axis=1, errors='ignore')
        y = df_ml[target]

        # Identify columns again after drops
        categorical_features = X.select_dtypes(include=['object']).columns.tolist()
        numerical_features = X.select_dtypes(include=np.number).columns.tolist()

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Each pipeline gets its own preprocessor instance: a Pipeline fits
        # the step objects it is given, so sharing one instance would let the
        # second fit() overwrite the transformer state held by the first.
        preprocessor = build_preprocessor(categorical_features, numerical_features)
        rf_preprocessor = build_preprocessor(categorical_features, numerical_features)

        # 6. Train Linear Regression
        print("Training Linear Regression...")
        lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('regressor', LinearRegression())])
        lr_pipeline.fit(X_train, y_train)
        y_pred_lr = lr_pipeline.predict(X_test)
        print(f"Linear Regression R2: {r2_score(y_test, y_pred_lr):.4f}")

        # 7. Train Random Forest
        print("Training Random Forest...")
        rf_pipeline = Pipeline(steps=[('preprocessor', rf_preprocessor),
                                      ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])
        rf_pipeline.fit(X_train, y_train)
        y_pred_rf = rf_pipeline.predict(X_test)
        print(f"Random Forest R2: {r2_score(y_test, y_pred_rf):.4f}")
    else:
        print(f"Target column '{target}' not found.")

except Exception as e:
    # Notebook-level boundary: report the failure instead of a raw traceback.
    print(f"An error occurred during execution: {e}")