# Load the Data

In [None]:
import pandas as pd
import zipfile
import os

# Function to load CSV from a ZIP file with multiple files
def load_csv_from_zip(zip_path, csv_filename):
    """Read a single CSV member of a ZIP archive into a DataFrame.

    Args:
        zip_path: Path to the ZIP archive on disk.
        csv_filename: Name of the CSV member inside the archive.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that member.
    """
    # Both the archive and the member handle are closed when the block exits
    with zipfile.ZipFile(zip_path, mode='r') as archive, archive.open(csv_filename) as member:
        frame = pd.read_csv(member)
    return frame

# Datasets live in a sibling 'Datasets' folder, one level above the working directory
datasets_path = os.path.join('..', 'Datasets')

# Each ZIP archive may hold several files, so the CSV member name is given explicitly
df_gb = load_csv_from_zip(os.path.join(datasets_path, 'GBvideos.csv.zip'), 'GBvideos.csv')
df_us = load_csv_from_zip(os.path.join(datasets_path, 'USvideos.csv.zip'), 'USvideos.csv')

# Tag every row with its source region before the frames are combined
df_gb['location'] = 'Great Britain'
df_us['location'] = 'USA'

# Stack the two regional frames into one; ignore_index=True renumbers rows 0..n-1
merged_df = pd.concat([df_gb, df_us], ignore_index=True)

# Check the first few rows of the merged DataFrame
print(merged_df.head())

# Check Missing Values

In [None]:
# Count missing values per column in the merged DataFrame
print("Missing values")
print(merged_df.isnull().sum())

In [None]:
# Drop every row that contains at least one missing value; the result is bound to `df`.
# NOTE(review): the cells visible below keep operating on `merged_df`, not `df`,
# so the dropped rows are still present downstream — confirm which frame is intended.
df = merged_df.dropna()

In [None]:
# Re-count missing values on `df`, the frame returned by dropna()
print("Missing values")
print(df.isnull().sum())

# Drop Unnecessary Columns

In [None]:
# Remove five columns from merged_df in place.
# Because of inplace=True, running this cell a second time raises KeyError
# (the columns are already gone).
merged_df.drop(columns=['thumbnail_link', 'video_id','comments_disabled','ratings_disabled','video_error_or_removed'], inplace=True)

print(merged_df.head())

# Text Preprocessing 

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

# Get the list of default English stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stopwords and clean text
def clean_text(text):
    """Normalize a piece of text and strip English stopwords.

    The text is lowercased, every character that is not ``a-z`` or
    whitespace is removed, and words found in the module-level
    ``stop_words`` set are dropped.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or the float
            NaN pandas uses for missing cells) are treated as empty text.

    Returns:
        The remaining words joined by single spaces; ``''`` when nothing
        is left or when ``text`` is not a string.
    """
    # Series.apply passes missing cells through as NaN (a float), which has
    # no .lower(); return an empty string instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    lowered = text.lower()

    # Retain only lowercase letters and whitespace
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    kept_words = [word for word in letters_only.split() if word not in stop_words]

    return ' '.join(kept_words)
    
# Apply the clean_text function to the 'title' column of merged_df;
# the cleaned titles are stored in a new 'new_text' column
merged_df['new_text'] = merged_df['title'].apply(clean_text)

# Display the DataFrame including the new 'new_text' column
print(merged_df)

In [None]:
# Show the dtype of each column in merged_df
print(merged_df.dtypes)

# Split the Dataset into Train and Test by 80/20

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target ('views') from the remaining columns
X = merged_df.drop(columns=['views'])
y = merged_df['views']

# Hold out 20% of the rows for testing; random_state makes the shuffle reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-attach the target so feature engineering can work on a single frame per split.
# Explicit copies are required here: pd.DataFrame(X_train) is not guaranteed to be
# independent of X_train (older pandas versions reuse the same underlying data),
# in which case adding 'views' would leak the target into the feature frame.
train = X_train.copy()
train['views'] = y_train.values

test = X_test.copy()
test['views'] = y_test.values

# (New) Feature Engineering

* Remove irrelevant features (Time-Based features, `Days Since Published`)
* Create `Basic Engagement Ratio Analysis`
* Create `Time Based Metrics Analysis`

### Basic Engagement Ratio Analysis

In [None]:
import pandas as pd

# Convert trending_date and publish_time to timezone-naive datetimes
def prepare_datetime_columns(train):
    """Parse the two date columns of ``train`` into timezone-naive datetimes.

    ``trending_date`` is parsed with the explicit ``%y.%d.%m`` format and
    ``publish_time`` with pandas' default parsing. Values that cannot be
    parsed become ``NaT`` (``errors='coerce'``).

    Args:
        train: DataFrame with ``trending_date`` and ``publish_time`` columns.
            It is modified in place.

    Returns:
        The same DataFrame object, with both columns replaced.
    """
    parsed_trending = pd.to_datetime(train['trending_date'], format='%y.%d.%m', errors='coerce')
    train['trending_date'] = parsed_trending

    parsed_publish = pd.to_datetime(train['publish_time'], errors='coerce')
    train['publish_time'] = parsed_publish

    # Strip any timezone so the two columns can be subtracted from each other
    for column in ('trending_date', 'publish_time'):
        train[column] = train[column].dt.tz_localize(None)

    return train


# Basic Engagement Ratios
def create_engagement_ratios(train):
    """Add ratio, z-score, per-category and percentile engagement columns.

    Reads ``likes``, ``dislikes``, ``comment_count``, ``views`` and
    ``category_id`` and appends twelve derived columns to ``train``.

    Args:
        train: DataFrame holding the columns listed above. It is modified
            in place.

    Returns:
        The same DataFrame object with the new columns appended.
    """
    epsilon = 1e-10  # added to every denominator so it is never exactly zero

    def standardize(column):
        # z-score: centre on the mean, scale by the standard deviation
        return (column - column.mean()) / column.std()

    # Denominators and the combined interaction count are reused below
    safe_views = train['views'] + epsilon
    safe_likes = train['likes'] + epsilon
    safe_dislikes = train['dislikes'] + epsilon
    interactions = train['likes'] + train['dislikes'] + train['comment_count']

    # Pairwise engagement ratios
    train['like_view_ratio'] = train['likes'] / safe_views
    train['comment_like_ratio'] = train['comment_count'] / safe_likes
    train['dislike_view_ratio'] = train['dislikes'] / safe_views
    train['comment_view_ratio'] = train['comment_count'] / safe_views
    train['total_engagement_ratio'] = interactions / safe_views
    train['like_dislike_ratio'] = train['likes'] / safe_dislikes

    # Standardized raw counts
    train['normalized_likes'] = standardize(train['likes'])
    train['normalized_views'] = standardize(train['views'])

    # Mean like/view ratio of each category, and each row relative to it
    per_category = train.groupby('category_id')['like_view_ratio']
    train['category_like_view_ratio'] = per_category.transform('mean')
    category_baseline = train['category_like_view_ratio'] + epsilon
    train['relative_category_engagement'] = train['like_view_ratio'] / category_baseline

    # Percentile rank (0-1] of each row within the frame
    train['like_view_percentile'] = train['like_view_ratio'].rank(pct=True)
    train['comment_like_percentile'] = train['comment_like_ratio'].rank(pct=True)

    return train

# Create engagement level categories
def create_engagement_categories(train):
    """Bucket engagement measures into five labelled quantile bins.

    Requires ``like_view_ratio``, ``comment_like_ratio``,
    ``like_view_percentile`` and ``comment_like_percentile`` to exist.
    Adds ``like_view_category``, ``comment_like_category``,
    ``engagement_score`` (mean of the two percentiles) and
    ``engagement_category``.

    Args:
        train: DataFrame with the columns listed above. Modified in place.

    Returns:
        The same DataFrame object with the four columns appended.
    """
    level_names = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

    def to_quintile(values):
        # Five equal-frequency bins, labelled from lowest to highest
        return pd.qcut(values, q=5, labels=level_names)

    train['like_view_category'] = to_quintile(train['like_view_ratio'])
    train['comment_like_category'] = to_quintile(train['comment_like_ratio'])

    # Combined score: average of the two percentile ranks
    percentile_sum = train['like_view_percentile'] + train['comment_like_percentile']
    train['engagement_score'] = percentile_sum / 2
    train['engagement_category'] = to_quintile(train['engagement_score'])

    return train

# Time-based engagement metrics
def create_time_based_engagement(train):
    """Add ``hours_to_trend``: hours between publication and trending.

    Computed as ``trending_date - publish_time`` expressed in hours, so the
    value is negative whenever ``publish_time`` is later than
    ``trending_date``.

    Args:
        train: DataFrame whose ``trending_date`` and ``publish_time``
            columns are datetimes. Modified in place.

    Returns:
        The same DataFrame object with ``hours_to_trend`` appended.
    """
    elapsed = train['trending_date'] - train['publish_time']
    train['hours_to_trend'] = elapsed.dt.total_seconds() / 3600

    return train


# Putting it all together
def create_all_engagement_features(train):
    """Run every engagement feature step on ``train``, in order.

    The steps are: datetime preparation, engagement ratios, engagement
    categories, then time-based metrics. Each step receives the frame
    returned by the previous one.

    Args:
        train: DataFrame to enrich.

    Returns:
        The DataFrame returned by the final step.
    """
    pipeline = (
        prepare_datetime_columns,
        create_engagement_ratios,
        create_engagement_categories,
        create_time_based_engagement,
    )
    for step in pipeline:
        train = step(train)
    return train

In [None]:
# Run the full engagement feature pipeline on the training frame and display it
train = create_all_engagement_features(train)
print(train)

### Time-Based Metrics Analysis

In [None]:
# Day of week of the publish timestamp (pandas convention: Monday=0 ... Sunday=6)
train['publish_weekday'] = train['publish_time'].dt.dayofweek
# 1 when the video was published on a Saturday (5) or Sunday (6), otherwise 0
train['is_weekend'] = train['publish_weekday'].isin([5,6]).astype(int)

print(train)

## TF-IDF Feature 

#### `description` Column

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
 
# Build TF-IDF features from the 'description' column and record, for every
# row, its five highest-scoring terms.
if 'description' in train.columns:
    # Missing descriptions become empty strings so the vectorizer accepts them
    text_data = train['description'].fillna('')

    # Vocabulary is capped at 100 terms; English stop words are ignored
    tfidf_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')

    # Fit and transform the text data to generate the (sparse) TF-IDF matrix
    tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)

    # Densify once and reuse the result; converting the sparse matrix twice
    # doubles the peak memory for no benefit.
    tfidf_dense = tfidf_matrix.toarray()
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Dense view for easier manipulation; it carries train's index so it can
    # be joined back to train row by row.
    tfidf_df = pd.DataFrame(tfidf_dense, columns=feature_names, index=train.index)

    def get_top_tfidf_features(row, features, top_n=5):
        """Return the ``top_n`` (term, score) pairs of ``row``, best first.

        Terms with a score of 0 are included when the row has fewer than
        ``top_n`` non-zero entries.
        """
        top_indices = np.argsort(row)[::-1][:top_n]
        return [(features[i], row[i]) for i in top_indices]

    top_tfidf_features = [
        get_top_tfidf_features(row, feature_names, top_n=5)
        for row in tfidf_dense
    ]

    # Assigned positionally: one list of pairs per row of train
    train['top_tfidf_features'] = top_tfidf_features

    # Display the first 5 rows of the DataFrame including the top TF-IDF features
    print(train.head(5))
else:
    print("The DataFrame does not contain a 'description' column.")

#### `tags` Column

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
 
# Build TF-IDF features from the 'tags' column and record, for every row,
# its five highest-scoring terms.
if 'tags' in train.columns:
    # Missing tags become empty strings so the vectorizer accepts them
    text_data = train['tags'].fillna('')

    # Vocabulary is capped at 100 terms; English stop words are ignored
    tfidf_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')

    # Fit and transform the text data to generate the (sparse) TF-IDF matrix
    tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)

    # Densify once and reuse the result; converting the sparse matrix twice
    # doubles the peak memory for no benefit.
    tfidf_dense = tfidf_matrix.toarray()
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Dense view for easier manipulation; it carries train's index so it can
    # be joined back to train row by row.
    tfidf_df = pd.DataFrame(tfidf_dense, columns=feature_names, index=train.index)

    def get_top_tfidf_features(row, features, top_n=5):
        """Return the ``top_n`` (term, score) pairs of ``row``, best first.

        Terms with a score of 0 are included when the row has fewer than
        ``top_n`` non-zero entries.
        """
        top_indices = np.argsort(row)[::-1][:top_n]
        return [(features[i], row[i]) for i in top_indices]

    top_tfidf_features = [
        get_top_tfidf_features(row, feature_names, top_n=5)
        for row in tfidf_dense
    ]

    # Stored under its own column name: writing to 'top_tfidf_features' would
    # overwrite the values computed from the 'description' column.
    train['top_tfidf_features_tags'] = top_tfidf_features

    # Display the first 5 rows of the DataFrame including the top TF-IDF features
    print(train.head(5))
else:
    print("The DataFrame does not contain a 'tags' column.")

# Dimension Reduction-PCA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Columns removed explicitly before the numeric-dtype filter is applied
non_numeric_cols = ['publish_time', 'title', 'channel_title', 'tags', 'description', 'location', 'trending_date']
# Drop the listed columns and the target, then keep float/int columns only
X_train_model = train.drop(columns=non_numeric_cols + ['views']).select_dtypes(include=[float, int])
X_test_model = test.drop(columns=non_numeric_cols + ['views']).select_dtypes(include=[float, int])

# Give the test matrix exactly the training columns; any column absent from test is filled with 0.
# NOTE(review): in the cells visible above, the engineered features are added to `train`
# only, so every engineered column is zero-filled here for the test set — confirm whether
# `test` should go through the same feature pipeline first.
# NOTE(review): several training features (e.g. like_view_ratio, normalized_views) are
# computed from 'views', which is the target — confirm this is intended.
X_test_model = X_test_model.reindex(columns=X_train_model.columns, fill_value=0)

# Standardize features; the scaler is fitted on the training matrix only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_model)
X_test_scaled = scaler.transform(X_test_model)

# Report missing values per column (these are the unscaled matrices)
print("Missing values in X_train_model:\n", X_train_model.isna().sum())
print("Missing values in X_test_model:\n", X_test_model.isna().sum())


In [None]:
import matplotlib.pyplot as plt
# Apply PCA, keeping as many components as needed to explain 95% of the variance.
# The projection is fitted on the training matrix and reused for the test matrix.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Scatter plot of the first two components, coloured by the target (views).
# Indexing column 1 requires PCA to have kept at least two components.
plt.figure(figsize=(10, 6))
plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=y_train, cmap='viridis', edgecolor='k', s=50)
plt.colorbar(label='Views')
plt.title('PCA of YouTube Data (Train Set)', weight='bold')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid(True)
plt.show()

# Share of total variance explained by each retained component
explained_variance = pca.explained_variance_ratio_
print("Explained Variance per component:")
for i, variance in enumerate(explained_variance, start=1):
     print(f"PC{i}: {variance:.2%}")


# Model Refitting

## Winning Model-XGBoost

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

In [None]:
# Define a function for calculating model metrics
def calculate_metrics(model, X_train, y_train, X_test, y_test):
    """Score a fitted model on the training and test sets.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets.

    Returns:
        dict with the keys ``"Train RMSE"``, ``"Test RMSE"``,
        ``"Train R^2"`` and ``"Test R^2"``.
    """
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    # RMSE is the square root of the mean squared error
    train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
    test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    train_r2 = r2_score(y_train, train_preds)
    test_r2 = r2_score(y_test, test_preds)

    # The test-set values are named test_*; the previous val_* names were
    # never defined and raised NameError.
    return {
        "Train RMSE": train_rmse, "Test RMSE": test_rmse,
        "Train R^2": train_r2, "Test R^2": test_r2,
    }

In [None]:
# Define a function to train the model with specific hyperparameters
def train_xgboost(X_train, y_train, X_test, y_test, params):
    """Fit an XGBoost regressor and evaluate it on both splits.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets.
        params: Keyword arguments for ``xgb.XGBRegressor``. ``random_state``
            defaults to 42 and may be overridden by including it here.

    Returns:
        Tuple ``(model, metrics)`` where ``metrics`` is the dict returned by
        ``calculate_metrics``.
    """
    # Merge instead of passing random_state alongside **params: the latter
    # raises TypeError when params already contains 'random_state'.
    model_kwargs = {"random_state": 42, **params}
    model = xgb.XGBRegressor(**model_kwargs)
    model.fit(X_train, y_train)

    metrics = calculate_metrics(model, X_train, y_train, X_test, y_test)
    return model, metrics


In [None]:
# Hyperparameter sets to train; each dict is passed to train_xgboost as `params`
variations = [
    {"learning_rate": 0.05, "n_estimators": 200, "max_depth": 6}  # Winning variation
]

In [None]:
# Empty results table with one column per reported metric.
# NOTE: `results` is assigned again in the training cell further down.
results = pd.DataFrame(columns=["Variation", "Train RMSE", "Test RMSE", "Train R^2", "Test R^2"])


In [None]:
import warnings

# Remove free-text columns if they exist. drop() returns new frames, so the
# objects produced by train_test_split are never mutated in place.
text_columns = ['title', 'channel_title', 'tags', 'description', 'location', 'new_text']
X_train = X_train.drop(columns=[col for col in text_columns if col in X_train.columns])
X_test = X_test.drop(columns=[col for col in text_columns if col in X_test.columns])

# One-hot encode the engagement category columns that are present.
# 'like_view_category' is spelled with underscores, matching the column name
# created during feature engineering.
categorical_columns = ['engagement_category', 'comment_like_category', 'like_view_category']
X_train = pd.get_dummies(X_train, columns=[col for col in categorical_columns if col in X_train.columns], drop_first=True)
X_test = pd.get_dummies(X_test, columns=[col for col in categorical_columns if col in X_test.columns], drop_first=True)

# Drop 'trending_date' and 'publish_time' if they exist
for date_col in ['trending_date', 'publish_time']:
    if date_col in X_train.columns:
        X_train = X_train.drop(columns=[date_col])
    if date_col in X_test.columns:
        X_test = X_test.drop(columns=[date_col])

# Features the model is expected to receive
required_features = [
    'category_id', 'likes', 'dislikes', 'comment_count', 'like_view_ratio', 'comment_like_ratio',
    'dislike_view_ratio', 'comment_view_ratio', 'total_engagement_ratio', 'like_dislike_ratio',
    'normalized_likes', 'normalized_views', 'category_like_view_ratio', 'relative_category_engagement',
    'like_view_percentile', 'comment_like_percentile', 'engagement_score', 'hours_to_trend',
    'publish_weekday', 'is_weekend'
]

# Any required feature that is absent is added as a constant 0 column. Such a
# column carries no information, so make the situation visible instead of
# filling silently.
missing_in_train = [feature for feature in required_features if feature not in X_train.columns]
missing_in_test = [feature for feature in required_features if feature not in X_test.columns]
if missing_in_train or missing_in_test:
    warnings.warn(
        "Zero-filling required features that are missing - "
        f"X_train: {missing_in_train}; X_test: {missing_in_test}. "
        "These columns will be constant; run the feature engineering on "
        "these frames first if real values are expected."
    )
for feature in missing_in_train:
    X_train[feature] = 0
for feature in missing_in_test:
    X_test[feature] = 0

# Ensure X_test has the same columns, in the same order, as X_train
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Verify that X_train and X_test now have the same columns
print("X_train columns:", X_train.columns)
print("X_test columns:", X_test.columns)

In [None]:
def calculate_metrics(model, X_train, y_train, X_test, y_test):
    """Compute RMSE and R^2 of ``model`` on the training and test sets.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets.

    Returns:
        dict with the keys ``"Train RMSE"``, ``"Test RMSE"``,
        ``"Train R^2"`` and ``"Test R^2"``, in that order.
    """
    fitted = model.predict(X_train)
    held_out = model.predict(X_test)

    def rmse(actual, predicted):
        # Root of the mean squared error
        return np.sqrt(mean_squared_error(actual, predicted))

    report = {}
    report["Train RMSE"] = rmse(y_train, fitted)
    report["Test RMSE"] = rmse(y_test, held_out)
    report["Train R^2"] = r2_score(y_train, fitted)
    report["Test R^2"] = r2_score(y_test, held_out)
    return report

In [None]:
# Train one model per hyperparameter variation and collect its metrics.
# Rows are gathered in a list and turned into a DataFrame once, instead of
# concatenating onto a growing (initially empty) DataFrame on every iteration.
result_rows = []
for i, params in enumerate(variations):
    model, metrics = train_xgboost(X_train, y_train, X_test, y_test, params)

    # One row per variation: its label followed by the metric values
    result_rows.append({"Variation": f"Variation {i + 1}", **metrics})

results = pd.DataFrame(result_rows)

In [None]:
# Display the metrics table collected for the trained variation(s)
print("XGBoost Model with Previous Winning Variation:")
print(results)