In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Instagram Like Count Prediction CS 412 Project

## Problem Overview
The task involves predicting the number of likes an Instagram post will receive based on various features from both the post itself and the user's profile. This is structured as a regression problem where we predict a continuous numerical value (like count) for each post.

## Data Processing Pipeline

### 1. Feature Engineering
We engineered several features from the raw data:

#### Post-Level Features:
- Comment count (log-transformed)
- Caption length
- Emoji count in caption
- Hashtag count in caption
- Temporal features (year, month, day, hour) from timestamp

#### Profile-Level Features:
- Follower count (log-transformed)
- Following count (log-transformed)
- Highlight reel count
- Profile completeness metrics

#### Engagement Metrics:
- Engagement rate: (likes + comments) / followers
- Comment-to-follower ratio
- Following-to-follower ratio

### 2. Data Preprocessing
- Handled missing values with appropriate defaults
- Applied log transformation to handle skewed distributions in metrics like follower count and comment count
- Scaled numerical features using StandardScaler
- Extracted and processed text features from captions and bio
- Cleaned and normalized text data

### 3. Model Development

#### Model Selection
We experimented with two powerful tree-based models:
1. Random Forest Regressor
   - Handles non-linear relationships
   - Good with both numerical and categorical features
   - Less prone to overfitting

2. XGBoost Regressor
   - Gradient boosting implementation
   - Known for high performance in various tasks
   - Efficient handling of sparse data

#### Training Approach
- Split data into training (80%) and validation (20%) sets
- Trained both models with default parameters
- Evaluated using Mean Squared Error (MSE) on validation set
- Selected the best performing model for final predictions

### 4. Prediction Pipeline
1. Load and preprocess test data using the same pipeline as training
2. Apply feature engineering steps
3. Scale features using the fitted scaler
4. Generate predictions using the best model
5. Transform predictions back to original scale
6. Round predictions to integers
7. Save results in required format


## Potential Improvements
1. Feature engineering:
   - Add more sophisticated text analysis
   - Include image-based features if available
   - Consider user engagement history

2. Model enhancements:
   - Fine-tune hyperparameters with more number of trials
   - Experiment with ensemble methods
   - Try deep learning approaches

3. Data processing:
   - Implement more sophisticated handling of outliers
   - Add feature selection methods
   - Consider time-based cross-validation

## Conclusion
Our approach focused on creating robust features and using reliable tree-based models for prediction. The pipeline is designed to be efficient and maintainable, with clear separation of concerns between data processing, model training, and prediction generation.

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [26]:
import json
import pandas as pd
import numpy as np
from datetime import datetime
import emoji
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import optuna

# Utility functions
def extract_emojis_and_tags(text):
    if not text:
        return "", "", []
    emojis = ''.join(char for char in text if char in emoji.EMOJI_DATA)
    hashtags = re.findall(r"#\w+", text)
    clean_text = re.sub(r"#\w+", "", text)
    clean_text = ''.join(char for char in clean_text if char not in emoji.EMOJI_DATA)
    return clean_text.strip(), emojis, hashtags

def process_timestamp(timestamp):
    try:
        dt = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
        return dt.year, dt.month, dt.day, dt.hour
    except (ValueError, TypeError):
        return None, None, None, None

def to_log_scale(value):
    try:
        value = float(value if value is not None else 0)
        return np.log10(1 + value) if value > 0 else 0
    except (ValueError, TypeError):
        return 0

def inverse_log_transform(y_pred):
    return (10 ** y_pred) - 1

def safe_get(dictionary, key, default=0):
    """Safely get a value from a dictionary, handling None values"""
    value = dictionary.get(key, default)
    return value if value is not None else default

class InstagramEngagementPredictor:
    def __init__(self):
        self.scaler = StandardScaler()
        self.best_model = None

    def extract_features(self, record, is_training=True):
        features = []
        labels = []
        post_ids = []

        posts = record.get('posts', []) if is_training else [record]
        profile = record.get('profile', {}) if record.get('profile') is not None else {}

        for post in posts:
            # Get basic counts with safe handling of None values
            like_count = safe_get(post, 'like_count', 0)
            comment_count = safe_get(post, 'comments_count', 0)
            follower_count = safe_get(profile, 'follower_count', 1)  # Use 1 as default to avoid division by zero
            following_count = safe_get(profile, 'following_count', 0)

            # Process timestamp
            year, month, day, hour = process_timestamp(safe_get(post, 'timestamp'))

            # Process caption
            caption = safe_get(post, 'caption', '')
            clean_caption, emojis, hashtags = extract_emojis_and_tags(caption)

            # Calculate engagement metrics
            engagement_rate = (float(like_count) + float(comment_count)) / float(follower_count)
            comment_to_follower_ratio = float(comment_count) / float(follower_count)
            following_to_follower_ratio = float(following_count) / float(follower_count)

            feature_dict = {
                'engagement_rate': engagement_rate,
                'follower_count': to_log_scale(follower_count),
                'comments_count': to_log_scale(comment_count),
                'caption_length': len(clean_caption),
                'emoji_count': len(emojis),
                'hashtag_count': len(hashtags),
                'following_count': to_log_scale(following_count),
                'highlight_reel_count': safe_get(profile, 'highlight_reel_count', 0),
                'day': day if day is not None else 1,
                'hour': hour if hour is not None else 0,
                'comment_to_follower_ratio': comment_to_follower_ratio,
                'following_to_follower_ratio': following_to_follower_ratio
            }

            features.append(feature_dict)
            if is_training:
                labels.append(to_log_scale(like_count))
            post_ids.append(safe_get(post, 'id'))

        return features, labels, post_ids

    def load_data(self, file_path, is_training=True):
        features_list = []
        labels_list = []
        all_post_ids = []

        with open(file_path, 'r') as f:
            for line in f:
                record = json.loads(line)
                features, labels, post_ids = self.extract_features(record, is_training)
                features_list.extend(features)
                if is_training:
                    labels_list.extend(labels)
                all_post_ids.extend(post_ids)

        X = pd.DataFrame(features_list)
        X = X.fillna(0)  # Fill any remaining NaN values
        if is_training:
            return X, np.array(labels_list)
        return X, all_post_ids

    def train_and_evaluate(self, training_file, n_trials=5):
        # Load and prepare training data
        print("Loading training data...")
        X, y = self.load_data(training_file)

        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

        # Split data
        X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

        # Define model configurations
        models = {
            'RandomForest': {
                'model': RandomForestRegressor,
                'params': {
                    'n_estimators': (200, 600),
                    'max_depth': (5, 20),
                    'min_samples_split': (2, 8),
                    'min_samples_leaf': (1, 4)
                }
            },
            'XGBoost': {
                'model': XGBRegressor,
                'params': {
                    'n_estimators': (200, 600),
                    'max_depth': (4, 10),
                    'learning_rate': (0.01, 0.2),
                    'subsample': (0.7, 1.0),
                    'colsample_bytree': (0.7, 1.0),
                    'min_child_weight': (1, 7)
                }
            }
        }

        best_score = float('inf')

        for model_name, config in models.items():
            print(f"\nOptimizing {model_name}...")

            def objective(trial):
                params = {
                    name: (trial.suggest_int if isinstance(range, tuple) and isinstance(range[0], int)
                          else trial.suggest_float)(name, *range)
                    for name, range in config['params'].items()
                }
                params['random_state'] = 42

                model = config['model'](**params)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                return mean_squared_error(y_val, y_pred)

            study = optuna.create_study(direction='minimize')
            study.optimize(objective, n_trials=n_trials)

            # Train model with best parameters
            model = config['model'](**study.best_params, random_state=42)
            model.fit(X_train, y_train)

            # Evaluate
            y_pred = model.predict(X_val)
            mse = mean_squared_error(y_val, y_pred)

            print(f"{model_name} MSE: {mse:.4f}")

            if mse < best_score:
                best_score = mse
                self.best_model = model

    def predict(self, test_file, output_file):
        print("Generating predictions...")
        X_test, post_ids = self.load_data(test_file, is_training=False)

        # Scale features
        X_test_scaled = self.scaler.transform(X_test)

        # Make predictions
        log_predictions = self.best_model.predict(X_test_scaled)
        predictions = inverse_log_transform(log_predictions)
        predictions = np.maximum(0, predictions)

        # Save predictions
        with open(output_file, 'w') as f:
            for post_id, pred in zip(post_ids, predictions):
                f.write(json.dumps([post_id, int(round(pred))]) + '\n')

        print(f"Predictions saved to {output_file}")

def main():
    # File paths
    training_file = "/content/drive/MyDrive/ColabNotebooks/CS-412_ML-Project/dataset/training-dataset.jsonl"
    test_file = "/content/drive/MyDrive/ColabNotebooks/CS-412_ML-Project/dataset/test-regression-round3.jsonl"
    predictions_output = "prediction-regression-round3.jsonl"

    # Initialize and run pipeline
    predictor = InstagramEngagementPredictor()
    predictor.train_and_evaluate(training_file)
    predictor.predict(test_file, predictions_output)
if __name__ == "__main__":
    main()

Loading training data...


[I 2025-01-12 19:36:39,579] A new study created in memory with name: no-name-a38738ba-2727-4d89-96ee-9beee87b46be



Optimizing RandomForest...


[I 2025-01-12 19:44:21,056] Trial 0 finished with value: 0.0022331472528910224 and parameters: {'n_estimators': 282, 'max_depth': 19, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.0022331472528910224.
[I 2025-01-12 19:51:22,349] Trial 1 finished with value: 0.007038977227233935 and parameters: {'n_estimators': 429, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.0022331472528910224.
[I 2025-01-12 20:02:57,723] Trial 2 finished with value: 0.0022998528458576706 and parameters: {'n_estimators': 476, 'max_depth': 16, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.0022331472528910224.
[I 2025-01-12 20:12:18,144] Trial 3 finished with value: 0.002397880571353724 and parameters: {'n_estimators': 371, 'max_depth': 18, 'min_samples_split': 5, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.0022331472528910224.
[I 2025-01-12 20:18:46,343] Trial 4 finished with value: 0.00258886747

RandomForest MSE: 0.0022

Optimizing XGBoost...


[I 2025-01-12 20:26:32,376] Trial 0 finished with value: 0.002966840691203825 and parameters: {'n_estimators': 571, 'max_depth': 9, 'learning_rate': 0.18402231283944745, 'subsample': 0.8758857588695288, 'colsample_bytree': 0.8874321929620825, 'min_child_weight': 5}. Best is trial 0 with value: 0.002966840691203825.
[I 2025-01-12 20:26:51,713] Trial 1 finished with value: 0.0030083868181762607 and parameters: {'n_estimators': 321, 'max_depth': 10, 'learning_rate': 0.012649650579138413, 'subsample': 0.8075286864638349, 'colsample_bytree': 0.9543357831918564, 'min_child_weight': 5}. Best is trial 0 with value: 0.002966840691203825.
[I 2025-01-12 20:27:01,157] Trial 2 finished with value: 0.00231242525286502 and parameters: {'n_estimators': 319, 'max_depth': 7, 'learning_rate': 0.05542103859515586, 'subsample': 0.8712902499575725, 'colsample_bytree': 0.8739136660688364, 'min_child_weight': 1}. Best is trial 2 with value: 0.00231242525286502.
[I 2025-01-12 20:27:07,198] Trial 3 finished wit

XGBoost MSE: 0.0021
Generating predictions...
Predictions saved to prediction-regression-round3.jsonl


In [24]:
import json
import pandas as pd
import numpy as np
from datetime import datetime
import emoji
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Utility functions
def extract_emojis_and_tags(text):
    if not text:
        return "", "", []
    emojis = ''.join(char for char in text if char in emoji.EMOJI_DATA)
    hashtags = re.findall(r"#\w+", text)
    clean_text = re.sub(r"#\w+", "", text)
    clean_text = ''.join(char for char in clean_text if char not in emoji.EMOJI_DATA)
    return clean_text.strip(), emojis, hashtags

def process_timestamp(timestamp):
    try:
        dt = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
        return dt.year, dt.month, dt.day, dt.hour
    except (ValueError, TypeError):
        return None, None, None, None

def to_log_scale(value):
    try:
        value = float(value if value is not None else 0)
        return np.log10(1 + value) if value > 0 else 0
    except (ValueError, TypeError):
        return 0

def inverse_log_transform(y_pred):
    return (10 ** y_pred) - 1

def safe_get(dictionary, key, default=0):
    """Safely get a value from a dictionary, handling None values"""
    value = dictionary.get(key, default)
    return value if value is not None else default

class InstagramEngagementPredictor:
    def __init__(self):
        self.scaler = StandardScaler()
        self.best_model = None

    def extract_features(self, record, is_training=True):
        features = []
        labels = []
        post_ids = []

        posts = record.get('posts', []) if is_training else [record]
        profile = record.get('profile', {}) if record.get('profile') is not None else {}

        for post in posts:
            # Get basic counts with safe handling of None values
            like_count = safe_get(post, 'like_count', 0)
            comment_count = safe_get(post, 'comments_count', 0)
            follower_count = safe_get(profile, 'follower_count', 1)
            following_count = safe_get(profile, 'following_count', 0)

            # Process timestamp
            year, month, day, hour = process_timestamp(safe_get(post, 'timestamp'))

            # Process caption
            caption = safe_get(post, 'caption', '')
            clean_caption, emojis, hashtags = extract_emojis_and_tags(caption)

            # Calculate engagement metrics
            engagement_rate = (float(like_count) + float(comment_count)) / float(follower_count)
            comment_to_follower_ratio = float(comment_count) / float(follower_count)
            following_to_follower_ratio = float(following_count) / float(follower_count)

            feature_dict = {
                'engagement_rate': engagement_rate,
                'follower_count': to_log_scale(follower_count),
                'comments_count': to_log_scale(comment_count),
                'caption_length': len(clean_caption),
                'emoji_count': len(emojis),
                'hashtag_count': len(hashtags),
                'following_count': to_log_scale(following_count),
                'highlight_reel_count': safe_get(profile, 'highlight_reel_count', 0),
                'day': day if day is not None else 1,
                'hour': hour if hour is not None else 0,
                'comment_to_follower_ratio': comment_to_follower_ratio,
                'following_to_follower_ratio': following_to_follower_ratio
            }

            features.append(feature_dict)
            if is_training:
                labels.append(to_log_scale(like_count))
            post_ids.append(safe_get(post, 'id'))

        return features, labels, post_ids

    def load_data(self, file_path, is_training=True):
        features_list = []
        labels_list = []
        all_post_ids = []

        with open(file_path, 'r') as f:
            for line in f:
                record = json.loads(line)
                features, labels, post_ids = self.extract_features(record, is_training)
                features_list.extend(features)
                if is_training:
                    labels_list.extend(labels)
                all_post_ids.extend(post_ids)

        X = pd.DataFrame(features_list)
        X = X.fillna(0)
        if is_training:
            return X, np.array(labels_list)
        return X, all_post_ids

    def train_and_evaluate(self, training_file):
        # Load and prepare training data
        print("Loading training data...")
        X, y = self.load_data(training_file)

        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

        # Split data
        X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

        # Define and train models with default parameters
        models = {
            'RandomForest': RandomForestRegressor(n_estimators=200, random_state=42),
            'XGBoost': XGBRegressor(n_estimators=200, random_state=42)
        }

        best_score = float('inf')

        # Train and evaluate each model
        for name, model in models.items():
            print(f"\nTraining {name}...")
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            mse = mean_squared_error(y_val, y_pred)
            print(f"{name} MSE: {mse:.4f}")

            if mse < best_score:
                best_score = mse
                self.best_model = model
                print(f"New best model: {name}")

    def predict(self, test_file, output_file):
        print("Generating predictions...")
        X_test, post_ids = self.load_data(test_file, is_training=False)

        # Scale features
        X_test_scaled = self.scaler.transform(X_test)

        # Make predictions
        log_predictions = self.best_model.predict(X_test_scaled)
        predictions = inverse_log_transform(log_predictions)
        predictions = np.maximum(0, predictions)

        # Save predictions
        with open(output_file, 'w') as f:
            for post_id, pred in zip(post_ids, predictions):
                f.write(json.dumps([post_id, int(round(pred))]) + '\n')

        print(f"Predictions saved to {output_file}")

def main():
    # File paths
    training_file = "/content/drive/MyDrive/ColabNotebooks/CS-412_ML-Project/dataset/training-dataset.jsonl"
    test_file = "/content/drive/MyDrive/ColabNotebooks/CS-412_ML-Project/dataset/test-regression-round3.json"
    predictions_output = "prediction-regression-round3.jsonl"

    # Initialize and run pipeline
    predictor = InstagramEngagementPredictor()
    predictor.train_and_evaluate(training_file)
    predictor.predict(test_file, predictions_output)
if __name__ == "__main__":
    main()

Loading training data...

Training RandomForest...
RandomForest MSE: 0.0020
New best model: RandomForest

Training XGBoost...
XGBoost MSE: 0.0030
Generating predictions...
Predictions saved to prediction-regression-round3.jsonl


