# 1) Data preprocessing and feature extraction

In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
# Load JSONL file
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Each non-empty line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing newline at the end of the
    file) are skipped instead of raising a decode error.

    Args:
        file_path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        list: The parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-empty line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate empty lines; json.loads('') would raise.
                continue
            data.append(json.loads(line))
    print(f"Loaded {len(data)} records from {file_path}.")
    return data

In [3]:
# Extract features from the training dataset
def extract_training_data(training_data):
    """Flatten nested profile/post records into one post-level DataFrame.

    Every input item is expected to be a dict with an optional ``'profile'``
    dict and an optional ``'posts'`` list. One output row is produced per post
    that has a non-null ``like_count``; the owning profile's fields are joined
    onto each row via ``profile_id``.

    Args:
        training_data: Iterable of dicts as produced by ``load_jsonl``.

    Returns:
        pandas.DataFrame with the columns ``post_id``, ``caption``,
        ``like_count``, ``comments_count``, ``media_type``, ``profile_id``,
        ``username``, ``post_count``, ``follower_count`` and
        ``highlight_reel_count``. The frame is empty (but still has these
        columns) when no post qualifies.
    """
    post_columns = ['post_id', 'caption', 'like_count', 'comments_count', 'media_type', 'profile_id']
    profile_columns = ['profile_id', 'username', 'post_count', 'follower_count', 'highlight_reel_count']

    post_data = []
    profile_data = []

    for item in training_data:
        # `or {}` also covers an explicit `"profile": null` in the JSON.
        profile = item.get('profile') or {}
        profile_id = profile.get('id', 'UNKNOWN')

        profile_data.append({
            'profile_id': profile_id,
            'username': profile.get('username', 'UNKNOWN'),
            'post_count': profile.get('post_count', 0),
            'follower_count': profile.get('follower_count', 0),
            'highlight_reel_count': profile.get('highlight_reel_count', 0)
        })

        # `or []` also covers an explicit `"posts": null` in the JSON.
        for post in item.get('posts') or []:
            like_count = post.get('like_count')
            if like_count is None:
                # Posts without a target value cannot be used for training.
                continue
            post_data.append({
                'post_id': post.get('id', 'UNKNOWN'),
                'caption': post.get('caption', ''),
                'like_count': like_count,
                'comments_count': post.get('comments_count', 0),
                'media_type': post.get('media_type', 'UNKNOWN'),
                'profile_id': profile_id  # Link to profile
            })

    # Passing explicit columns keeps the merge key present even when a list is
    # empty; without it an empty input fails with KeyError: 'profile_id'.
    df_posts = pd.DataFrame(post_data, columns=post_columns)
    df_profiles = pd.DataFrame(profile_data, columns=profile_columns).drop_duplicates(subset='profile_id')

    # Attach the profile-level columns to every post of that profile.
    df_combined = df_posts.merge(df_profiles, on='profile_id', how='left')

    print(f"Extracted {len(df_combined)} rows of data after combining posts and profiles.")
    return df_combined

In [4]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler


def preprocess_combined_data(df):
    """Turn the combined post/profile frame into a scaled feature matrix and log target.

    The pipeline is:
      1. train a Word2Vec model on the whitespace-tokenized captions and
         average the word vectors of each caption (100 dimensions);
      2. integer-encode ``media_type`` in order of first appearance;
      3. add ``comments_count_log``, ``followers_per_post`` and
         ``comments_to_followers_ratio``;
      4. replace +/-inf and NaN with 0;
      5. standardize all features with a ``StandardScaler``.

    NOTE(review): the Word2Vec model and the scaler are fitted on every row
    passed in. If a train/test split is made after this call, the held-out
    rows have already influenced both of them.

    Args:
        df: DataFrame containing ``caption``, ``media_type``,
            ``comments_count``, ``post_count``, ``follower_count``,
            ``highlight_reel_count`` and ``like_count``. The frame is not
            modified; all work happens on a copy, so the call can be repeated.

    Returns:
        tuple: ``(features_scaled, y_log, word2vec_model, scaler,
        feature_columns, media_type_mapping)`` where ``features_scaled`` is the
        standardized feature array, ``y_log`` is ``log1p(like_count)``,
        ``feature_columns`` is the string-typed column index used to fit the
        scaler, and ``media_type_mapping`` maps each media type to its code.
    """
    # Work on a copy: mutating the caller's frame (and dropping `like_count`
    # from it) made a second call fail with KeyError.
    df = df.copy()

    # Ensure no None or NaN values in the 'caption' column
    df['caption'] = df['caption'].fillna('').astype(str)

    # Tokenize captions for Word2Vec
    tokenized_captions = df['caption'].apply(lambda x: x.split())

    # Train a Word2Vec model
    word2vec_model = Word2Vec(sentences=tokenized_captions, vector_size=100, window=7, min_count=1, sg=0)

    def get_caption_vector(tokens):
        """Average the vectors of the known tokens; all-zero vector if none are known."""
        vectors = [word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv]
        if vectors:
            return np.mean(vectors, axis=0)
        return np.zeros(word2vec_model.vector_size)

    caption_vectors = np.array([get_caption_vector(tokens) for tokens in tokenized_captions])
    print(f"Word2Vec caption vectors shape: {caption_vectors.shape}")

    # Encode media type as a numerical feature (codes follow order of first appearance)
    media_type_mapping = {media: idx for idx, media in enumerate(df['media_type'].unique())}
    df['media_type_encoded'] = df['media_type'].map(media_type_mapping)

    # Handle heavy-tailed distributions with log transformation
    df['comments_count_log'] = np.log1p(df['comments_count'])

    # Feature engineering; the +1 in each denominator avoids division by zero
    df['followers_per_post'] = df['follower_count'] / (df['post_count'] + 1)
    df['comments_to_followers_ratio'] = df['comments_count'] / (df['follower_count'] + 1)

    # Replace infinite values and handle NaNs
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Target variable (taken after the NaN fill, as before)
    y = df['like_count']

    # Combine features: Word2Vec vectors and other engineered features.
    # Columns are attached positionally (.to_numpy()) because `features` has a
    # fresh 0..n-1 index; label-based assignment would misalign rows whenever
    # `df` carries a non-default index (e.g. after filtering).
    features = pd.DataFrame(caption_vectors)
    engineered_columns = [
        'comments_count_log',
        'media_type_encoded',
        'post_count',
        'follower_count',
        'highlight_reel_count',
        'followers_per_post',
        'comments_to_followers_ratio',
    ]
    for column in engineered_columns:
        features[column] = df[column].to_numpy()

    # Ensure all column names are strings
    features.columns = features.columns.astype(str)
    print("Feature columns:", features.columns)

    # Standardize the features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    print("Features standardized.")

    # Return processed features and target variable
    return features_scaled, np.log1p(y), word2vec_model, scaler, features.columns, media_type_mapping

# Example call (assuming `df` is your DataFrame)
# features_scaled, y_log, word2vec_model, scaler, feature_columns, media_type_mapping = preprocess_combined_data(df)


In [5]:
# Load the raw training records (one JSON document per line of the file)
training_data_path = 'training-dataset.jsonl'
training_data = load_jsonl(training_data_path)

# Flatten profiles and posts into one post-level DataFrame and preview the first rows
df_combined  = extract_training_data(training_data)
df_combined.head()

Loaded 5415 records from training-dataset.jsonl.
Extracted 183083 rows of data after combining posts and profiles.


Unnamed: 0,post_id,caption,like_count,comments_count,media_type,profile_id,username,post_count,follower_count,highlight_reel_count
0,17990918969458720,Cumhuriyetimizin 100.yılı kutlu olsun♾️🇹🇷,6.0,0,IMAGE,3170700063,deparmedya,,1167,6
1,18219250732221045,Oriflame Duologi Lansmanı #isveçtengelengüzell...,22.0,1,VIDEO,3170700063,deparmedya,,1167,6
2,18311380465102328,#oriflameilesaçbakımdevrimi ✌️,19.0,0,VIDEO,3170700063,deparmedya,,1167,6
3,18089518138361507,✌️#oriflameilesaçbakımdevrimi 07Agustos’23 ori...,19.0,1,VIDEO,3170700063,deparmedya,,1167,6
4,18012743929758497,07 Agustos’23 #oriflameturkiye #duoloji,21.0,0,VIDEO,3170700063,deparmedya,,1167,6


In [6]:
# Build the model inputs: X is the standardized feature matrix and y is log1p(like_count).
# The fitted Word2Vec model, scaler, feature column order and media-type mapping are kept
# because the prediction code further down needs them to encode test posts.
X, y, word2vec_model, feature_scaler, feature_columns, media_type_mapping = preprocess_combined_data(df_combined)

Word2Vec caption vectors shape: (183083, 100)
Feature columns: Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '97', '98', '99', 'comments_count_log', 'media_type_encoded',
       'post_count', 'follower_count', 'highlight_reel_count',
       'followers_per_post', 'comments_to_followers_ratio'],
      dtype='object', length=107)
Features standardized.


# 2) XGBoost Regression

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
# NOTE(review): the following cell performs this same split again with identical arguments.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")

Training set shape: (146466, 107), Test set shape: (36617, 107)


In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Hold out 20% of the rows for evaluation (fixed seed, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")

# Gradient-boosted trees fitted on the log1p-transformed like counts
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=8,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Predictions on both partitions, so train and test error can be compared
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

# Held-out metrics; note they are measured on log1p(like_count), not on raw like counts
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print(f"Train Mean Squared Error (MSE): {mean_squared_error(y_train, y_train_pred):.4f}")
print(f"Train R-squared (R2): {r2_score(y_train, y_train_pred):.4f}")
print(f"Test Mean Squared Error (MSE): {mse:.4f}")
print(f"Test R-squared (R2): {r2:.4f}")


Training set shape: (146466, 107), Test set shape: (36617, 107)
Train Mean Squared Error (MSE): 0.5906
Train R-squared (R2): 0.8978
Test Mean Squared Error (MSE): 0.7361
Test R-squared (R2): 0.8723


In [9]:
# #max_depth changed from 5 to 12
# from sklearn.metrics import mean_squared_error, r2_score
# from xgboost import XGBRegressor

# # Instantiate and fit the XGBoost regressor
# xgb_model_4 = XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=12, random_state=42)
# xgb_model_4.fit(X_train, y_train)

# # Make predictions
# y_train_pred = xgb_model_4.predict(X_train)
# y_test_pred = xgb_model_4.predict(X_test)

# # Evaluate the model
# mse = mean_squared_error(y_test, y_test_pred)
# r2 = r2_score(y_test, y_test_pred)

# print(f"Train Mean Squared Error (MSE): {mean_squared_error(y_train, y_train_pred):.4f}")
# print(f"Train R-squared (R2): {r2_score(y_train, y_train_pred):.4f}")
# print(f"Test Mean Squared Error (MSE): {mse:.4f}")
# print(f"Test R-squared (R2): {r2:.4f}")


In [10]:
# import pandas as pd
# from xgboost import XGBRegressor
# from sklearn.metrics import mean_squared_error, r2_score

# # Hyperparameter grid
# max_depth_values = [8, 10, 12]
# n_estimators_values = [100, 200, 300]
# learning_rate_values = [0.1, 0.01, 1.5]

# # Store results
# results = []

# # Loop over each combination of hyperparameters
# for max_depth in max_depth_values:
#     for n_estimators in n_estimators_values:
#         for learning_rate in learning_rate_values:
#             # Train the model
#             model = XGBRegressor(objective='reg:squarederror',
#                                  n_estimators=n_estimators,
#                                  learning_rate=learning_rate,
#                                  max_depth=max_depth,
#                                  random_state=42)
            
#             model.fit(X_train, y_train)

#             # Make predictions
#             y_train_pred = model.predict(X_train)
#             y_test_pred = model.predict(X_test)

#             # Evaluate performance
#             train_r2 = r2_score(y_train, y_train_pred)
#             test_r2 = r2_score(y_test, y_test_pred)

#             # Append results
#             results.append({
#                 'max_depth': max_depth,
#                 'n_estimators': n_estimators,
#                 'learning_rate': learning_rate,
#                 'train_r2': train_r2,
#                 'test_r2': test_r2
#             })

# # Convert results to a DataFrame
# results_df = pd.DataFrame(results)

# # Display the results as a table sorted by test R^2
# print("Hyperparameter tuning results:")
# print(results_df.sort_values(by='test_r2', ascending=False))


# Predictions for test data

In [11]:
# function for getting profile-level information
def get_profile_info(username, df_combined):
    """Return the first row of `df_combined` whose 'username' equals `username`.

    The result is always a DataFrame: one row (keeping its original index
    label) when the username is found, zero rows otherwise.
    """
    is_requested_user = df_combined['username'] == username
    return df_combined.loc[is_requested_user].iloc[:1]

In [16]:
def preprocess_test_data_with_missing(data, word2vec_model, scaler, feature_columns, media_type_mapping, profile_info):
    """Build the scaled feature row for one test post, mirroring the training features.

    The engineered features use the same names and formulas as the training
    pipeline: ``comments_count_log``, ``media_type_encoded``, ``post_count``,
    ``follower_count``, ``highlight_reel_count``, ``followers_per_post`` and
    ``comments_to_followers_ratio``. Previously the profile values were lost
    (index misalignment plus ``*_log`` column names that are not training
    features), so those five columns were always fed to the model as 0.

    Args:
        data: dict describing a single post. ``caption``, ``comments_count``
            and ``media_type`` are optional; a missing media type defaults to
            ``"IMAGE"`` and a missing comment count to 0.
        word2vec_model: fitted model exposing ``.wv`` (supports ``in`` and
            indexing by token) and ``.vector_size``.
        scaler: fitted transformer exposing ``.transform``.
        feature_columns: ordered training feature names; the returned row
            follows this order and any name not produced here is filled with 0.
        media_type_mapping: dict from media type to integer code; unknown
            types are encoded as 0.
        profile_info: profile-level values for the post's author, given as a
            DataFrame of at most one row, a Series, a dict, or None. Missing
            or non-numeric values are treated as NaN and end up as 0.

    Returns:
        The output of ``scaler.transform`` for the single feature row.
    """
    def as_number(value):
        """Coerce to float64; anything missing or non-numeric becomes NaN."""
        try:
            return np.float64(float(value))
        except (TypeError, ValueError):
            return np.float64(np.nan)

    def profile_value(field):
        """Read one profile field by position, ignoring the row's index label."""
        if profile_info is None or field not in profile_info:
            return np.float64(np.nan)
        value = profile_info[field]
        if isinstance(value, pd.Series):
            # A one-row DataFrame yields a Series that still carries the
            # original index label, so it must be read positionally.
            if len(value) == 0:
                return np.float64(np.nan)
            value = value.iloc[0]
        return as_number(value)

    # Caption -> averaged Word2Vec embedding (zero vector if no token is known)
    caption = data.get('caption')
    if caption is None or (isinstance(caption, float) and np.isnan(caption)):
        caption = ''
    tokens = str(caption).split()
    vectors = [word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv]
    if vectors:
        caption_vector = np.mean(vectors, axis=0)
    else:
        caption_vector = np.zeros(word2vec_model.vector_size)

    # Post-level inputs
    comments_count = as_number(data.get('comments_count', 0))
    media_type = data['media_type'] if 'media_type' in data else "IMAGE"
    media_type_encoded = media_type_mapping.get(media_type, 0)

    # Profile-level inputs
    post_count = profile_value('post_count')
    follower_count = profile_value('follower_count')
    highlight_reel_count = profile_value('highlight_reel_count')

    # numpy scalars are used so that a zero denominator yields inf (cleaned
    # below) instead of raising ZeroDivisionError.
    with np.errstate(divide='ignore', invalid='ignore'):
        # Prefer a precomputed profile-level value when the caller supplies
        # one; otherwise derive it with the training formula.
        followers_per_post = profile_value('followers_per_post')
        if np.isnan(followers_per_post):
            followers_per_post = follower_count / (post_count + 1)

        # This ratio is post-specific, so it is always computed from this
        # post's own comment count.
        comments_to_followers_ratio = comments_count / (follower_count + 1)
        comments_count_log = np.log1p(comments_count)

    row = {str(position): component for position, component in enumerate(caption_vector)}
    row.update({
        'comments_count_log': comments_count_log,
        'media_type_encoded': media_type_encoded,
        'post_count': post_count,
        'follower_count': follower_count,
        'highlight_reel_count': highlight_reel_count,
        'followers_per_post': followers_per_post,
        'comments_to_followers_ratio': comments_to_followers_ratio,
    })
    features = pd.DataFrame([row])

    # Replace infinite values and handle NaNs, as in training
    features = features.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Match the training column order; names not produced above are filled with 0
    features = features.reindex(columns=list(feature_columns), fill_value=0)

    # Standardize features
    features_scaled = scaler.transform(features)

    return features_scaled


In [20]:
# Process JSONL file and predict
input_file = "test-regression-round3.jsonl"
output_file = "prediction-regression-round4.json"

predictions = {}  # not used below; left in place in case other cells refer to it

# Maps each test post id to its predicted like count (non-negative integer)
results = {}
# Read with the same explicit encoding as the training file so non-ASCII
# captions decode identically regardless of the platform default.
with open(input_file, 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads
            continue
        data = json.loads(line)
        profile_info = get_profile_info(data.get("username"), df_combined)
        features_scaled = preprocess_test_data_with_missing(data, word2vec_model, feature_scaler, feature_columns, media_type_mapping, profile_info)
        # log_like_count = rf_regressor.predict(features_scaled)[0]
        log_like_count = xgb_model.predict(features_scaled)[0]
        like_count = np.expm1(log_like_count)  # Convert log count back to normal count
        # The regressor can output a negative log value, which would map to a
        # negative like count; clamp at zero before storing.
        results[data['id']] = max(0, int(round(float(like_count))))

print(f"Predictions are completed for {input_file}")

with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(results, file, indent=4)

print(f"Predictions saved to {output_file}")

Predictions are completed for test-regression-round3.jsonl
Predictions saved to prediction-regression-round4.json


In [18]:
# Sanity check: shape of the most recently computed `features_scaled`
# (a single test post should give one row with one column per training feature).
features_scaled.shape

(1, 107)