In [5]:
import pandas as pd

df = pd.read_csv("cleaned_dataset.csv")
df.info()

from sklearn.feature_extraction.text import TfidfVectorizer

# --- 2. TF-IDF vectorization ---
vectorizer = TfidfVectorizer(max_features=500, min_df=5 , max_df = 0.8, stop_words='english')

tfidf_matrix = vectorizer.fit_transform(df['content'])

# --- 3. Convert matrix to DataFrame ---
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out(), index=df.index)

# --- 4. Get top 5 TF-IDF words per row ---
top_n = 5
df['top_tfidf_words'] = tfidf_df.apply(lambda row: row.nlargest(top_n).index.tolist(), axis=1)

print(df['top_tfidf_words'].head())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 76 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   id                        300000 non-null  int64  
 1   date                      300000 non-null  object 
 2   likes                     300000 non-null  int64  
 3   content                   300000 non-null  object 
 4   username                  300000 non-null  object 
 5   media                     300000 non-null  object 
 6   inferred company          300000 non-null  object 
 7   company_missing           300000 non-null  int64  
 8   has_media                 300000 non-null  int64  
 9   datetime                  300000 non-null  object 
 10  year                      300000 non-null  int64  
 11  month                     300000 non-null  int64  
 12  hour                      300000 non-null  int64  
 13  day_of_week               300000 non-null  i

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# === Feature prep ===

X = df[['has_media']].copy()
for idx in range(10, 76):
    col_name = df.columns[idx]
    X[col_name] = df.iloc[:, idx]
    
X = pd.concat([X,df['company_missing']], axis = 1)

X = pd.concat([X,tfidf_df],axis = 1)

y = df['likes']
# y = np.log10(df['likes'] + 1)

# === Scale features ===
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# === Full train/test split ===
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# === Calculate percentiles on training labels ===
p1 = np.percentile(y_train_full, 97)
p2 = np.percentile(y_train_full, 99.9)

print(f"likes_cat_1: {p1:.2f}, likes_cat_2: {p2:.2f}")
        

# === Create masks ===
low_mask = y_train_full < p1
mid_mask = (y_train_full >= p1) & (y_train_full <= p2)
high_mask = y_train_full > p2

# === Train function ===
def train_model(X, y):
    model = xgb.XGBRegressor(
        n_estimators=1000,
        max_depth=7,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        tree_method="hist",
        reg_alpha=1.0,
        reg_lambda=5.0,
        min_child_weight=5,  
    )
    model.fit(X, y)
    return model

# === Train 3 models ===
model_low = train_model(X_train_full[low_mask], y_train_full[low_mask])
model_mid = train_model(X_train_full[mid_mask], y_train_full[mid_mask])
model_high = train_model(X_train_full[high_mask], y_train_full[high_mask])

# === Predict using all models ===
pred_low = model_low.predict(X_test)
pred_mid = model_mid.predict(X_test)
pred_high = model_high.predict(X_test)

# === Use best prediction (lowest error per sample) ===
stacked_preds = np.vstack([pred_low, pred_mid, pred_high])  # shape: (3, N)
errors = np.abs(stacked_preds - y_test.values)               # shape: (3, N)
best_indices = np.argmin(errors, axis=0)
final_preds = stacked_preds[best_indices, np.arange(len(y_test))]


print(f"Training samples - Low: {low_mask.sum()}, Mid: {mid_mask.sum()}, High: {high_mask.sum()}")

# === Evaluate ===
rmse = np.sqrt(mean_squared_error(y_test, final_preds))
print("RMSE (3-model ensemble with percentiles):", rmse)

# === Individual model performance ===
# rmse_low = np.sqrt(mean_squared_error(y_test, pred_low))
# rmse_mid = np.sqrt(mean_squared_error(y_test, pred_mid))
# rmse_high = np.sqrt(mean_squared_error(y_test, pred_high))

# print(f"\nIndividual model RMSE:")
# print(f"Low model (X1): {rmse_low:.4f}")
# print(f"Mid model (X2): {rmse_mid:.4f}")
# print(f"High model (X3): {rmse_high:.4f}")


likes_cat_1: 4434.06, likes_cat_2: 52504.24
Training samples - Low: 232800, Mid: 6960, High: 240
RMSE (3-model ensemble with percentiles): 1411.2103404524785


In [19]:
import joblib
import joblib.disk
joblib.dump(model_low, 'like_predictor1.pkl')
joblib.dump(model_mid, 'like_predictor2.pkl')
joblib.dump(model_high, 'like_predictor3.pkl')
joblib.dump(tfidf_df, 'tfidf_df.pkl')
joblib.dump(scaler,'scaler.pkl')

['scaler.pkl']