In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input mount, one full path per line.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Path of the dataset CSV under the Kaggle input directory; read into `df` by the next cell.
file_path = "/kaggle/input/social-media-performance-and-engagement-data/social_media_performance.csv"

In [None]:
# Load the CSV at `file_path` into the working DataFrame that the following cells reshape and model.
df = pd.read_csv(file_path)

In [None]:
# Show the schema (columns, dtypes, non-null counts) as loaded.
df.info()

# Remove the raw engagement counters from the working frame.
# NOTE(review): presumably these are dropped because they would leak the
# `is_viral` target into the features -- confirm against the dataset description.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so a
# second execution would otherwise raise KeyError for the already-removed columns.
df = df.drop(columns=['views', 'likes', 'comments', 'shares'], errors='ignore')

# Show the schema again to confirm the columns are gone.
df.info()

In [None]:
# Convert the post_datetime column to pandas datetimes, overwriting the column on `df`.
# NOTE(review): post_datetime is excluded from the feature matrix further down, so this
# conversion currently only affects what df.info() reports -- confirm whether
# date-derived features were intended.
df['post_datetime'] = pd.to_datetime(df['post_datetime'])

In [None]:
# Re-check the schema after the post_datetime conversion.
df.info()

# Preview the first rows with the notebook's rich table display instead of
# printing the whole frame as plain text.
df.head()


In [None]:
# Remove the hashtags column from the working frame.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so a
# second execution would otherwise raise KeyError for the already-removed column.
df = df.drop(columns=['hashtags'], errors='ignore')

# Preview the first rows rather than rendering the entire frame.
df.head()

In [None]:
# Target: the is_viral column.
y = df['is_viral']

# Features: every remaining column except the target and the post timestamp.
x = df.drop(columns=['is_viral', 'post_datetime'])

In [None]:
# Column transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Columns that are one-hot encoded; every other column in the feature frame
# is forwarded unchanged via remainder='passthrough'.
cat_cols = ['platform', 'content_type', 'topic', 'language', 'region']

# One-hot encode the categorical columns.
# No category is dropped: with handle_unknown='ignore' an unseen category is
# encoded as an all-zero row, which is exactly how a dropped "first" category
# would be encoded too, so combining the two options makes unknown values
# indistinguishable from the baseline level (scikit-learn warns about this at
# transform time). Keeping every level leaves all-zero rows reserved for
# genuinely unknown categories.
transformer = ColumnTransformer(
    transformers=[
        (
            'tf1',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            cat_cols,
        ),
    ],
    remainder='passthrough',
)








In [None]:
# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:

# Two-step pipeline: the column transformer feeds an XGBoost classifier, so
# encoding is fitted together with the model on whatever data is passed to fit().
pipeline = Pipeline(
    steps=[
        ('transformer', transformer),
        (
            'classifier',
            XGBClassifier(
                n_estimators=100,
                eval_metric='logloss',
                random_state=42,
            ),
        ),
    ]
)

In [None]:
# List of (features, labels) pairs for the training and test splits.
# NOTE(review): no other cell visible in this notebook reads eval_set -- confirm
# whether it was meant to be passed to the classifier or can be removed.
eval_set = [(X_train, y_train), (X_test, y_test)]

In [None]:
# Fit the whole pipeline on the training split.
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split and print the result.
holdout_score = pipeline.score(X_test, y_test)
print(holdout_score)

# Predicted labels for the held-out rows.
y_pred = pipeline.predict(X_test)

# Second column (index 1) of the predicted probability matrix.
proba_matrix = pipeline.predict_proba(X_test)
y_pred_proba = proba_matrix[:, 1]


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the held-out predictions, drawn on an explicit axes.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap='Blues', ax=ax)
ax.set_title('Pipeline Confusion Matrix')
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report


# 5-fold cross-validation of the full pipeline on all rows, scored with F1.
cv_scores = cross_val_score(
    estimator=pipeline,
    X=x,
    y=y,
    scoring='f1',
    cv=5,
)

# Per-fold scores, then their mean and standard deviation.
print(f"CV F1 Scores: {cv_scores}")
print(f"Mean CV F1: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")

In [None]:
# Score the fitted pipeline on the training split, then on the held-out split.
train_score, test_score = (
    pipeline.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

# Report both scores and their difference (train minus test).
print(f"Train Accuracy: {train_score:.3f}")
print(f"Test Accuracy: {test_score:.3f}")
print(f"Overfitting Gap: {train_score-test_score:.3f}")


In [None]:
# Refresh the held-out predictions and print the per-class metrics table.
y_pred = pipeline.predict(X_test)

print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:

# One-screen summary built from values computed in earlier cells
# (cv_scores, train_score, test_score) plus the target's class proportions.
print("=== MODEL HEALTH CHECK ===")
print(f"CV F1: {cv_scores.mean():.3f}")
print(f"Train Acc: {train_score:.3f}, Test Acc: {test_score:.3f}")
print(f"Class Balance: \n{df['is_viral'].value_counts(normalize=True)}")

# Pair the classifier's feature importances with the transformer's output
# feature names, then list the five largest.
fitted_steps = pipeline.named_steps
importances = pd.Series(
    data=fitted_steps['classifier'].feature_importances_,
    index=fitted_steps['transformer'].get_feature_names_out(),
)
top_features = importances.nlargest(5).index.tolist()
print("\nTop Features:", top_features)


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report


# Consolidated evaluation report: cross-validation, train/test scores,
# classification report, and class balance in a single cell.
# NOTE(review): this recomputes the same quantities as the preceding cells
# (including the 5-fold cross-validation) -- consider keeping only one copy.

# --- Cross-validation: 5 folds over all rows, scored with F1 ---
cv_scores = cross_val_score(
    estimator=pipeline,
    X=x,
    y=y,
    scoring='f1',
    cv=5,
)
print("=== CROSS-VALIDATION ===")
print(f"CV F1 Scores: {cv_scores}")
print(f"Mean CV F1: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")

# --- Scores on the training split and on the held-out split ---
train_score, test_score = (
    pipeline.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)
print("\n=== TRAIN-TEST PERFORMANCE ===")
print(f"Train Accuracy: {train_score:.3f}")
print(f"Test Accuracy:  {test_score:.3f}")
print(f"Gap: {train_score-test_score:.3f}")

# --- Per-class metrics on the held-out split ---
y_pred = pipeline.predict(X_test)
print("\n=== CLASSIFICATION REPORT ===")
report_text = classification_report(y_test, y_pred)
print(report_text)

# --- Proportion of each is_viral class in the working frame ---
print("\n=== DATA BALANCE ===")
class_shares = df['is_viral'].value_counts(normalize=True)
print(class_shares)


In [None]:
import joblib
# Serialize the fitted pipeline (preprocessing + classifier) to the working
# directory, then confirm on stdout.
joblib.dump(
    value=pipeline,
    filename='viral_predictor_pipeline.pkl',
)
print("✅ Model saved as 'viral_predictor_pipeline.pkl'")