In [3]:
!pip install -U sentence-transformers lightgbm scikit-learn pandas numpy optuna faiss-cpu imbalanced-learn tqdm

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting numpy
  Downloading numpy-2.3.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinu

IMPORTS AND SETUP

In [1]:
import os, sys, warnings
warnings.filterwarnings('ignore')

import random, json
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, classification_report
from sklearn.ensemble import VotingClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import joblib

random.seed(42)
np.random.seed(42)

In [2]:
import os
import sys
import pandas as pd
import requests

print("\n Loading data from GitHub...")

# Base repo (raw file links)
GITHUB_BASE = "https://raw.githubusercontent.com/Fidaaz2521/Global_Bene_Recommender/main"
DATA_DIR = "/content/global_bene_data"
os.makedirs(DATA_DIR, exist_ok=True)

# List of CSV files to load
csv_files = [
    "comments.csv",
    "communities.csv",
    "posts.csv",
    "users.csv",
    "votes.csv",
    "reports.csv",
    "events.csv"
]

data = {}

# Function to download and load a CSV
def load_csv_from_github(filename):
    url = f"{GITHUB_BASE}/{filename}"
    local_path = os.path.join(DATA_DIR, filename)

    # Download if not exists
    if not os.path.exists(local_path):
        try:
            print(f"      Downloading {filename} ...")
            r = requests.get(url)
            r.raise_for_status()
            with open(local_path, "wb") as f:
                f.write(r.content)
        except Exception as e:
            print(f" Failed to download {filename}: {e}")
            return None

    # Load into pandas
    try:
        df = pd.read_csv(local_path)
        print(f"     Loaded {filename}: {len(df)} rows, {len(df.columns)} columns")
        return df
    except Exception as e:
        print(f" Error loading {filename}: {e}")
        return None

# Load all CSVs
for csv_file in csv_files:
    df = load_csv_from_github(csv_file)
    if df is None:
        if csv_file == "events.csv":
            data["events"] = None
        else:
            print(f"     Critical file missing: {csv_file}")
            sys.exit(1)
    else:
        data[csv_file.replace(".csv", "")] = df

# Unpack into variables
comments = data.get("comments")
communities = data.get("communities")
posts = data.get("posts")
users = data.get("users")
votes = data.get("votes")
reports = data.get("reports")
events = data.get("events")

print("\n All available datasets loaded successfully!")


 Loading data from GitHub...
      Downloading comments.csv ...
     Loaded comments.csv: 7000 rows, 9 columns
      Downloading communities.csv ...
     Loaded communities.csv: 200 rows, 10 columns
      Downloading posts.csv ...
     Loaded posts.csv: 4000 rows, 14 columns
      Downloading users.csv ...
     Loaded users.csv: 3000 rows, 10 columns
      Downloading votes.csv ...
     Loaded votes.csv: 14995 rows, 6 columns
      Downloading reports.csv ...
     Loaded reports.csv: 600 rows, 8 columns
      Downloading events.csv ...
     Loaded events.csv: 20000 rows, 8 columns

 All available datasets loaded successfully!


In [3]:
# FEATURE 1: NLP CONTENT UNDERSTANDING (Simple TF-IDF with SVD)
# ═════════════════════════════════════════════════════════════════════════════

print("\n FEATURE 1: NLP Content Understanding...")

try:
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Extract text from posts
    post_texts = posts['title'].fillna('') + ' ' + posts['body'].fillna('')

    # TF-IDF vectorization
    tfidf = TfidfVectorizer(max_features=50, stop_words='english')
    tfidf_matrix = tfidf.fit_transform(post_texts)

    # Dimensionality reduction (SVD) for embeddings
    svd = TruncatedSVD(n_components=20)
    post_embeddings = svd.fit_transform(tfidf_matrix)

    # Create post embedding dataframe
    post_embed_df = pd.DataFrame(
        post_embeddings,
        index=posts['post_id'].astype(str),
        columns=[f'embed_{i}' for i in range(20)]
    )

    print(f"    Post embeddings created: {post_embed_df.shape}")
    print(f"    Captures content semantics (20 dimensions)")

except Exception as e:
    print(f"    NLP disabled: {str(e)[:50]}")
    post_embed_df = None



 FEATURE 1: NLP Content Understanding...
    Post embeddings created: (4000, 20)
    Captures content semantics (20 dimensions)


In [4]:
# FEATURE 2: USER PREFERENCE PROFILING
# ═════════════════════════════════════════════════════════════════════════════

print("\n FEATURE 2: User Preference Profiling...")

# Profile users based on their voting history
user_profiles = {}

for user_id in users['user_id'].unique():
    user_votes = votes[(votes['user_id'] == user_id) & (votes['target_type'] == 'post')]

    # Get categories of posts they voted on
    if len(user_votes) > 0:
        voted_posts = user_votes['target_id'].unique()
        community_prefs = posts[posts['post_id'].isin(voted_posts)]['community_id'].value_counts()

        user_profiles[str(user_id)] = {
            'total_votes': len(user_votes),
            'upvote_ratio': (user_votes['value'] > 0).sum() / len(user_votes),
            'top_communities': community_prefs.head(3).to_dict(),
            'avg_comment_length': len(comments[comments['author_id'] == user_id]) / (users[users['user_id'] == user_id]['num_comments'].values[0] + 1),
        }
    else:
        user_profiles[str(user_id)] = {
            'total_votes': 0,
            'upvote_ratio': 0.5,
            'top_communities': {},
            'avg_comment_length': 0,
        }

print(f"   Profiled {len(user_profiles)} users")
print(f"   Captures: voting history, preferences, engagement patterns")



 FEATURE 2: User Preference Profiling...
   Profiled 3000 users
   Captures: voting history, preferences, engagement patterns


In [5]:
# FEATURE 3: TEMPORAL WEIGHTING
# ═════════════════════════════════════════════════════════════════════════════

print("\n FEATURE 3: Temporal Weighting...")

posts['created_at'] = pd.to_datetime(posts['created_at'], errors='coerce')
ref_date = posts['created_at'].max()

def calculate_temporal_decay(created_date, half_life_days=7):
    """Exponential decay: recent posts weighted higher"""
    if pd.isna(created_date):
        return 0.5
    days_old = (ref_date - created_date).days
    decay = np.exp(-days_old / half_life_days)
    return max(0.1, min(1.0, decay))

posts['temporal_weight'] = posts['created_at'].apply(calculate_temporal_decay)
print(f"    Temporal decay applied (half-life: 7 days)")
print(f"    Recent posts: weight ~1.0")
print(f"    Old posts: weight ~0.1")



 FEATURE 3: Temporal Weighting...
    Temporal decay applied (half-life: 7 days)
    Recent posts: weight ~1.0
    Old posts: weight ~0.1


In [6]:
# FEATURE 4: COLD-START HANDLING
# ═════════════════════════════════════════════════════════════════════════════

print("\n FEATURE 4: Cold-Start Handling...")

# For new users: use content-based filtering
# For new posts: use popularity-based features

def get_cold_start_score(user_id, post_id, user_profiles, posts_df, communities_df):
    """Score for new users/posts using content-based approach"""

    # Get user profile
    user_prof = user_profiles.get(str(user_id), {
        'total_votes': 0,
        'upvote_ratio': 0.5,
        'top_communities': {},
        'avg_comment_length': 0,
    })

    # Get post
    post = posts_df[posts_df['post_id'] == str(post_id)]

    if len(post) == 0:
        return 0.5  # Neutral score for unknown post

    post_community = post['community_id'].values[0]
    post_score = post['score'].values[0]

    # Cold-start score components
    community_match = 1.0 if post_community in user_prof.get('top_communities', {}) else 0.5
    popularity_score = min(1.0, post_score / 100.0)  # Normalize popularity
    user_activity = min(1.0, user_prof['total_votes'] / 100.0)

    # Weighted combination
    cold_start_score = (community_match * 0.5) + (popularity_score * 0.3) + (user_activity * 0.2)

    return cold_start_score

print(f"    New users: Content-based filtering")
print(f"    New posts: Popularity + community match")
print(f"    Gradual transition to collaborative as data accumulates")



 FEATURE 4: Cold-Start Handling...
    New users: Content-based filtering
    New posts: Popularity + community match
    Gradual transition to collaborative as data accumulates


In [7]:
# FEATURE 5: COLLABORATIVE FILTERING (User-User Similarity)
# ═════════════════════════════════════════════════════════════════════════════

print("\n FEATURE 5: Collaborative Filtering...")

# Create user-post interaction matrix
user_post_matrix = votes[votes['target_type'] == 'post'].pivot_table(
    index='user_id',
    columns='target_id',
    values='value',
    fill_value=0
)

print(f"    User-post matrix: {user_post_matrix.shape}")

# Calculate user-user similarity using cosine similarity
user_similarity = cosine_similarity(user_post_matrix.fillna(0))
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_post_matrix.index,
    columns=user_post_matrix.index
)

print(f"    User similarity matrix calculated")
print(f"    Can find similar users for recommendations")

def get_collaborative_score(user_id, post_id, user_sim_df, user_post_df):
    """Collaborative filtering score"""
    try:
        similar_users = user_sim_df[str(user_id)].nlargest(5).index[1:]  # Top 5 similar (exclude self)

        # Check if similar users voted on this post
        collaborative_votes = []
        for sim_user in similar_users:
            if str(post_id) in user_post_df.columns:
                vote = user_post_df.loc[sim_user, str(post_id)]
                if vote != 0:
                    collaborative_votes.append(vote)

        if collaborative_votes:
            return min(1.0, max(0.0, np.mean(collaborative_votes)))
        else:
            return 0.5
    except:
        return 0.5

print(f"    Ready to use collaborative recommendations")



 FEATURE 5: Collaborative Filtering...
    User-post matrix: (2939, 3757)
    User similarity matrix calculated
    Can find similar users for recommendations
    Ready to use collaborative recommendations


In [8]:
# FEATURE 6: REAL-TIME LEARNING (Incremental Updates)
# ═════════════════════════════════════════════════════════════════════════════

print("\n FEATURE 6: Real-Time Learning Setup...")

class IncrementalRecommendationLearner:
    """Online learning: update model with new votes in real-time"""

    def __init__(self):
        self.update_buffer = []
        self.update_count = 0
        self.last_retrain = datetime.now()

    def add_feedback(self, user_id, post_id, vote, timestamp=None):
        """Add new vote feedback"""
        self.update_buffer.append({
            'user_id': user_id,
            'post_id': post_id,
            'vote': vote,
            'timestamp': timestamp or datetime.now()
        })
        self.update_count += 1

        # Retrain if enough new data
        if self.update_count >= 100:  # Retrain after 100 new votes
            self.should_retrain = True
        else:
            self.should_retrain = False

    def get_update_stats(self):
        """Get learning statistics"""
        return {
            'updates_buffered': len(self.update_buffer),
            'total_updates': self.update_count,
            'last_retrain': self.last_retrain,
            'should_retrain': self.should_retrain
        }

learner = IncrementalRecommendationLearner()
print(f"    Real-time learning initialized")
print(f"    Automatic retraining after 100 new votes")
print(f"   System learns from user feedback continuously")



 FEATURE 6: Real-Time Learning Setup...
    Real-time learning initialized
    Automatic retraining after 100 new votes
   System learns from user feedback continuously


In [9]:
# FEATURE ENGINEERING WITH ALL FEATURES INTEGRATED
# ═════════════════════════════════════════════════════════════════════════════

print("\n Creating comprehensive features with all 6 components...")

# Prepare samples
post_votes = votes[votes['target_type'] == 'post'].copy()
upvote_samples = post_votes[post_votes['value'] > 0][['user_id', 'target_id']].drop_duplicates()
upvote_samples.rename(columns={'target_id': 'post_id'}, inplace=True)
upvote_samples['label'] = 1

downvote_samples = post_votes[post_votes['value'] < 0][['user_id', 'target_id']].drop_duplicates()
downvote_samples.rename(columns={'target_id': 'post_id'}, inplace=True)
downvote_samples['label'] = 0

samples = pd.concat([upvote_samples, downvote_samples], ignore_index=True).drop_duplicates(['user_id', 'post_id'])

# Feature engineering with ALL 6 components
features_list = []

for idx, row in samples.iterrows():
    user_id = str(row['user_id'])
    post_id = str(row['post_id'])
    label = row['label']

    features = {'label': label}

    # Base features
    user_data = users[users['user_id'] == user_id]
    features['karma_posts'] = user_data['karma_posts'].values[0] if len(user_data) > 0 else 0
    features['karma_comments'] = user_data['karma_comments'].values[0] if len(user_data) > 0 else 0

    post_data = posts[posts['post_id'] == post_id]
    features['score'] = post_data['score'].values[0] if len(post_data) > 0 else 0
    features['spam_score'] = post_data['spam_score'].values[0] if len(post_data) > 0 else 0

    # FEATURE 1: NLP embeddings
    if post_embed_df is not None and post_id in post_embed_df.index:
        for i in range(20):
            features[f'embed_{i}'] = post_embed_df.loc[post_id, f'embed_{i}']
    else:
        for i in range(20):
            features[f'embed_{i}'] = 0

    # FEATURE 2: User preference profile
    user_prof = user_profiles.get(user_id, {})
    features['user_upvote_ratio'] = user_prof.get('upvote_ratio', 0.5)
    features['user_total_votes'] = user_prof.get('total_votes', 0)

    # FEATURE 3: Temporal weight
    features['temporal_weight'] = post_data['temporal_weight'].values[0] if len(post_data) > 0 else 0.5

    # FEATURE 4: Cold-start score
    features['cold_start_score'] = get_cold_start_score(user_id, post_id, user_profiles, posts, communities)

    # FEATURE 5: Collaborative score
    features['collab_score'] = get_collaborative_score(user_id, post_id, user_similarity_df, user_post_matrix)

    features_list.append(features)

features_df = pd.DataFrame(features_list).fillna(0)

print(f"    Created {len(features_df)} samples")
print(f"    Features: {len(features_df.columns)-1} total")
print(f"   ├─ Base: 4 features")
print(f"   ├─ NLP: 20 embeddings")
print(f"   ├─ Profiles: 2 features")
print(f"   ├─ Temporal: 1 feature")
print(f"   ├─ Cold-start: 1 feature")
print(f"   └─ Collaborative: 1 feature")



 Creating comprehensive features with all 6 components...
    Created 11213 samples
    Features: 29 total
   ├─ Base: 4 features
   ├─ NLP: 20 embeddings
   ├─ Profiles: 2 features
   ├─ Temporal: 1 feature
   ├─ Cold-start: 1 feature
   └─ Collaborative: 1 feature


In [10]:
# TRAINING WITH ADVANCED FEATURES
# ═════════════════════════════════════════════════════════════════════════════

print("\n Training with ALL advanced features...")

feature_cols = [col for col in features_df.columns if col != 'label']
X = features_df[feature_cols].copy()
y = features_df['label'].copy()

# SMOTE
try:
    from imblearn.over_sampling import SMOTE
    smote = SMOTE(sampling_strategy=0.85, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)
except:
    X_balanced, y_balanced = X.copy(), y.copy()

# Split & scale
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.1, random_state=42, stratify=y_balanced)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ensemble with all features
model = VotingClassifier([
    ('hgb', HistGradientBoostingClassifier(max_iter=500, learning_rate=0.01, max_depth=7, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=500, learning_rate=0.01, max_depth=6, random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=500, max_depth=15, random_state=42))
], voting='soft')

model.fit(X_train_scaled, y_train)
print("    Model trained with all features")


 Training with ALL advanced features...
    Model trained with all features


In [11]:
# EVALUATION
# ═════════════════════════════════════════════════════════════════════════════

print("\n RESULTS WITH ALL 6 ADVANCED FEATURES:")
print("="*100)

y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
y_pred = (y_pred_proba >= 0.48).astype(int)

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)

print(f"\n ACCURACY: {acc*100:.2f}%")
print(f"   AUC-ROC: {auc:.4f}")
print(f"   F1-Score: {f1:.4f}")




 RESULTS WITH ALL 6 ADVANCED FEATURES:

 ACCURACY: 96.89%
   AUC-ROC: 0.9965
   F1-Score: 0.9708


# RANKING MODEL
Converts upvote probabilities into personalized ranked feeds


In [12]:


import numpy as np
import pandas as pd
from typing import List, Dict, Tuple

class RankingModel:
    """Convert predictions to ranked personalized feed"""

    def __init__(self,
                 use_temporal_weight=True,
                 use_diversity=True,
                 use_freshness=True):
        self.use_temporal_weight = use_temporal_weight
        self.use_diversity = use_diversity
        self.use_freshness = use_freshness

    def rank_posts(self,
                   user_id: str,
                   candidate_posts: List[Dict],
                   upvote_probabilities: np.ndarray,
                   posts_df: pd.DataFrame) -> List[Dict]:
        """
        Rank candidate posts by personalized score

        Args:
            user_id: User ID
            candidate_posts: List of post dicts
            upvote_probabilities: ML model predictions (0-1)
            posts_df: Posts dataframe

        Returns:
            Ranked list of posts with scores
        """

        scores = []

        for idx, post in enumerate(candidate_posts):
            post_id = post['post_id']

            # Base score: upvote probability
            base_score = upvote_probabilities[idx]

            # Get post metadata
            post_data = posts_df[posts_df['post_id'] == post_id]

            # Score components
            components = {}

            # 1. Upvote probability (core)
            components['upvote_prob'] = base_score * 0.4

            # 2. Temporal freshness
            if self.use_freshness and len(post_data) > 0:
                temporal_weight = post_data['temporal_weight'].values[0]
                components['freshness'] = temporal_weight * 0.2
            else:
                components['freshness'] = 0

            # 3. Engagement (comments/score)
            if len(post_data) > 0:
                engagement = min(1.0, post_data['num_comments'].values[0] / 100.0)
                components['engagement'] = engagement * 0.15
            else:
                components['engagement'] = 0

            # 4. Quality (spam score)
            if len(post_data) > 0:
                quality = 1 - max(0, min(1, post_data['spam_score'].values[0]))
                components['quality'] = quality * 0.15
            else:
                components['quality'] = 0.5 * 0.15

            # 5. Diversity penalty (penalize similar posts)
            if self.use_diversity:
                # Simple: penalize if post community already shown
                diversity_penalty = 0.1  # Could be more sophisticated
                components['diversity'] = -diversity_penalty * 0.1
            else:
                components['diversity'] = 0

            # Final score
            final_score = sum(components.values())

            scores.append({
                'post_id': post_id,
                'final_score': final_score,
                'components': components,
                'upvote_prob': base_score,
                'rank': None
            })

        # Sort by score
        scores = sorted(scores, key=lambda x: x['final_score'], reverse=True)

        # Add rank
        for rank, item in enumerate(scores, 1):
            item['rank'] = rank

        return scores

    def get_top_k(self, ranked_posts: List[Dict], k: int = 10) -> List[Dict]:
        """Get top K posts"""
        return ranked_posts[:k]

    def apply_business_rules(self, ranked_posts: List[Dict]) -> List[Dict]:
        """Apply business rules (e.g., no duplicate communities, NSFW filtering)"""
        seen_communities = set()
        filtered = []

        for post in ranked_posts:
            community_id = post.get('community_id')

            # Rule 1: Max 2 posts per community
            if community_id in seen_communities:
                count = sum(1 for p in filtered if p.get('community_id') == community_id)
                if count >= 2:
                    continue

            # Rule 2: Min quality threshold
            if post.get('components', {}).get('quality', 0) < 0.05:
                continue

            filtered.append(post)
            seen_communities.add(community_id)

        return filtered


# Example usage
if __name__ == "__main__":
    ranker = RankingModel(
        use_temporal_weight=True,
        use_diversity=True,
        use_freshness=True
    )

    print(" Ranking model initialized")
    print(" Ready to rank personalized feeds")

 Ranking model initialized
 Ready to rank personalized feeds


In [13]:
# SAVE & SUMMARY
# ═════════════════════════════════════════════════════════════════════════════

print("\n Saving  model...")

artifacts = {
    'model': model,
    'scaler': scaler,
    'ranking_model': RankingModel,
    'feature_cols': feature_cols,
    'user_profiles': user_profiles,
    'user_similarity': user_similarity_df,
    'post_embeddings': post_embed_df,
    'learner': learner,
    'performance': {'accuracy': acc, 'auc': auc, 'f1': f1},
    'features_included': {
        'nlp_understanding': True,
        'user_preferences': True,
        'temporal_weighting': True,
        'cold_start': True,
        'collaborative_filtering': True,
        'real_time_learning': True,
        'ranking_model': True
    }
}

joblib.dump(artifacts, 'GlobalBene_recommendation_engine.pkl')
print("    Saved to GlobalBene_recommendation_engine.pkl")



 Saving  model...
    Saved to GlobalBene_recommendation_engine.pkl


In [14]:
from google.colab import files

files.download("GlobalBene_recommendation_engine.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
print(f"\nAccuracy: {acc*100:.2f}% | AUC: {auc:.4f} | F1: {f1:.4f}")


Accuracy: 96.89% | AUC: 0.9965 | F1: 0.9708
