In [4]:
import ast
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from transformers import pipeline

# Keep cell output readable: ignore every warning category for this session.
# NOTE(review): this also hides deprecation notices from pandas/transformers.
warnings.simplefilter('ignore')

# Shared plot defaults: seaborn white-grid theme, 10x6 inch figures.
sns.set_theme(style="whitegrid")
plt.rc('figure', figsize=[10, 6])

print("✅ Libraries imported successfully.")

  from .autonotebook import tqdm as notebook_tqdm


✅ Libraries imported successfully.


In [7]:
class AdvancedMessageMetrics:
    """Scores posts on comment quality (UCQS) and on engagement efficiency.

    Two Hugging Face text-classification pipelines are loaded at construction
    time: a sentiment model and a toxicity model. Both are configured with
    ``top_k=None`` so that every label's score is returned for each text.

    Parameters
    ----------
    device : int | str | None, optional
        Device handed to both pipelines (e.g. ``-1`` for CPU, ``0`` for the
        first CUDA GPU, ``"mps"`` for Apple Silicon). When ``None`` (default)
        the best available device is picked automatically, falling back to
        CPU.
    """

    SENTIMENT_MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
    TOXICITY_MODEL = "unitary/multilingual-toxic-xlm-roberta"

    # Token limit enforced by the pipelines (truncation=True).
    MAX_TOKENS = 512
    # Cheap character-level pre-trim applied before tokenization, for speed.
    MAX_CHARS = 512
    # Score returned when there is nothing usable to analyse.
    NEUTRAL_UCQS = 50.0

    # Some checkpoints expose index-style labels instead of names.
    _INDEXED_SENTIMENT_LABELS = {
        'label_0': 'negative',
        'label_1': 'neutral',
        'label_2': 'positive',
    }

    def __init__(self, device=None):
        # Both models run on the same device so behaviour does not depend on
        # which machine the notebook is executed on.
        self.device = self._resolve_device(device)

        print("⏳ Loading Sentiment Model (XLM-RoBERTa)...")
        self.sentiment_pipe = pipeline(
            "sentiment-analysis",
            model=self.SENTIMENT_MODEL,
            tokenizer=self.SENTIMENT_MODEL,
            top_k=None,
            truncation=True,
            max_length=self.MAX_TOKENS,
            device=self.device
        )

        print("⏳ Loading Toxicity Model (Multilingual Toxic XLM-R)...")
        self.toxic_pipe = pipeline(
            "text-classification",
            model=self.TOXICITY_MODEL,
            tokenizer=self.TOXICITY_MODEL,
            top_k=None,
            truncation=True,
            max_length=self.MAX_TOKENS,
            device=self.device
        )
        print("✅ Models loaded!")

    @staticmethod
    def _resolve_device(device=None):
        """Return ``device`` unchanged, or auto-detect one when it is ``None``.

        Preference order for auto-detection: CUDA (``0``), Apple MPS
        (``"mps"``), then CPU (``-1``). CPU is also used when PyTorch cannot
        be imported.
        """
        if device is not None:
            return device
        try:
            import torch  # local import: only needed for device detection
        except ImportError:
            return -1
        if torch.cuda.is_available():
            return 0
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return "mps"
        return -1

    def _get_sentiment_score(self, text):
        """Converts model output into a number from -1.0 (Negative) to 1.0 (Positive).

        The score is ``P(positive) - P(negative)``. Any failure while running
        or decoding the model is reported on stdout and scored as ``0.0``
        (best-effort, so one bad comment does not abort a whole run).
        """
        try:
            results = self.sentiment_pipe(text)[0]
            scores = {}
            for item in results:
                label = item['label'].lower()
                # Map label_0/1/2 onto negative/neutral/positive.
                label = self._INDEXED_SENTIMENT_LABELS.get(label, label)
                scores[label] = item['score']

            # Formula: Positive minus Negative
            return scores.get('positive', 0.0) - scores.get('negative', 0.0)
        except Exception as e:
            print(f"Error in sentiment: {e}")
            return 0.0

    def _get_toxicity_score(self, text):
        """Returns toxicity probability (0.0 - 1.0).

        Only the general ``toxic`` label is used; ``0.0`` is returned when the
        label is absent or when the model call fails (the error is printed).
        """
        try:
            results = self.toxic_pipe(text)[0]
            # Lower-case labels so lookup matches the sentiment helper.
            scores = {item['label'].lower(): item['score'] for item in results}
            return scores.get('toxic', 0.0)
        except Exception as e:
            print(f"Error in toxicity: {e}")
            return 0.0

    def calculate_ucqs(self, comments_content):
        """
        Calculates Unified Comment Quality Score (UCQS).

        Input: List/tuple of comment texts, or the string representation of
               one (as produced when a list column is round-tripped via CSV).
        Output: Float 0-100. ``50.0`` (neutral baseline) is returned when the
                input cannot be parsed, is not a sequence, or contains no
                non-blank string.

        Formula: 50 + 40 * mean(sentiment) - 20 * std(sentiment)
                 - 30 * mean(toxicity), clipped to [0, 100].
        """
        # 1. Validation and parsing of input data
        if isinstance(comments_content, str):
            try:
                comments_content = ast.literal_eval(comments_content)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Malformed literal: fall back to the neutral baseline.
                return self.NEUTRAL_UCQS

        if not isinstance(comments_content, (list, tuple)) or not comments_content:
            return self.NEUTRAL_UCQS

        # 2. Keep only non-blank strings, pre-trimmed for speed
        texts = [
            text[:self.MAX_CHARS]
            for text in comments_content
            if isinstance(text, str) and text.strip()
        ]
        if not texts:
            return self.NEUTRAL_UCQS

        sentiments = []
        toxicity_scores = []
        for short_text in texts:
            sentiments.append(self._get_sentiment_score(short_text))
            toxicity_scores.append(self._get_toxicity_score(short_text))

        # 3. Aggregating statistics
        s_avg = np.mean(sentiments)       # Average sentiment (-1..1)
        s_std = np.std(sentiments)        # Sentiment spread (standard deviation)
        t_avg = np.mean(toxicity_scores)  # Average toxicity (0..1)

        # 4. Final UCQS Formula
        # Base 50 + (Sentiment * 40) - (Spread * 20) - (Toxicity * 30)
        ucqs = 50 + (s_avg * 40) - (s_std * 20) - (t_avg * 30)

        return float(np.clip(ucqs, 0, 100))

    def calculate_efficiency(self, score, num_comments, upvote_ratio):
        """Calculates Adjusted Efficiency Score (Virality).

        Computes ``log1p(max(score + 2 * num_comments, 0)) * upvote_ratio``
        rounded to two decimals. Missing ``score``/``num_comments`` count as
        0 and a missing ``upvote_ratio`` as 0.5.
        """
        # Handle null values
        score = score if pd.notnull(score) else 0
        comments = num_comments if pd.notnull(num_comments) else 0
        ratio = upvote_ratio if pd.notnull(upvote_ratio) else 0.5

        # Comments are weighted higher (x2) as they require more effort
        raw_engagement = score + (comments * 2.0)

        # Heavily downvoted posts can have a negative total; log1p is
        # undefined at or below -1, so clamp to zero engagement.
        raw_engagement = max(raw_engagement, 0.0)

        # Logarithm smooths out outliers (million-view posts)
        log_engagement = np.log1p(raw_engagement)

        return round(float(log_engagement * ratio), 2)

In [None]:
# Location of the Reddit dump. Set the REDDIT_CSV_PATH environment variable to
# point at your own copy; the fallback is the original author's local path.
REDDIT_CSV_PATH = os.environ.get(
    "REDDIT_CSV_PATH",
    "/Users/ivantyshchenko/Projects/Python/DataPatron/data/reddit.csv",
)

df = pd.read_csv(REDDIT_CSV_PATH)
metrics_engine = AdvancedMessageMetrics()

# 1. Calculate the engagement efficiency score for every post (cheap, no model calls)
df['Efficiency'] = df.apply(
    lambda row: metrics_engine.calculate_efficiency(
        row['score'], row['num_comments'], row['upvote_ratio']
    ),
    axis=1,
)

# 2. Calculate UCQS (this may take time depending on the number of comments:
#    each comment is run through both the sentiment and the toxicity model)
df['UCQS'] = df['comments_content'].apply(metrics_engine.calculate_ucqs)


⏳ Loading Sentiment Model (XLM-RoBERTa)...
