<a href="https://colab.research.google.com/github/yuugiouduele/AImodel/blob/main/SNS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
import networkx as nx

# BERT関連のライブラリをインポート
from transformers import BertForSequenceClassification, BertJapaneseTokenizer
from torch.optim import AdamW # ここを変更
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# オプション：進捗バー表示
from tqdm.notebook import tqdm


# --- 1. Data Acquisition Layer (DUMMY DATA GENERATION) ---
# NOTE: The dummy data generators below are carried over unchanged from the previous version of this notebook.
def get_youtube_comments_and_videos(query, max_results=100):
    """Build ``max_results`` randomly generated YouTube video records for ``query``.

    Nothing is fetched from a real API: numeric fields come from ``np.random``
    and text fields are fixed templates. Every record carries a nested
    ``comments`` list whose length equals its ``comments_count``.

    Args:
        query: Search phrase embedded in the generated ids and texts.
        max_results: Number of video records to generate.

    Returns:
        A list of dicts, one per simulated video.
    """
    query_slug = query.replace(" ", "_")
    videos = []

    for video_idx in range(max_results):
        # The random draws below happen in a fixed order so that a seeded
        # generator keeps producing the same data set.
        comment_total = np.random.randint(0, 50)

        video_comments = []
        for comment_idx in range(comment_total):
            video_comments.append({
                'comment_id': f'yt_comm_{video_idx}_{comment_idx}',
                'text': f'Great video {comment_idx} about {query}!',
                'likes': np.random.randint(0, 50),
            })

        # Publication time: up to 30 days (plus up to 23 hours) in the past.
        published_at = datetime.now() - timedelta(
            days=np.random.randint(0, 30),
            hours=np.random.randint(0, 24),
        )
        uploader_no = np.random.randint(1, 100)

        videos.append({
            'platform': 'YouTube',
            'content_id': f'yt_video_{video_idx}_{query_slug}',
            'content_type': 'video',
            'publish_timestamp': published_at.isoformat(),
            'author_id': f'yt_user_{uploader_no}',
            'author_followers': np.random.randint(100, 100000),
            'content_text': f'This is a sample video content about {query} number {video_idx}. This is a long text to simulate real data.',
            'likes_count': np.random.randint(10, 5000),
            'comments_count': comment_total,
            'views_count': np.random.randint(100, 1000000),
            'shares_count': np.random.randint(0, 200),
            # Placeholder for the video's aggregated comment likes.
            'comment_likes': np.random.randint(0, 50),
            # The individual simulated comments.
            'comments': video_comments,
        })

    return videos

def get_x_tweets(query, max_results=100):
    """Build ``max_results`` randomly generated X (Twitter) post records for ``query``.

    Nothing is fetched from a real API: numeric fields come from ``np.random``
    and text fields are fixed templates. The ``comments`` list is always
    empty; only the reply count is simulated.

    Args:
        query: Search phrase embedded in the generated ids and texts.
        max_results: Number of tweet records to generate.

    Returns:
        A list of dicts, one per simulated tweet.
    """
    query_slug = query.replace(" ", "_")
    tweets = []

    for tweet_idx in range(max_results):
        # The random draws below happen in a fixed order so that a seeded
        # generator keeps producing the same data set.
        reply_total = np.random.randint(0, 30)  # replies play the role of comments on X

        # Publication time: up to 72 hours (plus up to 59 minutes) in the past.
        posted_at = datetime.now() - timedelta(
            hours=np.random.randint(0, 72),
            minutes=np.random.randint(0, 60),
        )
        poster_no = np.random.randint(1, 200)

        tweets.append({
            'platform': 'X',
            'content_id': f'x_tweet_{tweet_idx}_{query_slug}',
            'content_type': 'tweet',
            'publish_timestamp': posted_at.isoformat(),
            'author_id': f'x_user_{poster_no}',
            'author_followers': np.random.randint(50, 50000),
            'content_text': f'Just tweeted about {query} topic {tweet_idx}. #trend #topic. This tweet is quite engaging.',
            'likes_count': np.random.randint(5, 1000),
            'comments_count': reply_total,
            'retweets_count': np.random.randint(0, 150),
            # Shares are drawn independently of retweets in this simulation.
            'shares_count': np.random.randint(0, 150),
            'quote_tweets_count': np.random.randint(0, 50),
            'views_count': np.random.randint(1000, 500000),
            'comment_likes': np.random.randint(0, 30),
            # No nested replies are generated for X posts.
            'comments': [],
        })

    return tweets

def get_instagram_posts(query, max_results=100):
    """Build ``max_results`` randomly generated Instagram post records for ``query``.

    Nothing is fetched from a real API: numeric fields come from ``np.random``
    and text fields are fixed templates. Every record carries a nested
    ``comments`` list whose length equals its ``comments_count``.

    Args:
        query: Search phrase embedded in the generated ids and texts.
        max_results: Number of post records to generate.

    Returns:
        A list of dicts, one per simulated post.
    """
    query_slug = query.replace(" ", "_")
    posts = []

    for post_idx in range(max_results):
        # The random draws below happen in a fixed order so that a seeded
        # generator keeps producing the same data set.
        comment_total = np.random.randint(0, 40)

        post_comments = []
        for comment_idx in range(comment_total):
            post_comments.append({
                'comment_id': f'insta_comm_{post_idx}_{comment_idx}',
                'text': f'Love this! {comment_idx} #insta. Very cool content.',
                'likes': np.random.randint(0, 20),
            })

        # Publication time: up to 60 days (plus up to 23 hours) in the past.
        posted_at = datetime.now() - timedelta(
            days=np.random.randint(0, 60),
            hours=np.random.randint(0, 24),
        )
        poster_no = np.random.randint(1, 50)

        posts.append({
            'platform': 'Instagram',
            'content_id': f'insta_post_{post_idx}_{query_slug}',
            'content_type': 'image',
            'publish_timestamp': posted_at.isoformat(),
            'author_id': f'insta_user_{poster_no}',
            'author_followers': np.random.randint(200, 200000),
            'content_text': f'Beautiful post about {query} {post_idx}. #instadaily. This photo captures the essence.',
            'likes_count': np.random.randint(20, 10000),
            'comments_count': comment_total,
            'views_count': np.random.randint(500, 500000),
            'shares_count': np.random.randint(0, 100),
            # Placeholder for the post's aggregated comment likes.
            'comment_likes': np.random.randint(0, 20),
            # The individual simulated comments.
            'comments': post_comments,
        })

    return posts

def collect_all_data(queries, max_results_per_platform=100):
    """Aggregate dummy data from all simulated platforms into one DataFrame.

    Each generated post becomes one row (``is_comment`` False). Each nested
    comment becomes an additional row (``is_comment`` True) that inherits the
    parent post's fields but uses the comment's id, text and like count.

    Args:
        queries: Iterable of search phrases passed to every platform generator.
        max_results_per_platform: Number of posts requested per platform and
            query.

    Returns:
        A DataFrame with one row per post and per comment, with
        ``publish_timestamp`` parsed to datetimes. When nothing was collected
        (e.g. ``queries`` is empty) an empty DataFrame is returned.
    """
    collected = []
    for query in queries:
        collected.extend(get_youtube_comments_and_videos(query, max_results_per_platform))
        collected.extend(get_x_tweets(query, max_results_per_platform))
        collected.extend(get_instagram_posts(query, max_results_per_platform))

    flat_rows = []
    for item in collected:
        # Read the nested comments without mutating the collected record.
        comments = item.get('comments') or []

        # Row for the post itself. It keeps its nested 'comments' list so the
        # resulting column set is the same as before.
        post_row = dict(item)
        post_row['is_comment'] = False
        # The post's own text doubles as its "comment" text for text-based features.
        post_row['comment_text'] = post_row['content_text']
        # The post's like count is used as a proxy for comment likes.
        post_row['comment_likes'] = post_row.get('likes_count', 0)
        flat_rows.append(post_row)

        # One row per comment, inheriting every parent field except the nested list.
        parent_fields = {key: value for key, value in item.items() if key != 'comments'}
        for comment in comments:
            comment_row = dict(parent_fields)
            comment_row['content_id'] = comment['comment_id']  # comment id stands in as content_id
            comment_row['content_text'] = comment['text']
            # Keep comment_text populated on comment rows too (it was left as NaN before).
            comment_row['comment_text'] = comment['text']
            comment_row['comment_likes'] = comment.get('likes', 0)
            comment_row['is_comment'] = True
            flat_rows.append(comment_row)

    df = pd.DataFrame(flat_rows)
    if df.empty:
        # No rows means no columns either; indexing 'publish_timestamp' would raise KeyError.
        return df

    df['publish_timestamp'] = pd.to_datetime(df['publish_timestamp'])
    return df

# --- 2. Feature Engineering Layer ---
def extract_features(df):
    """Add time, sentiment, trend, monetization and age-group columns to ``df``.

    The frame is modified in place and also returned. Sentiment, toxicity and
    the age-group columns are random placeholders; the trend label is derived
    from a quantile threshold on the computed trend score.

    Args:
        df: DataFrame with at least ``publish_timestamp`` (datetime),
            ``content_id``, ``likes_count``, ``comments_count``,
            ``shares_count`` and ``author_followers`` columns.

    Returns:
        The same DataFrame object with the feature columns added.
    """
    print("--- 2. Extracting Features ---")

    row_count = len(df)
    published = df['publish_timestamp']

    # Calendar features (dayofweek: Monday=0 ... Sunday=6).
    df['hour_of_day'] = published.dt.hour
    weekday = published.dt.dayofweek
    df['day_of_week'] = weekday
    df['is_weekend'] = weekday.isin([5, 6]).astype(int)

    # Placeholder sentiment in [-1, 1) and toxicity in [0, 1).
    df['sentiment_score'] = np.random.uniform(-1.0, 1.0, row_count)
    df['toxicity_score'] = np.random.uniform(0.0, 1.0, row_count)

    # Dummy trend score: total engagement per second since publication,
    # scaled by 1e9. A zero age is replaced to avoid dividing by zero.
    engagement_total = df['likes_count'] + df['comments_count'] + df['shares_count']
    age_seconds = (datetime.now() - published).dt.total_seconds().replace(0, 1e-6)
    df['trend_score'] = (engagement_total / age_seconds * 1e9).fillna(0)

    # Speculative monetization proxy: audience size times engagement.
    df['monetization_potential'] = (df['author_followers'] * engagement_total).fillna(0)

    # Ground-truth trend label: 1 when the trend score exceeds the 70th
    # percentile computed over one row per content_id, else 0.
    per_content_trend = df.drop_duplicates(subset='content_id')['trend_score']
    if per_content_trend.empty:
        df['actual_trend_category'] = 0  # no data to derive a threshold from
    else:
        cutoff = per_content_trend.quantile(0.7)
        df['actual_trend_category'] = (df['trend_score'] > cutoff).astype(int)

    # Random placeholder popularity per age group, plus a random dominant group.
    age_groups = ['Teens', '20s', '30s', '40s+', 'All_Ages']
    popularity_maps = []
    for _ in range(row_count):
        popularity_maps.append({group: np.random.uniform(0, 1) for group in age_groups})
    df['age_group_popularity'] = popularity_maps
    df['dominant_age_group'] = np.random.choice(age_groups, size=row_count)

    return df

# --- 3. AI Model for Analysis & Ranking Layer ---
# Raw inputs that feed the composite empathy/awareness/popularity/virality scores.
_SCORE_INPUT_COLS = [
    'likes_count', 'comments_count', 'views_count', 'shares_count',
    'author_followers', 'sentiment_score', 'trend_score',
    'monetization_potential', 'pagerank_score', 'comment_likes'
]

# The four composite scores, also used as clustering features.
_COMPOSITE_SCORE_COLS = ['empathy_score', 'awareness_score', 'popularity_score', 'virality_score']

# Columns analyze_and_rank relies on; missing ones are created with defaults.
_REQUIRED_COLS = [
    'publish_timestamp', 'content_id', 'platform', 'author_id', 'likes_count',
    'comments_count', 'views_count', 'shares_count', 'author_followers',
    'hour_of_day', 'day_of_week', 'is_weekend', 'comment_likes',
    'sentiment_score', 'toxicity_score', 'trend_score', 'monetization_potential',
    'actual_trend_category', 'age_group_popularity', 'dominant_age_group', 'content_text'
]
_TEXT_COLS = {'content_text', 'content_id', 'platform', 'author_id', 'dominant_age_group'}
# Columns that must NOT be coerced to numbers. 'publish_timestamp' is listed
# here because pd.to_numeric would otherwise replace the datetimes by integers.
_NON_NUMERIC_COLS = _TEXT_COLS | {'publish_timestamp', 'age_group_popularity', 'is_comment'}


def _minmax(values):
    """Rescale a 1-D numeric sequence to [0, 1] and return a 1-D array."""
    column = np.asarray(values, dtype=float).reshape(-1, 1)
    return MinMaxScaler().fit_transform(column).ravel()


def _ensure_required_columns(df):
    """Create missing required columns and coerce the numeric ones (in place)."""
    for col in _REQUIRED_COLS:
        if col not in df.columns:
            if col == 'age_group_popularity':
                df[col] = [{} for _ in range(len(df))]
            elif col in _TEXT_COLS:
                df[col] = ''
            elif col == 'publish_timestamp':
                df[col] = pd.NaT
            else:
                df[col] = 0.0
        if col not in _NON_NUMERIC_COLS:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0)
    return df


def _add_composite_scores(frame):
    """Compute engagement_rate and the four composite scores on ``frame`` (in place).

    Each composite score is a weighted sum of min-max scaled and log1p
    transformed inputs, itself rescaled to [0, 1].
    """
    for col in _SCORE_INPUT_COLS:
        if col not in frame.columns:
            frame[col] = 0.0
        frame[col] = pd.to_numeric(frame[col], errors='coerce').fillna(0.0)

    # Zero followers are replaced by 1 to avoid dividing by zero.
    frame['engagement_rate'] = (
        frame['likes_count'] / frame['author_followers'].replace(0, 1)
    ).fillna(0.0)

    scale_cols = _SCORE_INPUT_COLS + ['engagement_rate']
    scaled = pd.DataFrame(
        MinMaxScaler().fit_transform(frame[scale_cols]),
        columns=scale_cols,
        index=frame.index,
    )

    # 1. Empathy / sentiment score
    frame['empathy_score'] = _minmax(
        0.6 * scaled['sentiment_score']
        + 0.2 * np.log1p(frame['comments_count'])
        + 0.2 * np.log1p(frame['comment_likes'])
    )
    # 2. Awareness / visibility score
    frame['awareness_score'] = _minmax(
        0.4 * np.log1p(frame['views_count'])
        + 0.3 * np.log1p(frame['author_followers'])
        + 0.3 * scaled['trend_score']
    )
    # 3. Popularity score
    frame['popularity_score'] = _minmax(
        0.5 * np.log1p(frame['likes_count'])
        + 0.3 * scaled['engagement_rate']
        + 0.2 * scaled['pagerank_score']
    )
    # 4. Virality / spread score
    frame['virality_score'] = _minmax(
        0.6 * np.log1p(frame['shares_count'])
        + 0.4 * np.log1p(frame['comments_count'])
    )
    return frame


def _cluster_unique_content(df):
    """Cluster one row per content_id on the composite scores; return df with 'content_cluster'."""
    # A stale cluster column would shadow the freshly merged one.
    df = df.drop(columns=['content_cluster'], errors='ignore')
    unique_content = df.drop_duplicates(subset='content_id').copy()

    # PageRank is computed later; use a random placeholder for clustering only.
    if 'pagerank_score' not in unique_content.columns:
        unique_content['pagerank_score'] = np.random.uniform(0.01, 1.0, len(unique_content))

    _add_composite_scores(unique_content)
    features = unique_content[_COMPOSITE_SCORE_COLS].dropna()

    if len(features) >= 2:
        n_clusters = min(4, len(features))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
        unique_content.loc[features.index, 'content_cluster'] = kmeans.fit_predict(features)

        print(f"\nContent Cluster Sizes (K={n_clusters}):\n{unique_content['content_cluster'].value_counts().to_string()}")
        print("\nAverage Scores per Cluster:")
        cluster_summary = unique_content.groupby('content_cluster')[
            ['empathy_score', 'awareness_score', 'popularity_score', 'virality_score',
             'likes_count', 'comments_count', 'views_count']
        ].mean()
        print(cluster_summary.to_string())
    else:
        print("Not enough data points for meaningful clustering on composite scores (need at least 2 unique samples).")
        unique_content['content_cluster'] = -1

    # Propagate the cluster id to every row sharing the content_id.
    df = df.merge(unique_content[['content_id', 'content_cluster']], on='content_id', how='left')
    df['content_cluster'] = df['content_cluster'].fillna(-1).astype(int)  # -1 = not clustered
    return df


def _tokenize_texts(texts, tokenizer, max_len):
    """Encode texts to fixed-length input-id and attention-mask tensors."""
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,      # add [CLS] / [SEP]
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)


def _bert_predict(model, dataloader, device):
    """Run the model in eval mode over ``dataloader`` and return the argmax class per sample."""
    model.eval()
    predicted = []
    for batch in tqdm(dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs.logits.detach().cpu().numpy()
        predicted.extend(np.argmax(logits, axis=1).flatten())
    return predicted


def _predict_trend_with_bert(df_for_bert, batch_size, num_epochs, max_seq_len,
                             learning_rate, validation_split, model_name, device):
    """Fine-tune BERT on the trend labels and predict a label for every row of ``df_for_bert``.

    Returns a DataFrame with 'content_id' and 'predicted_trend_category', or
    None when a stratified train/validation split cannot be built.
    """
    # The cluster id is appended to the text as a simple way of feeding it to BERT.
    texts = [
        f"{text} [SEP] クラスタ: {cluster}"
        for text, cluster in zip(df_for_bert['content_text'], df_for_bert['content_cluster'])
    ]
    # Integer class ids are required by the classification loss.
    labels = df_for_bert['actual_trend_category'].astype('int64').to_numpy()

    try:
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            texts, labels, test_size=validation_split, random_state=42, stratify=labels
        )
    except ValueError as err:
        # Raised e.g. when a class has fewer than 2 members or the split is too small.
        print(f"Could not create a stratified train/validation split: {err}. Skipping BERT training.")
        return None

    print(f"Loading BERT model: {model_name}")
    tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # high / low trend
    model.to(device)

    train_input_ids, train_attention_masks = _tokenize_texts(train_texts, tokenizer, max_seq_len)
    val_input_ids, val_attention_masks = _tokenize_texts(val_texts, tokenizer, max_seq_len)

    train_dataset = TensorDataset(train_input_ids, train_attention_masks, torch.tensor(train_labels))
    val_dataset = TensorDataset(val_input_ids, val_attention_masks, torch.tensor(val_labels))

    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
    validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    print("\nStarting BERT fine-tuning...")
    for epoch_i in range(num_epochs):
        print(f"\n======== Epoch {epoch_i + 1} / {num_epochs} ========")
        print("Training...")

        total_train_loss = 0
        model.train()

        for batch in tqdm(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping
            optimizer.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f"  Average training loss: {avg_train_loss:.2f}")

        print("Running Validation...")
        # The validation loader is sequential, so predictions line up with val_labels.
        val_predictions = _bert_predict(model, validation_dataloader, device)
        eval_accuracy = accuracy_score(val_labels, val_predictions)
        eval_f1 = f1_score(val_labels, val_predictions, average='weighted', zero_division=0)

        print(f"  Accuracy: {eval_accuracy:.4f}")
        print(f"  F1 Score (weighted): {eval_f1:.4f}")

    print("\nBERT fine-tuning complete.")

    print("\nMaking final predictions with fine-tuned BERT model...")
    all_input_ids, all_attention_masks = _tokenize_texts(texts, tokenizer, max_seq_len)
    all_dataset = TensorDataset(all_input_ids, all_attention_masks)
    all_dataloader = DataLoader(all_dataset, sampler=SequentialSampler(all_dataset), batch_size=batch_size)

    return pd.DataFrame({
        'content_id': df_for_bert['content_id'].to_numpy(),
        'predicted_trend_category': _bert_predict(model, all_dataloader, device),
    })


def _attach_pagerank(df):
    """Add a 'pagerank_score' column from a content graph linked through shared authors.

    Falls back to random placeholder scores when the graph cannot be built or
    PageRank fails.
    """
    authors = df['author_id'].unique()
    content_ids = df['content_id'].unique()
    scores = None

    if len(authors) == 0 or len(content_ids) == 0:
        print("No authors or content found to build graph for PageRank.")
    else:
        graph = nx.Graph()
        graph.add_nodes_from(authors, bipartite=0)      # authors
        graph.add_nodes_from(content_ids, bipartite=1)  # content

        pairs = df[['content_id', 'author_id']].drop_duplicates()
        graph.add_edges_from(zip(pairs['author_id'], pairs['content_id']))

        # Project onto content nodes: two contents are linked when they share an author.
        content_graph = nx.bipartite.projected_graph(graph, content_ids)

        if content_graph.number_of_nodes() > 1:
            try:
                scores = nx.pagerank(content_graph, alpha=0.85)
            except Exception as e:  # deliberately broad: PageRank is best-effort here
                print(f"Could not calculate PageRank: {e}. Graph might be disconnected or too small.")
        else:
            print("Not enough content nodes to build a meaningful graph for PageRank.")

    if scores is None:
        df['pagerank_score'] = np.random.uniform(0.01, 1.0, len(df))  # fallback dummy
        return df

    # Assign by lookup so an existing 'pagerank_score' column is simply replaced
    # (a merge would produce suffixed duplicates instead).
    df['pagerank_score'] = df['content_id'].map(scores).fillna(0.0)

    print("\nTop 10 Content by PageRank Score (based on content-author interaction):")
    top_by_pagerank = df.drop_duplicates(subset='content_id').sort_values(by='pagerank_score', ascending=False)
    print(top_by_pagerank[['platform', 'content_id', 'content_text', 'likes_count', 'pagerank_score']].head(10).to_string())
    return df


def _summarize_age_groups(ranked):
    """Return average popularity, average likes and content count per dominant age group."""
    summary = (
        ranked.groupby('dominant_age_group', sort=False)
        .agg(
            Avg_Popularity_Score=('popularity_score', 'mean'),
            Avg_Likes=('likes_count', 'mean'),
            Content_Count=('content_id', 'size'),
        )
        .reset_index()
        .rename(columns={'dominant_age_group': 'Age_Group'})
    )
    return summary.sort_values(by='Avg_Popularity_Score', ascending=False)


def analyze_and_rank(
    df,
    bert_batch_size=16,
    bert_num_epochs=3,
    bert_max_seq_len=128, # maximum BERT input sequence length
    bert_learning_rate=2e-5, # learning rate for BERT fine-tuning
    bert_validation_split=0.2, # fraction of data held out for validation
    bert_model_name='cl-tohoku/bert-base-japanese-whole-word-masking', # Japanese BERT checkpoint
    device=None # torch device; auto-detected when None
):
    """Cluster, score and rank content, using a fine-tuned BERT trend classifier.

    Steps: ensure required columns, K-Means clustering on composite scores,
    BERT fine-tuning/prediction of the trend category, PageRank over a
    content graph linked by shared authors, recomputation of the composite
    scores, and a final weighted ranking. Progress and summaries are printed.

    Args:
        df: Feature DataFrame (see ``extract_features``). It is copied, so the
            caller's frame is not modified.
        bert_batch_size: Batch size for BERT training and inference.
        bert_num_epochs: Number of fine-tuning epochs.
        bert_max_seq_len: Token length every text is padded/truncated to.
        bert_learning_rate: AdamW learning rate.
        bert_validation_split: Fraction of rows used for validation.
        bert_model_name: Name of the pretrained checkpoint to load.
        device: torch device; CUDA is used when available if None.

    Returns:
        DataFrame with one row per ``content_id``, sorted by
        ``final_ranking_score`` in descending order.
    """
    print("--- 3. Analyzing Data & Ranking Content ---")

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Work on a copy so the caller's frame is never partially mutated.
    df = _ensure_required_columns(df.copy())

    # --- Step 3.1: Clustering based on Composite Scores ---
    print("\n--- Performing Clustering on Composite Scores ---")
    df = _cluster_unique_content(df)

    # --- BERT Model Trend Prediction ---
    print("\n--- BERT Model Trend Prediction ---")

    # One row per content_id. Rows whose content_id is unique (including any
    # comment rows carrying their own id) all take part.
    df_for_bert = df.drop_duplicates(subset='content_id').dropna(
        subset=['content_text', 'actual_trend_category', 'content_cluster']
    )

    predictions = None
    if df_for_bert.empty or df_for_bert['actual_trend_category'].nunique() < 2:
        print("Not enough valid data or classes for BERT prediction. Skipping BERT training.")
    else:
        predictions = _predict_trend_with_bert(
            df_for_bert,
            batch_size=bert_batch_size,
            num_epochs=bert_num_epochs,
            max_seq_len=bert_max_seq_len,
            learning_rate=bert_learning_rate,
            validation_split=bert_validation_split,
            model_name=bert_model_name,
            device=device,
        )

    df = df.drop(columns=['predicted_trend_category'], errors='ignore')
    if predictions is None:
        df['predicted_trend_category'] = 0  # default when BERT was skipped
    else:
        df = df.merge(predictions, on='content_id', how='left')
        # Content without a prediction is treated as low trend (0).
        df['predicted_trend_category'] = df['predicted_trend_category'].fillna(0).astype(int)

    # --- Graph-based Analysis (Nodes Importance) ---
    print("\n--- Graph-based Node Importance (PageRank) ---")
    df = _attach_pagerank(df)

    print("\n--- Recalculating Composite Scores (after PageRank) ---")
    _add_composite_scores(df)

    print("Calculated Composite Scores (first 5 rows):")
    print(df[_COMPOSITE_SCORE_COLS].head().to_string())

    # --- Final Ranking with Age Group Popularity ---
    print("\n--- Final Content Ranking ---")

    df_unique_content_ranked = df.drop_duplicates(subset='content_id').copy()

    df_unique_content_ranked['avg_age_popularity'] = df_unique_content_ranked['age_group_popularity'].apply(
        lambda x: np.mean(list(x.values())) if isinstance(x, dict) and x else 0
    )

    # trend_score and monetization_potential are unbounded raw values; scale
    # them to [0, 1] so the weights below actually balance the components
    # instead of letting these two terms dominate the sum.
    trend_scaled = _minmax(df_unique_content_ranked['trend_score'])
    monetization_scaled = _minmax(df_unique_content_ranked['monetization_potential'])

    # The BERT prediction (0 or 1) gates the trend contribution.
    df_unique_content_ranked['final_ranking_score'] = (
        0.4 * df_unique_content_ranked['predicted_trend_category'] * trend_scaled +
        0.3 * df_unique_content_ranked['popularity_score'] +
        0.2 * df_unique_content_ranked['empathy_score'] +
        0.05 * monetization_scaled +
        0.05 * df_unique_content_ranked['avg_age_popularity']
    )

    # Re-normalize to [0, 1]; with no variation every score is set to 0.
    if df_unique_content_ranked['final_ranking_score'].nunique() > 1:
        df_unique_content_ranked['final_ranking_score'] = _minmax(df_unique_content_ranked['final_ranking_score'])
    else:
        df_unique_content_ranked['final_ranking_score'] = 0.0

    top_ranked_content = df_unique_content_ranked.sort_values(by='final_ranking_score', ascending=False)

    print("\nTop 10 Content Ranking (considering BERT Predicted Trend, Popularity, Empathy, Monetization, Age Appeal):")
    print(top_ranked_content[['platform', 'content_id', 'content_text', 'likes_count',
                             'empathy_score', 'popularity_score', 'virality_score',
                             'trend_score', 'predicted_trend_category', 'dominant_age_group',
                             'final_ranking_score']].head(10).to_string())

    print("\nTop 10 Content by Dislike/Toxicity (Potential '嫌悪度' Hotspots):")
    hated_content = df_unique_content_ranked.sort_values(by='toxicity_score', ascending=False)
    print(hated_content[['platform', 'content_id', 'content_text', 'likes_count', 'comments_count', 'toxicity_score']].head(10).to_string())

    # --- Popularity Measurement by Age Group ---
    print("\n--- Popularity Measurement by Age Group ---")
    print(_summarize_age_groups(df_unique_content_ranked).to_string())

    return top_ranked_content

# --- Main Execution Flow ---
if __name__ == "__main__":
    # Topics fed to the (simulated) per-platform collectors.
    search_queries = [
        "AI Technology",
        "Space Exploration",
        "Gaming News",
        "Fashion Trends",
        "Healthy Eating",
        "Cooking Recipes",
    ]

    # Within this block, total_dummy_records is only interpolated into the
    # banner message; the value handed to collect_all_data is
    # max_results_per_platform.
    total_dummy_records = 20000
    max_results_per_platform = 60

    # Step 1: Collect the simulated raw data and show a preview.
    print(f"--- Simulating data collection aiming for ~{total_dummy_records} total records ---")
    raw_data_df = collect_all_data(
        search_queries,
        max_results_per_platform=max_results_per_platform,
    )
    print(f"\nTotal collected data points (simulated): {len(raw_data_df)}")
    print("\nRaw Data Head:")
    print(raw_data_df.head().to_string())

    # Step 2: Extract features from a copy so raw_data_df stays untouched.
    processed_df = extract_features(raw_data_df.copy())
    print("\nProcessed Data with Features Head:")
    print(processed_df.head().to_string())

    # Step 3: Analyze and rank with BERT parameters.
    # A GPU is recommended for BERT training: "cuda" is picked when it is
    # available, otherwise the run falls back to "cpu".
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    bert_settings = {
        "bert_batch_size": 32,           # batch size used during BERT training
        "bert_num_epochs": 10,           # number of epochs (tune to the data volume)
        "bert_max_seq_len": 128,         # maximum token length of the BERT input
        "bert_learning_rate": 2e-5,      # learning rate for BERT fine-tuning
        "bert_validation_split": 0.2,    # fraction of data held out for validation
        # Tohoku University's Japanese BERT
        "bert_model_name": 'cl-tohoku/bert-base-japanese-whole-word-masking',
    }
    final_ranked_results = analyze_and_rank(
        processed_df.copy(),
        **bert_settings,
        device=device,                   # device to run on
    )

    print("\n--- Analysis Complete ---")
    # Closing notes, emitted one per line exactly as separate print calls would.
    next_steps = (
        "Further steps would involve: ",
        "1. Implementing robust, legal, and ethical API data collection with proper authentication and error handling.",
        "2. Fine-tuning BERT for more specific trend characteristics or multi-class classification.",
        "3. Developing more sophisticated monetization proxies and validating them with business data.",
        "4. Advanced graph construction (e.g., reply trees, mention networks, temporal graphs) and real GNN application.",
        "5. Building a user interface for interactive visualization and exploration of clusters and trends.",
        "6. Implementing A/B testing and continuous evaluation for prediction models.",
    )
    print("\n".join(next_steps))

--- Simulating data collection aiming for ~20000 total records ---

Total collected data points (simulated): 16464

Raw Data Head:
  platform                content_id content_type          publish_timestamp   author_id  author_followers                                                                                             content_text  likes_count  comments_count  views_count  shares_count  comment_likes                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting BERT fine-tuning...

Training...


  0%|          | 0/141 [00:00<?, ?it/s]

  Average training loss: 0.59
Running Validation...


  0%|          | 0/36 [00:00<?, ?it/s]

  Accuracy: 0.7245
  F1 Score (weighted): 0.6571

Training...


  0%|          | 0/141 [00:00<?, ?it/s]

  Average training loss: 0.56
Running Validation...


  0%|          | 0/36 [00:00<?, ?it/s]

  Accuracy: 0.7245
  F1 Score (weighted): 0.6571

Training...


  0%|          | 0/141 [00:00<?, ?it/s]

  Average training loss: 0.56
Running Validation...


  0%|          | 0/36 [00:00<?, ?it/s]

  Accuracy: 0.7263
  F1 Score (weighted): 0.6675

Training...


  0%|          | 0/141 [00:00<?, ?it/s]

  Average training loss: 0.55
Running Validation...


  0%|          | 0/36 [00:00<?, ?it/s]

  Accuracy: 0.7263
  F1 Score (weighted): 0.6436

Training...


  0%|          | 0/141 [00:00<?, ?it/s]

  Average training loss: 0.55
Running Validation...


  0%|          | 0/36 [00:00<?, ?it/s]

  Accuracy: 0.7245
  F1 Score (weighted): 0.6670

Training...


  0%|          | 0/141 [00:00<?, ?it/s]

  Average training loss: 0.54
Running Validation...


  0%|          | 0/36 [00:00<?, ?it/s]

  Accuracy: 0.7219
  F1 Score (weighted): 0.6651

Training...


  0%|          | 0/141 [00:00<?, ?it/s]

  Average training loss: 0.54
Running Validation...


  0%|          | 0/36 [00:00<?, ?it/s]

  Accuracy: 0.7174
  F1 Score (weighted): 0.6675

Training...


  0%|          | 0/141 [00:00<?, ?it/s]

  Average training loss: 0.54
Running Validation...


  0%|          | 0/36 [00:00<?, ?it/s]

  Accuracy: 0.7166
  F1 Score (weighted): 0.6303

Training...


  0%|          | 0/141 [00:00<?, ?it/s]

  Average training loss: 0.54
Running Validation...


  0%|          | 0/36 [00:00<?, ?it/s]

  Accuracy: 0.7112
  F1 Score (weighted): 0.6326

Training...


  0%|          | 0/141 [00:00<?, ?it/s]

  Average training loss: 0.53
Running Validation...


  0%|          | 0/36 [00:00<?, ?it/s]

  Accuracy: 0.7192
  F1 Score (weighted): 0.6580

BERT fine-tuning complete.

Making final predictions with fine-tuned BERT model...


  0%|          | 0/177 [00:00<?, ?it/s]


--- Graph-based Node Importance (PageRank) ---

Top 10 Content by PageRank Score (based on content-author interaction):
    platform    content_id                        content_text  likes_count  pagerank_score
189  YouTube  yt_comm_10_0  Great video 0 about AI Technology!          703        0.000396
190  YouTube  yt_comm_10_1  Great video 1 about AI Technology!          703        0.000396
191  YouTube  yt_comm_10_2  Great video 2 about AI Technology!          703        0.000396
192  YouTube  yt_comm_10_3  Great video 3 about AI Technology!          703        0.000396
193  YouTube  yt_comm_10_4  Great video 4 about AI Technology!          703        0.000396
194  YouTube  yt_comm_10_5  Great video 5 about AI Technology!          703        0.000396
195  YouTube  yt_comm_10_6  Great video 6 about AI Technology!          703        0.000396
196  YouTube  yt_comm_10_7  Great video 7 about AI Technology!          703        0.000396
198  YouTube  yt_comm_10_9  Great video 9 about AI 

  age_group_summary = pd.concat([age_group_summary, pd.DataFrame([{'Age_Group': group, 'Avg_Popularity_Score': avg_pop_score, 'Avg_Likes': avg_likes, 'Content_Count': content_count}])], ignore_index=True)


In [None]:
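# Install all required libraries in one step.
# NOTE(review): fugashi / unidic-lite are presumably needed by BertJapaneseTokenizer;
# on a fresh runtime this cell likely has to run before the analysis cell — confirm.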
!pip install transformers sentencepiece accelerate fugashi unidic-lite

Collecting fugashi
  Downloading fugashi-1.5.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cu