In [None]:
import os
import warnings
from dataclasses import dataclass
from typing import Tuple, Dict, List

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.utils import Bunch
import joblib

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# NOTE(review): with no category/module filter this suppresses every warning
# for the whole kernel session, so deprecation or numerical warnings raised by
# the cells below will not be shown. Consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
@dataclass
class Config:
    """Paths, split fractions and seed shared by the pipeline functions."""

    # CSV with one row per token; read by load_data.
    tokens_path: str = "data/tokens.csv"
    # CSV with one row per comment; read by load_data.
    comments_path: str = "data/comments.csv"
    # Image directory. Not read by any code visible here
    # (build_image_features currently returns zeros without touching it).
    images_dir: str = "data/images"
    # Fraction of all rows held out as the test split (train_val_test_split).
    test_size: float = 0.15
    # Fraction of all rows used for validation; train_val_test_split rescales
    # it relative to what remains after the test split is removed.
    val_size: float = 0.15
    # Seed passed to both train_test_split calls and to the RF / GB models.
    random_state: int = 42
    # Not referenced by any code visible here.
    min_comments_for_sentiment: int = 1


# Module-level default configuration; main() and train_and_evaluate_models
# read this global directly.
CFG = Config()

In [None]:
def ensure_vader_downloaded():
    """Make sure the VADER lexicon is available, downloading it if needed.

    Instantiating ``SentimentIntensityAnalyzer`` is used as the probe: when
    it raises ``LookupError`` the ``vader_lexicon`` resource is fetched via
    ``nltk.download``.
    """
    try:
        # The instance itself is not needed; constructing it is the check.
        SentimentIntensityAnalyzer()
    except LookupError:
        nltk.download('vader_lexicon')


def load_data(cfg: Config) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the token and comment CSVs and verify their required columns.

    Args:
        cfg: Configuration providing ``tokens_path`` and ``comments_path``.

    Returns:
        ``(tokens, comments)`` as two DataFrames, exactly as read from disk.

    Raises:
        ValueError: Naming the first required column that is absent; the
            token table is checked before the comment table.
    """
    token_frame = pd.read_csv(cfg.tokens_path)
    comment_frame = pd.read_csv(cfg.comments_path)

    # Both files are loaded before validation, so a read error on either
    # file takes precedence over a missing-column error.
    absent_in_tokens = [
        name
        for name in ("token_id", "description", "max_mcap", "market_entry_time")
        if name not in token_frame.columns
    ]
    if absent_in_tokens:
        raise ValueError(f"tokens.csv is missing column: {absent_in_tokens[0]}")

    absent_in_comments = [
        name
        for name in ("token_id", "user_id", "text")
        if name not in comment_frame.columns
    ]
    if absent_in_comments:
        raise ValueError(f"comments.csv is missing column: {absent_in_comments[0]}")

    return token_frame, comment_frame

In [None]:
def build_labels(tokens: pd.DataFrame) -> pd.Series:
    """Derive a binary label from ``max_mcap`` using a median split.

    Args:
        tokens: Frame with a ``max_mcap`` column; it is not modified.

    Returns:
        Integer Series aligned with ``tokens``: 1 where the numeric
        ``max_mcap`` is at or above the column median, otherwise 0.
        Values that cannot be parsed as numbers become NaN, compare as
        False, and therefore receive label 0.
    """
    mcap = pd.to_numeric(tokens["max_mcap"], errors="coerce")
    threshold = mcap.median()
    is_high_cap = mcap >= threshold
    return is_high_cap.astype(int)


def build_text_features(tokens: pd.DataFrame) -> Tuple[np.ndarray, TfidfVectorizer]:
    """Turn token descriptions into a dense TF-IDF matrix.

    Args:
        tokens: Frame with a ``description`` column; missing values are
            treated as empty strings.

    Returns:
        ``(X_text, vectorizer)``: the dense TF-IDF array (one row per
        token) and the fitted vectorizer that produced it.
    """
    tfidf = TfidfVectorizer(
        lowercase=True,
        stop_words="english",
        ngram_range=(1, 2),  # unigrams and bigrams
        max_features=1000,
    )
    corpus = tokens["description"].fillna("").astype(str)
    sparse_matrix = tfidf.fit_transform(corpus)
    # Densified so it can be concatenated with the other dense feature blocks.
    return sparse_matrix.toarray(), tfidf

In [None]:
def compute_comment_sentiment(comments: pd.DataFrame) -> pd.DataFrame:
    """Score every comment with VADER and aggregate the scores per token.

    Args:
        comments: Frame with ``token_id``, ``user_id`` and ``text`` columns;
            it is not modified.

    Returns:
        One row per ``token_id`` with the columns ``n_comments``,
        ``n_users``, ``sent_mean``, ``sent_std``, ``sent_pos_ratio`` and
        ``sent_neg_ratio``. ``sent_std`` is 0.0 where pandas returns NaN
        (for example, a token with a single comment).
    """
    ensure_vader_downloaded()
    analyzer = SentimentIntensityAnalyzer()

    def compound_of(text):
        return analyzer.polarity_scores(text)["compound"]

    cleaned_text = comments["text"].fillna("").astype(str)
    scored = comments.assign(
        text=cleaned_text,
        sent_score=cleaned_text.apply(compound_of),
    )

    aggregations = {
        "n_comments": ("text", "count"),
        "n_users": ("user_id", pd.Series.nunique),
        "sent_mean": ("sent_score", "mean"),
        "sent_std": ("sent_score", "std"),
        # Share of comments above / below the +/-0.05 compound thresholds.
        "sent_pos_ratio": ("sent_score", lambda s: (s > 0.05).mean()),
        "sent_neg_ratio": ("sent_score", lambda s: (s < -0.05).mean()),
    }
    per_token = scored.groupby("token_id").agg(**aggregations).reset_index()

    return per_token.assign(sent_std=per_token["sent_std"].fillna(0.0))


def build_community_features(tokens: pd.DataFrame,
                             comments: pd.DataFrame) -> Tuple[np.ndarray, pd.DataFrame]:
    """Attach per-token comment statistics to every token row.

    Args:
        tokens: Frame with a ``token_id`` column; its row order is kept.
        comments: Raw comments, aggregated via ``compute_comment_sentiment``.

    Returns:
        ``(X_comm, table)``: the community feature values as an array, and
        a frame holding ``token_id`` plus the same feature columns. Tokens
        without any comments get 0.0 in every feature.
    """
    community_cols = ["n_comments", "n_users", "sent_mean", "sent_std",
                      "sent_pos_ratio", "sent_neg_ratio"]

    per_token_stats = compute_comment_sentiment(comments)
    # Left join keeps one output row per input token row.
    joined = tokens[["token_id"]].merge(per_token_stats, how="left", on="token_id")
    joined = joined.fillna({col: 0.0 for col in community_cols})

    feature_table = joined[["token_id"] + community_cols]
    return joined[community_cols].values, feature_table

In [None]:
def build_financial_features(
    tokens: pd.DataFrame,
    feature_cols: Tuple[str, ...] = ("market_entry_time",),
) -> Tuple[np.ndarray, List[str], StandardScaler]:
    """Build standardized numeric features from the token table.

    ``max_mcap`` is deliberately NOT part of the default feature set:
    ``build_labels`` defines the target as ``max_mcap >= median(max_mcap)``,
    so feeding ``max_mcap`` to the model leaks the label and makes every
    reported metric meaningless.

    Args:
        tokens: Frame containing every column named in ``feature_cols``.
        feature_cols: Columns to use as features. Defaults to
            ``("market_entry_time",)``. Do not include the column the label
            is derived from.

    Returns:
        ``(X_fin, columns, scaler)``: the standardized feature array, the
        list of column names in array order, and the fitted
        ``StandardScaler``.
    """
    feats = tokens[list(feature_cols)].apply(pd.to_numeric, errors="coerce")
    # Impute unparseable / missing entries with the per-column median.
    # NOTE: a column with no parseable values at all has a NaN median and
    # therefore stays NaN after this step.
    feats = feats.fillna(feats.median())

    scaler = StandardScaler()
    X_fin = scaler.fit_transform(feats)

    return X_fin, list(feats.columns), scaler


def build_image_features(tokens: pd.DataFrame, cfg: Config) -> Tuple[np.ndarray, List[str]]:
    """Return placeholder image features: an all-zero block per token.

    Args:
        tokens: Only its length is used, to size the output.
        cfg: Accepted for interface symmetry; not read by this function.

    Returns:
        ``(X_img, img_cols)``: a ``(len(tokens), 32)`` float array of zeros
        and the 32 matching column names ``img_feat_0`` .. ``img_feat_31``.
    """
    n_rows, n_dims = len(tokens), 32
    column_names = [f"img_feat_{k}" for k in range(n_dims)]
    placeholder = np.zeros((n_rows, n_dims), dtype=float)
    return placeholder, column_names


In [None]:
def train_val_test_split(X: np.ndarray, y: np.ndarray,
                         cfg: Config) -> Dict[str, np.ndarray]:
    """Split features and labels into stratified train / val / test parts.

    The test split is carved off first; the validation split is then taken
    from the remainder, with its fraction rescaled so that it corresponds
    to ``cfg.val_size`` of the full data set.

    Args:
        X: Feature matrix.
        y: Labels, used for stratification in both splits.
        cfg: Supplies ``test_size``, ``val_size`` and ``random_state``.

    Returns:
        Dict with keys ``X_train``, ``y_train``, ``X_val``, ``y_val``,
        ``X_test`` and ``y_test``.
    """
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y,
        test_size=cfg.test_size,
        stratify=y,
        random_state=cfg.random_state,
    )

    # Express val_size as a fraction of what is left after removing the test set.
    remaining_fraction = 1.0 - cfg.test_size
    val_fraction_of_rest = cfg.val_size / remaining_fraction

    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest,
        test_size=val_fraction_of_rest,
        stratify=y_rest,
        random_state=cfg.random_state,
    )

    return {
        "X_train": X_train, "y_train": y_train,
        "X_val": X_val, "y_val": y_val,
        "X_test": X_test, "y_test": y_test,
    }


def train_and_evaluate_models(splits: Dict[str, np.ndarray]) -> Bunch:
    """Fit three classifiers, select the best on validation, score it on test.

    Logistic regression, random forest and gradient boosting are each fit on
    the training split and scored on the validation split (accuracy, F1,
    ROC AUC at a 0.5 probability threshold). The model with the highest
    validation AUC is selected; if AUC cannot be computed, validation F1 is
    used instead, so a model is always selected.

    The tree-based models read their seed from the module-level
    ``CFG.random_state``.

    Args:
        splits: Dict with ``X_train``, ``y_train``, ``X_val``, ``y_val``,
            ``X_test`` and ``y_test``.

    Returns:
        Bunch with ``best_model``, ``best_model_name``, ``val_results``
        (one dict per model) and ``test_metrics`` (``acc``, ``f1``, ``auc``
        of the selected model; ``auc`` is NaN when it is undefined).
    """
    X_train, y_train = splits["X_train"], splits["y_train"]
    X_val, y_val = splits["X_val"], splits["y_val"]
    X_test, y_test = splits["X_test"], splits["y_test"]

    models = {
        "logreg": LogisticRegression(max_iter=500, n_jobs=-1),
        "rf": RandomForestClassifier(
            n_estimators=300, max_depth=None, random_state=CFG.random_state
        ),
        "gb": GradientBoostingClassifier(random_state=CFG.random_state),
    }

    results = []
    best_model_name = None
    best_model = None
    best_key = None  # (has_auc, score) of the currently selected model

    for name, model in models.items():
        model.fit(X_train, y_train)

        val_proba = model.predict_proba(X_val)[:, 1]
        val_pred = (val_proba >= 0.5).astype(int)
        val_acc = accuracy_score(y_val, val_pred)
        val_f1 = f1_score(y_val, val_pred)
        try:
            val_auc = roc_auc_score(y_val, val_proba)
        except ValueError:
            # Raised e.g. when y_val contains a single class.
            val_auc = np.nan

        print(f"\n[{name}] Validation: "
              f"Acc={val_acc:.3f}, F1={val_f1:.3f}, AUC={val_auc:.3f}")

        results.append(dict(
            model=name,
            split="val",
            acc=val_acc,
            f1=val_f1,
            auc=val_auc
        ))

        # Rank by AUC when defined. A NaN AUC never compares greater than
        # anything, which previously left best_model as None when AUC was
        # undefined for every model; fall back to F1 in that case. Any model
        # with a defined AUC outranks one without.
        key = (0, val_f1) if np.isnan(val_auc) else (1, val_auc)
        if best_key is None or key > best_key:
            best_key = key
            best_model_name = name
            best_model = model

    test_proba = best_model.predict_proba(X_test)[:, 1]
    test_pred = (test_proba >= 0.5).astype(int)
    test_acc = accuracy_score(y_test, test_pred)
    test_f1 = f1_score(y_test, test_pred)
    try:
        test_auc = roc_auc_score(y_test, test_proba)
    except ValueError:
        # Same guard as on validation: AUC is undefined for one-class y_test.
        test_auc = np.nan

    print(f"\nBest model on validation: {best_model_name}")
    print(f"[{best_model_name}] Test: "
          f"Acc={test_acc:.3f}, F1={test_f1:.3f}, AUC={test_auc:.3f}")

    return Bunch(
        best_model=best_model,
        best_model_name=best_model_name,
        val_results=results,
        test_metrics=dict(acc=test_acc, f1=test_f1, auc=test_auc)
    )


In [None]:
def main():
    """Run the full pipeline: load, featurize, split, train, and save.

    Reads the CSVs named in ``CFG``, builds the label and the four feature
    blocks (text, community, financial, image placeholder), concatenates
    them column-wise, trains and evaluates the candidate models, and writes
    the selected model together with the fitted TF-IDF vectorizer, the
    financial scaler and ``CFG`` to ``models/<name>_multimodal.joblib``.
    """
    print("Loading data...")
    token_df, comment_df = load_data(CFG)

    print("Building label (HighCap / LowCap)...")
    labels = build_labels(token_df).values

    print("Building text features...")
    text_block, tfidf_vec = build_text_features(token_df)

    print("Building community features...")
    community_block, _ = build_community_features(token_df, comment_df)

    print("Building financial features...")
    financial_block, _fin_cols, fin_scaler = build_financial_features(token_df)

    print("Building image features (placeholder zeros)...")
    image_block, _img_cols = build_image_features(token_df, CFG)

    print("Concatenating multimodal features...")
    feature_blocks = [text_block, community_block, financial_block, image_block]
    X_multi = np.concatenate(feature_blocks, axis=1)

    print(f"Shape of multimodal feature matrix: {X_multi.shape}")

    print("Splitting train/val/test...")
    splits = train_val_test_split(X_multi, labels, CFG)

    print("Training and evaluating models...")
    outcome = train_and_evaluate_models(splits)

    # Persist the selected model with the fitted preprocessors it depends on.
    os.makedirs("models", exist_ok=True)
    model_path = os.path.join("models", f"{outcome.best_model_name}_multimodal.joblib")
    bundle = {
        "model": outcome.best_model,
        "tfidf": tfidf_vec,
        "fin_scaler": fin_scaler,
        "config": CFG,
    }
    joblib.dump(bundle, model_path)

    print(f"\nBest model saved to: {model_path}")
    print("Done.")


if __name__ == "__main__":
    main()