In [None]:
import numpy as np
import pandas as pd

In [None]:
# Relative path to the exported tweet-activity metrics CSV.
csv_file = "../data/tweet_activity_metrics_tkosht_20240106_20240203_ja.csv"
# Read with pandas defaults (comma separator, first row as header).
df_posts = pd.read_csv(csv_file)
df_posts

In [None]:
# Tab-separated reference data; the first row supplies the column names.
tsv_file = "../data/x_refs.tsv"
df_refs = pd.read_csv(tsv_file, delimiter="\t", header=0)
df_refs

In [None]:
# Put the reference columns next to the post metrics. Rows are matched by
# index label, so this presumes both files list the posts in the same order
# and with the same row count -- TODO confirm against the data files.
df = pd.concat(objs=[df_posts, df_refs], axis="columns")
df

In [None]:
# Give the reference columns their Japanese labels, then replace every missing
# value in the frame (all columns, not only the reference ones) with the
# literal placeholder "(NULL)".
df = df.rename(columns={"url": "参照URL", "text": "参照テキスト"})
df = df.fillna("(NULL)")
df.head(3)

In [None]:
df.columns

In [None]:
df["ツイート本文"].to_list()[:3]

In [None]:
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool token vectors over dim 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are zeroed before summing, and
    the sum is divided by the number of non-masked positions of each row.
    """
    is_padding = ~attention_mask.unsqueeze(-1).bool()
    token_sum = last_hidden_states.masked_fill(is_padding, 0.0).sum(dim=1)
    token_count = attention_mask.sum(dim=1).unsqueeze(-1)
    return token_sum / token_count


# One checkpoint name drives both the tokenizer and the encoder so they cannot
# drift apart. Alternative checkpoints kept from the earlier commented-out lines:
#   "intfloat/multilingual-e5-small", "intfloat/multilingual-e5-base"
model_name = "intfloat/multilingual-e5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedder = AutoModel.from_pretrained(model_name)

In [None]:
import torch
from transformers.models.bert.modeling_bert import BertModel


def embed(
    embedder: BertModel,
    input_texts: list[str],
    do_normalize: bool = False,
    device: torch.device = torch.device("cpu"),
):
    """Encode texts into one mean-pooled embedding vector per text.

    Args:
        embedder: Encoder model; it is switched to eval mode and moved to
            ``device`` in place.
        input_texts: Texts to encode. They are tokenized with the module-level
            ``tokenizer`` and truncated to 512 tokens.
        do_normalize: When True, L2-normalize each embedding.
        device: Device on which the forward pass runs.

    Returns:
        Tensor holding one pooled vector per input text, located on ``device``.

    NOTE(review): all texts are encoded in a single batch, so memory use grows
    with ``len(input_texts)``.
    NOTE(review): the E5 model cards recommend prefixing inputs with
    "query: " / "passage: "; no prefix is added here -- confirm this is intended.
    """
    # `input_texts` used to be declared as `input_texts=list[str]`, which made
    # the *type object* the default value instead of annotating the parameter.
    batch_dict = tokenizer(
        input_texts,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
    embedder.eval()
    embedder.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = embedder(**batch_dict)
        embeddings = average_pool(
            outputs.last_hidden_state, batch_dict["attention_mask"]
        )
        if do_normalize:
            embeddings = F.normalize(embeddings, p=2, dim=1)
    return embeddings

In [None]:
# Prefer the second GPU (the device this notebook was written for), but fall
# back so the cell still runs on single-GPU and CPU-only machines.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Embeddings of the tweet bodies (not normalized; do_normalize defaults to False).
embeddings_post = embed(
    embedder, input_texts=df["ツイート本文"].to_list(), device=device
)

In [None]:
# Prefer the second GPU (the device this notebook was written for), but fall
# back so the cell still runs on single-GPU and CPU-only machines.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Embeddings of the reference texts. Missing values were replaced by the
# literal "(NULL)" earlier, so such rows are embedded as that placeholder text.
embeddings_ref = embed(
    embedder, input_texts=df["参照テキスト"].to_list(), device=device
)

In [None]:
# W = embedder.embeddings.word_embeddings.weight
# with torch.no_grad():
#     D = embeddings_post @ W.T
# tokenizer.decode(torch.argmax(D, dim=1)[1])

In [None]:
# Pairwise cosine similarity: entry (i, j) compares post i with reference j.
cos = F.normalize(embeddings_post) @ F.normalize(embeddings_ref).T
# Floating-point round-off can push values marginally outside [-1, 1], which
# would turn sqrt(1 - cos**2) (and a later arccos) into NaN.
cos = cos.clamp(-1.0, 1.0)
sin = torch.sqrt(1 - cos**2)

In [None]:
# L2 norm of every post embedding (a) and every reference embedding (b).
a = torch.linalg.norm(embeddings_post, dim=1)
b = torch.linalg.norm(embeddings_ref, dim=1)

# # NOTE: scores: area of the triangle spanned by each pair of vectors
# scores = (1 / 2) * a * b * sin

# NOTE: sector-style score: geometric mean of the two norms times the angle
# between the vectors, scaled by 1/pi.
# NOTE(review): a true sector area would be r**2 * theta / 2 -- confirm the
# intended formula.
# The norms are broadcast explicitly so that entry (i, j) uses a[i] and b[j];
# plain `a * b` paired a[j] with b[j] in every row, which is only right on the
# diagonal. arccos gets a clamped input because round-off can leave cos
# slightly outside [-1, 1], which would yield NaN.
scores = (
    torch.sqrt(a[:, None] * b[None, :])
    * torch.arccos(cos.clamp(-1.0, 1.0))
    / torch.pi
)
scores

In [None]:
# Keep only the diagonal: the score of post i against its own reference i.
score_array = scores.diag().cpu().numpy()
score_array

In [None]:
df["score"] = score_array

In [None]:
df

In [None]:
df.columns

In [None]:
df.select_dtypes("number").drop("ツイートID", axis=1).dropna(axis=1)

In [None]:
# Pairwise correlation (pandas default method) between the engagement metrics
# and the embedding-based score.
df_corr = df.loc[
    :,
    [
        "インプレッション",
        "エンゲージメント",
        "リツイート",
        "いいね",
        "ユーザープロフィールクリック",
        "URLクリック数",
        "詳細クリック",
        "score",
    ],
].corr()
df_corr

In [None]:
import seaborn as sns

# Diverging palette so positive and negative correlations are told apart.
cmap = sns.color_palette("coolwarm", 200)
# The palette was built but never handed to the plot; pass it, and pin the
# colour scale to the full correlation range so the neutral colour means 0.
sns.heatmap(df_corr, square=True, annot=True, cmap=cmap, vmin=-1.0, vmax=1.0)

In [None]:
df["インプレッション"].hist()

In [None]:
# Most frequent impression count after rounding to a multiple of 10.
md = df["インプレッション"].div(10).round().mul(10).mode()
md

In [None]:
m = df["インプレッション"].median()
m

In [None]:
# NOTE: turn the problem into classification -- label 1 when a post's
# impressions are strictly above the median `m`, otherwise 0.
df["high_impression"] = df["インプレッション"].gt(m).astype(int)
df["high_impression"]

In [None]:
seed = 42

# Features: L2-normalized post embeddings, copied to host memory as NumPy.
X = F.normalize(embeddings_post).cpu().numpy()
# Labels as a 1-D vector. Selecting with [["high_impression"]] produced an
# (n, 1) column, which scikit-learn and LightGBM accept only after emitting a
# column-vector conversion warning.
y = df["high_impression"].to_numpy()

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss


def run_cross_validation(
    X, y, params: dict, n_splits: int = 5, random_state: int = 42
):
    """Return the mean validation log loss of an LGBMClassifier over K folds.

    Args:
        X: Feature matrix, indexable by integer index arrays.
        y: Labels; a column vector is flattened to 1-D.
        params: Keyword arguments forwarded to ``lgb.LGBMClassifier``.
        n_splits: Number of shuffled folds.
        random_state: Seed for the fold shuffling (42 was previously hard-coded).

    Returns:
        Mean of the per-fold log-loss values.
    """
    y = np.ravel(y)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_losses = []
    for train_indices, valid_indices in kf.split(X):
        model = lgb.LGBMClassifier(**params)
        model.fit(X[train_indices], y[train_indices])

        # log_loss needs class probabilities. The hard labels from predict()
        # that were passed before only yield clipped 0/1 "probabilities", so
        # the reported value was not a real log loss.
        proba = model.predict_proba(X[valid_indices])
        # Pass the class list so a fold whose validation part happens to
        # contain a single class is still scored against every column.
        fold_losses.append(
            log_loss(y[valid_indices], proba, labels=model.classes_)
        )
    return np.mean(fold_losses)

In [None]:
# Baseline hyper-parameters for the cross-validated LGBMClassifier.
params = dict(
    learning_rate=0.05,
    num_leaves=5,
    num_trees=256,
    num_threads=8,
    # max_depth=8,
    # min_data_in_leaf=0,
    # `min_samples_in_leaf` is not a LightGBM parameter (it is reported as
    # unknown and ignored); `min_child_samples` is the scikit-learn API name
    # for the minimum number of samples per leaf.
    min_child_samples=3,
    # min_sum_hessian_in_leaf=100,
    random_state=seed,
)
# Mean validation log loss across the folds.
scores = run_cross_validation(X, y, params)
scores

In [None]:
import optuna
import sklearn
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split

# Settings applied on top of every trial's suggestions (they override any
# suggested key with the same name).
# The former `min_samples_in_leaf=3` entry was dropped: it is not a LightGBM
# parameter, so it was ignored with an "unknown parameter" warning. The leaf
# size is tuned through `min_child_samples` in the objective instead.
# NOTE(review): confirm a fixed leaf size of 3 was not the actual intent.
fixed_params = dict(
    num_threads=8,
    # min_data_in_leaf=0,
    # min_sum_hessian_in_leaf=100,
    random_state=seed,
    objective="binary",
    metric="binary_logloss",
)


class Evaluator(object):
    """Optuna objective holder that tunes LightGBM on a fixed hold-out split."""

    def __init__(self, X: np.ndarray, y: np.ndarray) -> None:
        # The full dataset is kept; the split is redone inside every trial.
        self.X, self.y = X, y

    def objective(self, trial: optuna.Trial):
        """Train one booster with the trial's parameters; return hold-out accuracy.

        The 70/30 split uses a fixed random_state, so every trial is scored on
        the same hold-out rows. Suggested values are overridden by any key
        also present in the module-level ``fixed_params``.
        """
        (
            features_fit,
            features_holdout,
            labels_fit,
            labels_holdout,
        ) = train_test_split(
            self.X, self.y, test_size=0.3, shuffle=True, random_state=42
        )

        # Search space; the order of the suggest calls is kept unchanged.
        search_space = {
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
            "num_leaves": trial.suggest_int("num_leaves", 2, 256),
            "num_trees": trial.suggest_int("num_trees", 10, 1000),
            "max_depth": trial.suggest_int("max_depth", 4, 5),
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 40),
        }
        booster_params = {**search_space, **fixed_params}

        booster = lgb.train(
            params=booster_params,
            train_set=lgb.Dataset(features_fit, label=labels_fit),
        )
        # Round the predicted probabilities to hard 0/1 labels before scoring.
        predicted_labels = np.rint(booster.predict(features_holdout))
        return sklearn.metrics.accuracy_score(labels_holdout, predicted_labels)

In [None]:
evl = Evaluator(X, y)


# Seed the sampler so the sequence of suggested hyper-parameters is the same
# on every run; without it each execution explores different trials.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(evl.objective, n_trials=100)

print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.params)

In [None]:
study.best_params

In [None]:
import optuna.visualization


optuna.visualization.plot_contour(study)

In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
optuna.visualization.plot_param_importances(study)

In [None]:
# Same split settings as Evaluator.objective, so the validation rows here are
# the ones the hyper-parameter search was scored on.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)

In [None]:
# Best hyper-parameters found by the study, overridden by the fixed settings.
params = {**study.best_params, **fixed_params}

trainset = lgb.Dataset(data=X_train, label=y_train)
# The validation set references the training set.
validset = lgb.Dataset(data=X_valid, label=y_valid, reference=trainset)

# Filled by the record_evaluation callback with the metric history per set.
evals = {}
model = lgb.train(
    params,
    trainset,
    valid_sets=[trainset, validset],
    valid_names=["trainset", "validset"],
    callbacks=[lgb.record_evaluation(evals)],
)

In [None]:
params

In [None]:
model

In [None]:
lgb.plot_metric(evals)

In [None]:
df[["ツイート本文"]].shape

In [None]:
df[["high_impression"]].replace(1, "high_impression").replace(
    0, "low_impression"
).to_numpy()

In [None]:
# From here on X holds the raw tweet text (one column) instead of embeddings,
# and y holds string class names instead of 0/1.
X = df[["ツイート本文"]].to_numpy()
y = (
    df[["high_impression"]]
    .replace({1: "high_impression", 0: "low_impression"})
    .to_numpy()
    .ravel()
)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)

In [None]:
import lightgbm
from sklearn.decomposition import PCA  # , KernelPCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from app.morph.classify import (
    JpTokenizerJanome,
    JpTokenizer,
    ident_tokener,
    Transer,
    SparsetoDense,
)

In [None]:
def build_pipleline_with_tfidf(
    tokener: JpTokenizer, n_classes: int, params: "dict | None" = None
):
    """Build the tokenizer -> TF-IDF -> (PCA + identity) -> LightGBM pipeline.

    Args:
        tokener: Tokenizer used as the first pipeline step.
        n_classes: Accepted for interface compatibility; not used here.
        params: Keyword arguments for ``LGBMClassifier``. An optional
            ``n_components`` entry (default 2) sets the PCA size and is not
            forwarded to the classifier. The dict is left unmodified.

    Returns:
        An unfitted ``sklearn.pipeline.Pipeline``.
    """
    # Work on a private copy: popping from the argument itself used to remove
    # "n_components" from the caller's dict, and the previous `{}` default was
    # a single object shared between calls.
    clf_params = dict(params) if params else {}
    n_components = clf_params.pop("n_components", 2)

    # ident_tokener is handed in as the vectorizer's tokenizer; lowercasing is off.
    tfidf = TfidfVectorizer(tokenizer=ident_tokener, lowercase=False)

    # The classifier sees the PCA projection concatenated with the raw TF-IDF.
    embedders = [
        ("pca", PCA(n_components=n_components)),
        ("identity", Transer()),  # means tfidf to tfidf
    ]

    lgbmclf = lightgbm.LGBMClassifier(**clf_params)

    pipe = Pipeline(
        steps=[
            ("tokenizer", tokener),
            ("vectorizer", tfidf),
            ("to_dence", SparsetoDense()),
            ("embedder", FeatureUnion(embedders)),
            ("classifier", lgbmclf),
        ]
    )

    return pipe

In [None]:
# Number of distinct labels decides between the binary and multiclass setup.
n_classes = len(set(y_train.ravel().tolist()))
is_binary = n_classes == 2

# Settings applied on top of every trial's suggestions.
# The former `min_samples_in_leaf=3` entry was dropped: it is not a LightGBM
# parameter, so it was ignored with an "unknown parameter" warning. The leaf
# size is tuned through `min_child_samples` in the objective instead.
# NOTE(review): confirm a fixed leaf size of 3 was not the actual intent.
fixed_params = dict(
    num_threads=8,
    random_state=seed,
    objective="binary" if is_binary else "softmax",
    metric="binary_logloss" if is_binary else None,
    num_class=None if is_binary else n_classes,
    importance_type="gain",
)


class PipelineEvaluator(object):
    """Optuna objective holder for the TF-IDF + LightGBM text pipeline."""

    def __init__(self, X: np.ndarray, y: np.ndarray) -> None:
        # The full dataset is kept; the split is redone inside every trial.
        self.X, self.y = X, y

    def objective(self, trial: optuna.Trial):
        """Fit the pipeline with the trial's parameters; return hold-out accuracy.

        The 70/30 split uses a fixed random_state, so every trial is scored on
        the same hold-out rows. Suggested values are overridden by any key
        also present in the module-level ``fixed_params``.
        """
        (
            texts_fit,
            texts_holdout,
            labels_fit,
            labels_holdout,
        ) = train_test_split(
            self.X, self.y, test_size=0.3, shuffle=True, random_state=42
        )

        # Search space; the order of the suggest calls is kept unchanged.
        search_space = {
            "n_components": trial.suggest_int("n_components", 2, 64),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
            "num_leaves": trial.suggest_int("num_leaves", 2, 256),
            "num_trees": trial.suggest_int("num_trees", 10, 1000),
            "max_depth": trial.suggest_int("max_depth", 4, 5),
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 40),
        }
        pipeline_params = {**search_space, **fixed_params}

        pipe = build_pipleline_with_tfidf(
            tokener=JpTokenizerJanome(),
            n_classes=len(set(labels_fit.ravel().tolist())),
            params=pipeline_params,
        )

        pipe.fit(texts_fit, labels_fit)
        predicted_labels = pipe.predict(texts_holdout)
        return sklearn.metrics.accuracy_score(labels_holdout, predicted_labels)

In [None]:
pev = PipelineEvaluator(X, y)
# Seed the sampler so the sequence of suggested hyper-parameters is the same
# on every run; without it each execution explores different trials.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(pev.objective, n_trials=100)

print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.params)

In [None]:
# Rebuild the pipeline from the best trial, with the fixed settings on top.
params = {**study.best_params, **fixed_params}
tokener = JpTokenizerJanome()
n_classes = np.unique(y_train).size
pipe = build_pipleline_with_tfidf(
    tokener=tokener,
    n_classes=n_classes,
    params=params,
)
pipe

In [None]:
pipe.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score


# predict trainset
p = pipe.predict(X_train)
train_acc = accuracy_score(y_train, p)

In [None]:
train_acc

In [None]:
# predict validset
p = pipe.predict(X_valid)
valid_acc = accuracy_score(y_valid, p)

In [None]:
valid_acc