In [None]:
import pandas
from janome.tokenizer import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
TEST_SIZE = 0.2
def split_data_frame(df: pandas.DataFrame, objectiv_col: str) -> list:
    """Split ``df`` into training and validation features/targets.

    The frame is partitioned with ``train_test_split`` using the module-level
    ``TEST_SIZE`` fraction and a fixed ``random_state`` of 42, then the target
    column is separated from the remaining columns in each partition.

    Args:
        df: Frame holding both the feature columns and the target column.
        objectiv_col: Name of the target column.

    Returns:
        ``[train_x, train_y, val_x, val_y]`` where the ``*_x`` items are
        ``df`` partitions without ``objectiv_col`` and the ``*_y`` items are
        the ``objectiv_col`` column of the same partition.
    """
    partitions = train_test_split(df, test_size=TEST_SIZE, random_state=42)

    pieces = []
    for part in partitions:  # training partition first, validation second
        pieces.append(part.drop(columns=objectiv_col))
        pieces.append(part[objectiv_col])
    return pieces

def clean_text(text):
    """Normalise whitespace and line-break markers in ``text``.

    Substitutions are applied in the listed order (the order matters: spaces
    are removed before ``__BR__`` is expanded), and finally any leading
    newlines are stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (' ', ''),         # ASCII space
        ('　', ''),        # full-width (ideographic) space
        ('__BR__', '\n'),  # line-break placeholder -> real newline
        ('\xa0', ''),      # non-breaking space
        ('\r', ''),        # carriage return
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned.lstrip('\n')

# Load the train/test CSVs, using the first column as the index.
# DataFrame.drop returns a new frame rather than modifying in place, so the
# result must be assigned back for the "source" column to actually be removed.
df_hate_train = pandas.read_csv("hate/train.csv", index_col=0)
df_hate_train = df_hate_train.drop("source", axis=1)
df_hate_test = pandas.read_csv("hate/test.csv", index_col=0)
df_hate_test = df_hate_test.drop("source", axis=1)

# Normalise the raw text of every data set with the same cleaning routine.
df_hate_train['text'] = df_hate_train['text'].apply(clean_text)
df_hate_test['text'] = df_hate_test['text'].apply(clean_text)
df_tweet = pandas.read_csv("hate/tweet.csv", index_col=0)
df_tweet['text'] = df_tweet['text'].apply(clean_text)

# Hold out part of train.csv for validation; "label" is the target column.
train_x, train_y, val_x, val_y = split_data_frame(df_hate_train, "label")

# bert
from transformers import BertJapaneseTokenizer, BertModel
import torch

class SentenceBertJapanese:
    """Sentence encoder: Japanese BERT tokenizer + model with mean pooling.

    Loads a ``BertJapaneseTokenizer`` / ``BertModel`` pair from
    ``model_name_or_path`` and converts sentences into one embedding vector
    each by averaging the token embeddings under the attention mask.
    """

    def __init__(self, model_name_or_path, device=None):
        """Load tokenizer and model and move the model to ``device``.

        Args:
            model_name_or_path: Identifier or path passed to ``from_pretrained``.
            device: Device string/object for the model. When ``None``, "cuda"
                is used if available, otherwise "cpu".
        """
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(model_name_or_path)
        self.model = BertModel.from_pretrained(model_name_or_path)
        # Evaluation mode only; this does not by itself turn off gradient
        # tracking, which encode() disables explicitly.
        self.model.eval()

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model.to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, counting only positions where the mask is set.

        Args:
            model_output: Model output whose first element holds the token
                embeddings.
            attention_mask: Mask tensor; it is broadcast over the embedding
                dimension so masked-out positions contribute nothing.

        Returns:
            One pooled vector per sequence.
        """
        token_embeddings = model_output[0]  # first element contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the token count so an all-zero mask cannot divide by zero.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def encode(self, sentences, batch_size=8):
        """Encode ``sentences`` into a CPU tensor with one embedding row each.

        The forward pass runs under ``torch.no_grad()``: the embeddings are
        used as fixed features, so no autograd graph is kept and the result
        can be converted to NumPy directly.

        Args:
            sentences: Iterable of strings (list, pandas Series, generator, ...).
                It is materialised into a list so batching is always
                positional and the tokenizer receives a plain list.
            batch_size: Number of sentences tokenized and encoded per step.

        Returns:
            Tensor with one row per sentence, in input order. For empty input
            a ``(0, hidden_size)`` tensor is returned.
        """
        sentences = list(sentences)
        if not sentences:
            # torch.stack cannot handle an empty list; return an empty matrix
            # with the model's embedding width instead.
            return torch.empty((0, self.model.config.hidden_size))

        all_embeddings = []
        with torch.no_grad():
            for batch_idx in range(0, len(sentences), batch_size):
                batch = sentences[batch_idx:batch_idx + batch_size]

                encoded_input = self.tokenizer.batch_encode_plus(
                    batch, padding="longest", truncation=True, return_tensors="pt"
                ).to(self.device)
                model_output = self.model(**encoded_input)
                sentence_embeddings = self._mean_pooling(
                    model_output, encoded_input["attention_mask"]
                ).to('cpu')

                # Iterating a 2-D tensor yields its rows, one per sentence.
                all_embeddings.extend(sentence_embeddings)

        return torch.stack(all_embeddings)
    

In [None]:
# Pretrained checkpoint used to build the sentence encoder.
MODEL_NAME = "sonoisa/sentence-bert-base-ja-mean-tokens-v2"
bert = SentenceBertJapanese(MODEL_NAME)

print("bert model create")

# The embeddings are only used as fixed input features for the classifiers
# below, so the forward passes run without autograd: this avoids keeping the
# computation graph in memory and yields tensors that convert to NumPy.
# The text columns are passed as plain lists so batching is positional and
# the tokenizer never receives a pandas object.
with torch.no_grad():
    train_vectors = bert.encode(train_x['text'].tolist())
    val_vectors = bert.encode(val_x['text'].tolist())
    tweet_vectors = bert.encode(df_tweet['text'].tolist())


print("text vectors create")

In [None]:
%matplotlib inline

def print_probas(y, probas):
    """Draw the ROC curve for column 1 of ``probas`` and print its AUC.

    Args:
        y: True labels.
        probas: 2-D array of per-class scores; only column index 1 is used.
    """
    scores = probas[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(y, scores)

    plt.style.use("fivethirtyeight")
    figure, axes = plt.subplots()
    figure.set_size_inches(4.8, 5)

    # Stepped outline of the curve with the area underneath shaded.
    axes.step(false_pos_rate, true_pos_rate, "gray")
    axes.fill_between(false_pos_rate, true_pos_rate, 0, color="skyblue", alpha=0.8)
    axes.set(
        xlabel="False Positive Rate",
        ylabel="True Positive Rate",
        facecolor="xkcd:white",
    )

    auc = roc_auc_score(y, scores)
    print(f"AUC:{auc}")
    plt.show()

def calculate(model):
    """Fit ``model`` on the training vectors and report its evaluation.

    Uses the module-level ``train_vectors`` / ``train_y`` for fitting and
    then delegates to ``predict`` to print the metrics.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    """
    # Train on the embedded training split.
    model.fit(train_vectors, train_y)
    # Evaluate the fitted model.
    predict(model)

def predict(model):
    """Print accuracy and ROC/AUC of ``model`` on the validation and tweet sets.

    Reads the module-level ``val_vectors`` / ``val_y`` and
    ``tweet_vectors`` / ``df_tweet["label"]``.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
    """
    def _report(banner, labels, vectors):
        # Banner, accuracy of the hard predictions, then the ROC plot and AUC
        # computed from the predicted probabilities.
        print(banner)
        accuracy = accuracy_score(labels, model.predict(vectors))
        print("正解率:" + str(accuracy))
        print_probas(labels, model.predict_proba(vectors))

    _report("-----------------------train.csvデータ----------------", val_y, val_vectors)
    _report("-----------------------tweet.csvデータ----------------", df_tweet["label"], tweet_vectors)

In [None]:
# Neural network: fit and evaluate a multi-layer perceptron classifier with a
# single hidden layer of 16 units; random_state is fixed at 42.
from sklearn.neural_network import MLPClassifier
calculate(MLPClassifier(hidden_layer_sizes=(16,), random_state=42))