In [1]:
import pandas
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
# Fraction of rows held out for validation; passed as test_size to train_test_split.
TEST_SIZE = 0.2
# Directory prefix that is prepended to the CSV file names read by this notebook.
FOLDER_PATH = "../data"
def split_data_frame(df: pandas.DataFrame, objectiv_col: str):
    """Split a frame into training and validation features/targets.

    The rows are divided with train_test_split using TEST_SIZE as the
    validation fraction and a fixed random_state of 42.

    Args:
        df: frame holding both the feature columns and the target column.
        objectiv_col: name of the target column.

    Returns:
        A list ``[train_x, train_y, val_x, val_y]`` where the ``*_x`` frames are
        the split rows without ``objectiv_col`` and the ``*_y`` values are the
        ``objectiv_col`` column of the same rows.
    """
    train_part, val_part = train_test_split(df, test_size=TEST_SIZE, random_state=42)

    pieces = []
    for part in (train_part, val_part):
        # Features first, then the matching target column.
        pieces.append(part.drop(objectiv_col, axis=1))
        pieces.append(part[objectiv_col])
    return pieces

def clean_text(text):
    """Strip whitespace noise from a text and turn ``__BR__`` markers into newlines.

    The substitutions are applied one after another in the order listed below,
    so a marker that only becomes ``__BR__`` once ASCII or full-width spaces are
    removed is still converted. Leading newlines are stripped from the result.
    """
    substitutions = (
        (' ', ''),          # ASCII space
        ('　', ''),         # full-width (ideographic) space
        ('__BR__', '\n'),   # line-break marker
        ('\xa0', ''),       # non-breaking space
        ('\r', ''),         # carriage return
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned.lstrip('\n')

# Load the labelled datasets. DataFrame.drop returns a new frame instead of
# modifying the original, so its result has to be assigned back for the
# "source" column to actually be removed.
df_hate_train = pandas.read_csv(FOLDER_PATH + "/train.csv", index_col=0)
df_hate_train = df_hate_train.drop("source", axis=1)
df_hate_test = pandas.read_csv(FOLDER_PATH + "/test.csv", index_col=0)
df_hate_test = df_hate_test.drop("source", axis=1)

# Normalise the text column of every dataset with clean_text.
df_hate_train['text'] = df_hate_train['text'].apply(clean_text)
df_hate_test['text'] = df_hate_test['text'].apply(clean_text)
df_tweet = pandas.read_csv(FOLDER_PATH + "/tweet.csv", index_col=0)
df_tweet['text'] = df_tweet['text'].apply(clean_text)

# Hold out part of train.csv for validation; "label" is the target column.
train_x, train_y, val_x, val_y = split_data_frame(df_hate_train, "label")

In [11]:
from transformers import AlbertTokenizer, AlbertForPreTraining
import torch

# Both the tokenizer and the model are loaded from the same pretrained checkpoint.
ALBERT_CHECKPOINT = 'ALINEAR/albert-japanese-v2'
tokenizer = AlbertTokenizer.from_pretrained(ALBERT_CHECKPOINT)
model = AlbertForPreTraining.from_pretrained(ALBERT_CHECKPOINT)

count = 0
# Total number of texts that will be vectorized. Named `total_count` rather
# than `max` so the built-in max() is not shadowed for the rest of the notebook.
total_count = len(train_x["text"]) + len(val_x["text"]) + len(df_tweet["text"])
# Offset that makes the progress line refresh every 10 texts and also on the
# very last text (count == total_count).
print_num = total_count % 10

def vectorize(text:str):
  """Encode one text into a fixed-length feature vector with the ALBERT model.

  The text is tokenized (special tokens included), run through the model as a
  batch of one, and the last hidden layer is averaged over the token axis.
  Per the Transformers documentation each hidden state has shape
  (batch_size, sequence_length, hidden_size), so the result has hidden_size
  entries regardless of the text length. The flattened `prediction_logits`
  (sequence_length * vocab_size values) is deliberately not used: its length
  changes with every text and it is far too large to keep for thousands of
  texts.

  Side effect: increments the global `count` and periodically rewrites a
  "count/total" progress line on stdout.

  Args:
    text: the text to encode.

  Returns:
    A list of floats holding the mean-pooled last hidden state.
  """
  # Progress display
  global count
  count += 1
  if count % 10 == print_num:
    print("\r", f"{count}/{total_count}", end="")

  input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    outputs = model(input_ids, output_hidden_states=True)
  # hidden_states[-1] is the output of the last layer; average over tokens (dim 1)
  # and drop the batch axis.
  return outputs.hidden_states[-1].mean(dim=1).squeeze(0).tolist()

Some weights of AlbertForPreTraining were not initialized from the model checkpoint at ALINEAR/albert-japanese-v2 and are newly initialized: ['sop_classifier.classifier.bias', 'sop_classifier.classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
count = 0  # reset the progress counter that vectorize() increments

def _vectorize_column(texts):
  """Vectorize every text and return the results as a 2-D feature frame.

  Series.apply(vectorize) yields a Series whose cells are Python lists, which
  scikit-learn cannot validate as an (n_samples, n_features) matrix. Expanding
  the lists into a DataFrame gives one row per text and one column per feature,
  while keeping the index of the input Series.
  """
  vectors = texts.apply(vectorize)
  return pandas.DataFrame(vectors.tolist(), index=texts.index)

train_vectors = _vectorize_column(train_x["text"])
val_vectors = _vectorize_column(val_x["text"])
tweet_vectors = _vectorize_column(df_tweet["text"])

 567/5307

In [None]:
%matplotlib inline

def print_probas(y, probas):
  """Draw the ROC curve for the scores in column 1 of `probas` and print the AUC.

  Args:
    y: true labels, forwarded to roc_curve and roc_auc_score.
    probas: array indexed as ``probas[:, 1]``; callers in this notebook pass
      the output of ``model.predict_proba``.
  """
  positive_scores = probas[:, 1]
  false_pos_rate, true_pos_rate, _ = roc_curve(y, positive_scores)

  plt.style.use("fivethirtyeight")
  figure, axes = plt.subplots()
  figure.set_size_inches(4.8, 5)

  # Step line for the curve itself, with the area underneath shaded.
  axes.step(false_pos_rate, true_pos_rate, "gray")
  axes.fill_between(false_pos_rate, true_pos_rate, 0, color="skyblue", alpha=0.8)
  axes.set_xlabel("False Positive Rate")
  axes.set_ylabel("True Positive Rate")
  axes.set_facecolor("xkcd:white")

  auc_value = roc_auc_score(y, positive_scores)
  print("AUC:" + str(auc_value))
  plt.show()

def calculate(model):
  """Fit `model` on the training vectors and labels, then report its scores.

  Args:
    model: estimator exposing ``fit``; it is passed on to predict() afterwards.
  """
  # Train on the training split.
  model.fit(train_vectors, train_y)
  # Print accuracy and ROC/AUC for the evaluation datasets.
  predict(model)

def _report_scores(model, heading, y_true, vectors):
  """Print a heading, the accuracy, and the ROC/AUC report for one dataset."""
  print(heading)
  accuracy = accuracy_score(y_true, model.predict(vectors))
  print("正解率:" + str(accuracy))
  print_probas(y_true, model.predict_proba(vectors))

def predict(model):
  """Report accuracy and ROC/AUC of a fitted model on both evaluation sets.

  The validation split of train.csv is reported first, then tweet.csv.

  Args:
    model: fitted estimator exposing ``predict`` and ``predict_proba``.
  """
  _report_scores(model, "-----------------------train.csvデータ----------------", val_y, val_vectors)
  _report_scores(model, "-----------------------tweet.csvデータ----------------", df_tweet["label"], tweet_vectors)

In [None]:
# Linear regression (placeholder: no model is trained in this cell yet)

In [None]:
# Neural network: a multi-layer perceptron with a single hidden layer of 16 units.
from sklearn.neural_network import MLPClassifier

mlp_classifier = MLPClassifier(hidden_layer_sizes=(16,), random_state=42)
calculate(mlp_classifier)