In [None]:
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import matplotlib
from matplotlib import font_manager
import matplotlib.pyplot as plt
from datetime import datetime

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from torch.optim import AdamW

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

In [None]:
# Local directory holding the pretrained bert-base-chinese files; the tokenizer
# and the model must be built from the same directory so their vocabularies agree.
PRETRAINED_DIR = "/root/bert-base-chinese"
# Fine-tuned weights to load on top of the pretrained model.
# NOTE(review): this path previously ended in ".pthh", which looks like a typo
# for the conventional ".pth" extension — confirm the file name on disk.
CHECKPOINT_PATH = "models/BERT_1211_132515/bert_epoch_10.pth"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_DIR)
# num_labels=8 has to match the classification head stored in the checkpoint,
# otherwise load_state_dict below fails with a shape mismatch.
model = BertForSequenceClassification.from_pretrained(PRETRAINED_DIR, num_labels=8).to(device)

# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
state_dict = torch.load(CHECKPOINT_PATH, map_location=device)
model.load_state_dict(state_dict)

In [None]:
# Number of rows encoded per forward pass; lower it if the device runs out of memory.
BATCH_SIZE = 32

df = pd.read_csv("data/1976_clean.csv")

# Inference only: put the model in evaluation mode.
model.eval()

# Texts to embed, in the same order as the rows of df.
texts = df["notes_list"].tolist()

# Collects one 1-D embedding vector per row of df.
embeddings = []
# Gradients are never needed here, so autograd is disabled once for the whole loop
# instead of being re-entered on every iteration.
with torch.no_grad():
    for start in tqdm(range(0, len(texts), BATCH_SIZE)):
        # Every sequence is padded/truncated to exactly 128 tokens, so all
        # batches share the same tensor shape.
        encoding = tokenizer(
            texts[start:start + BATCH_SIZE],
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        # Call the encoder only (model.bert), skipping the classification head.
        outputs = model.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Keep the hidden state at sequence position 0 (the leading [CLS] token)
        # for every row in the batch; the result is a 2-D (rows, hidden) array.
        cls_batch = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        # Extending a list with a 2-D array appends its rows one by one, so
        # `embeddings` remains a flat list of 1-D vectors.
        embeddings.extend(cls_batch)

# Guard against a silent row mismatch before attaching names and saving.
assert len(embeddings) == len(df), "expected exactly one embedding per input row"

embeddings_df = pd.DataFrame(embeddings)
embeddings_df['name'] = df['name']
embeddings_df.to_csv("data/1976_embeddings_bert.csv", index=False)