In [1]:
from memory_profiler import memory_usage, profile

In [2]:
# region Import modules

import json
import logging
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import transformers
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from torch import cuda
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import BertConfig, BertModel, BertTokenizer, DistilBertModel, DistilBertTokenizer

import wandb

# Configure the root logger so that only ERROR-level (and above) records are emitted.
logging.basicConfig(level=logging.ERROR)
# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")

# endregion


In [3]:
# Memory checkpoint right after the import cell: sample the current process
# (proc=-1) every 0.2 s for up to 1 s. memory_profiler reports MiB.
memory_usage(proc=-1, interval=0.2, timeout=1)

[341.78125, 341.78125, 341.78125, 341.78125, 341.78125]

In [4]:
# Columns retained from each raw JSON dump.
keep_cols = ["text", "labels"]

train_file = "../multi_label_data/econbiz/train_data.json"
test_file = "../multi_label_data/econbiz/test_data.json"

# Parse each JSON file into a DataFrame and immediately narrow it to
# `keep_cols`, so the full-width frame is never bound to a name.
with open(train_file, "r", encoding="utf-8") as train_fp:
    train_df = pd.DataFrame(json.load(train_fp))[keep_cols]

with open(test_file, "r", encoding="utf-8") as test_fp:
    test_df = pd.DataFrame(json.load(test_fp))[keep_cols]

In [41]:
# Show the raw text of the training row labelled 2.
train_df.loc[2, "text"]

'Below-replacement fertility in industrial societies : causes, consequences, policies ; based on papers presented at a seminar held at the Hoover Institution, Stanford University, November 1985'

In [10]:
# Learn the label vocabulary from the training labels. `fit` returns the
# estimator itself, so the trailing bare name displays the fitted encoder.
label_encoder = MultiLabelBinarizer().fit(train_df["labels"])
label_encoder

MultiLabelBinarizer()

In [31]:
# Show the label list attached to the training row labelled 0.
train_df.loc[0, "labels"]

['11271-0', '15912-3', '17983-5']

In [36]:
# Encode one label set and inspect the shape of its single encoded row.
sample_labels = ['11271-0', '15912-3', '17983-5']
label_encoder.transform([sample_labels])[0].shape

(5658,)

In [23]:
# Trial encoding of a small slice. Despite the variable name, the rows
# come from the first 100 entries of train_df, not from test_df.
test_labels_enc = label_encoder.transform(train_df["labels"].iloc[:100])

In [30]:
# Display the first encoded row of the trial encoding.
test_labels_enc[0, :]

array([0, 0, 0, ..., 0, 0, 0])

In [5]:
# For each frame, print its shape followed by the per-column deep memory
# usage converted from bytes to MiB.
print(train_df.shape, train_df.memory_usage(deep=True) / 1024 / 1024, sep="\n")
print(test_df.shape, test_df.memory_usage(deep=True) / 1024 / 1024, sep="\n")

(994015, 2)
Index       0.000122
text      123.281032
labels     89.671364
dtype: float64
(70619, 2)
Index     0.000122
text      8.959422
labels    6.947754
dtype: float64


In [6]:
# Memory checkpoint following the frame-size report: sample the current
# process (proc=-1) every 0.2 s for up to 1 s.
memory_usage(proc=-1, interval=0.2, timeout=1)

[1210.62109375, 1210.62109375, 1210.62109375, 1210.62109375, 1210.62109375]

In [7]:
# This step consumes far too much memory: the resulting matrix is huge,
# roughly (1,000,000 x 5658), and memory usage jumps by about 10 GB.
label_encoder = MultiLabelBinarizer().fit(train_df["labels"].to_numpy())
# The generator's loop variable stays local, so no extra reference to the
# raw label lists is left behind in the notebook namespace.
train_labels_enc, test_labels_enc = (
    label_encoder.transform(frame["labels"].to_numpy())
    for frame in (train_df, test_df)
)

In [8]:
# Memory checkpoint following the full label-encoding cell: sample the
# current process (proc=-1) every 0.2 s for up to 1 s.
memory_usage(proc=-1, interval=0.2, timeout=1)

[12739.66796875,
 12739.66796875,
 12739.66796875,
 12739.66796875,
 12739.66796875]

In [9]:
# Report the container type, element dtype, shape and size in MiB of the
# encoded training matrix, one value per line.
print(
    type(train_labels_enc),
    train_labels_enc.dtype,
    train_labels_enc.shape,
    train_labels_enc.nbytes / 1024 / 1024,
    sep="\n",
)

<class 'numpy.ndarray'>
int32
(994015, 5658)
21454.379539489746


In [10]:
# Downcast both encoded matrices to int8 to shrink their footprint.
# NOTE: copy=False only skips the copy when the array already has the
# requested dtype; converting from a wider dtype still allocates a new
# array, and the old one is released once the name is rebound. The train
# matrix is converted first, so its wide original is dropped before the
# test matrix is converted.
train_labels_enc = train_labels_enc.astype(np.int8, copy=False)
test_labels_enc = test_labels_enc.astype(np.int8, copy=False)

In [11]:
# Memory checkpoint following the int8 downcast: sample the current
# process (proc=-1) every 0.2 s for up to 1 s.
memory_usage(proc=-1, interval=0.2, timeout=1)

[6956.40234375, 6956.40234375, 6956.40234375, 6956.40234375, 6956.40234375]

In [None]:
# This operation also makes memory usage explode (kept disabled on purpose):
# .tolist() turns every matrix element into a separate Python object.
# train_df["labels"] = train_labels_enc.tolist()
# test_df["labels"] = test_labels_enc.tolist()

In [13]:
# Replace the "labels" column of each frame with its encoded rows, one 1-D
# array per DataFrame row.
#
# Iterating a 2-D ndarray yields its rows as views into the original
# buffer, so `list(matrix)` produces the same per-row arrays as splitting
# the matrix into `shape[0]` pieces and squeezing each one, without the
# per-row split/squeeze overhead. It also behaves sensibly at the edges:
# an empty matrix gives an empty list (np.split raises on zero sections),
# and a single-column matrix keeps 1-D rows instead of collapsing to 0-d.
train_df["labels"] = list(train_labels_enc)
test_df["labels"] = list(test_labels_enc)

In [14]:
# Memory checkpoint after the encoded rows were written back into the
# frames: sample the current process (proc=-1) every 0.2 s for up to 1 s.
memory_usage(proc=-1, interval=0.2, timeout=1)

[7012.671875, 7012.671875, 7012.671875, 7012.671875, 7012.671875]

In [29]:
# Small toy arrays that all have 4 columns, used to try out row-wise
# concatenation. `a` is an empty float64 array with zero rows.
a = np.array([], dtype=np.float64).reshape((0, 4))
b = np.arange(0, 12).reshape((3, 4))
c = np.arange(0, 16).reshape((4, 4))

In [31]:
# Stack the zero-row array on top of `b` along the first axis (the default
# axis). The displayed result is float-valued, as the recorded output shows.
np.concatenate((a, b))

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])