In [1]:
%pip install transformers



In [2]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch,gc
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import transformers
import json
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
from transformers import (
  BertTokenizerFast,
  AutoModelForSequenceClassification,
)
import logging
from torch import cuda

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = pd.read_csv("/content/drive/MyDrive/文字探勘/final/shopee_item_category.csv")

In [5]:
df

Unnamed: 0,商品名稱,大類,小類
0,PHILIPS 飛利浦 全自動義式咖啡機-EP3246 (金)+湛盧咖啡豆券8張(24包),家用電器,咖啡機與周邊
1,PHILIPS 飛利浦 小旋風電動洗鞋機 (GCA1000),家用電器,其他家電
2,PHILIPS 飛利浦 2021旗艦款八合一乾濕兩用拔刮美體刀 除毛刀 BRE740,美妝保養,除毛器材
3,PHILIPS 飛利浦 電鬍刀刀頭 RQ11,美妝保養,電動刮鬍刀
4,PHILIPS 飛利浦 全自動義式咖啡機-EP3246 (金),家用電器,咖啡機與周邊
...,...,...,...
253280,白底青平安扣 贈精美中國繩乙入 天然緬甸硬玉A貨【文華珠寶翡翠專賣店】,愛好與收藏品,原石水晶
253281,Q版可愛牛牛玉墜 天然緬甸硬玉A貨 贈中國繩乙入【文華珠寶翡翠專賣店】,愛好與收藏品,原石水晶
253282,冰種平安扣 起螢放光 購買即贈中國繩乙入 天然緬甸硬玉A貨【文華珠寶翡翠專賣店】,愛好與收藏品,原石水晶
253283,冰種觀音玉墜 贈精美中國繩乙條 天然緬甸硬玉A貨【文華珠寶翡翠專賣店】,愛好與收藏品,原石水晶


In [6]:
# rename column
df = df.rename(columns = {"商品名稱":"ProductName","大類":"Category","小類":"SubCategory"})

In [7]:
# 清除括號內的文字
df["ProductName"] = df["ProductName"].replace(r'\([^)]*\)',"",regex=True)
# 清除特殊符號
special_symbols = r'[.．＂/<>:《》+\-=#$%&()~*@＃＄％＆＇\(\)\[\]\{\}（）＊＋－～／：\＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏★→─]+'
df["ProductName"] = df["ProductName"].replace(special_symbols,"",regex=True)
# 清除任何由英文及數字組成的字串
df["ProductName"] = df["ProductName"].replace(r'[A-Z]+[0-9]+',"",regex=True)
# 清除空格
df["ProductName"] = df["ProductName"].replace(r'\s',"",regex=True)

In [8]:
# label the Catoegory,sub-category to "Mask" except "衣著"
df.loc[(df['Category'] != "男生衣著") & (df['Category'] != "女生衣著"), 'SubCategory'] = "Mask"
df.loc[(df['Category'] != "男生衣著") & (df['Category'] != "女生衣著"), 'Category'] = "Mask"

In [9]:
df["Category"].value_counts()

Mask    222210
女生衣著     21045
男生衣著     10030
Name: Category, dtype: int64

In [10]:
df["SubCategory"].value_counts()

Mask          222210
T恤              7916
牛仔褲             2756
襪子              2536
內褲              2147
內衣              1806
襯衫              1706
洋裝              1643
長褲              1607
短褲              1119
裙裝               946
毛衣 / 針織          897
大學T              818
帽T               799
細肩帶 / 繞頸背心       553
外套               512
冬季外套 / 大衣        472
夾克               291
內衣褲              227
衛生衣              225
Polo衫            216
吊帶褲 / 吊帶裙        193
其他               182
個人套裝             154
成套內衣褲            123
工作褲              121
緊身褲 / 內搭褲        106
絲襪               101
背心外套              96
縮口褲               90
其他外套              90
哺乳內衣              83
斗篷                75
褲裙                73
西裝                71
背心                69
坦克                62
居家服               56
塑身衣               55
連身長褲              52
上衣                16
套裝                15
Name: SubCategory, dtype: int64

In [11]:
# using gpu for training
device = 'cuda' if cuda.is_available() else 'cpu'

# Preparing

In [12]:
# sampling data
tmp1 = df[df["Category"] != "Mask"]
tmp2 = (df[df["Category"] == "Mask"]).sample(frac = 0.1, random_state = 12)

df_sample = pd.concat([tmp1,tmp2])


In [13]:
df_sample["Category"].value_counts()

Mask    22221
女生衣著    21045
男生衣著    10030
Name: Category, dtype: int64

In [14]:
df_sample.to_csv("./training.csv")

In [None]:
# encoding the label for classifier's target
df_sample['label'] = df_sample['Category'] + ' ' + df_sample['SubCategory']
df_sample['label'], _ = pd.factorize(df_sample['label'])

In [None]:
df_sample["label"].value_counts()

53    22221
7      4136
0      3780
23     2147
21     1806
3      1643
24     1559
5      1472
8      1328
40     1284
1      1257
26      977
2       946
6       897
10      634
4       553
28      533
18      512
43      511
41      485
15      378
16      350
11      342
13      291
29      288
33      285
37      227
48      225
9       193
46      176
32      154
38      130
39      123
42      121
36      112
25      106
22      101
34       96
45       90
12       90
30       83
19       75
31       73
14       71
20       70
17       69
51       62
44       56
27       55
35       52
47       40
50       16
49        8
52        7
Name: label, dtype: int64

In [None]:
Pname = df_sample["ProductName"].to_numpy()
label = df_sample["label"].to_numpy()

In [None]:
# load model and tokenizer and define length of the text sequence
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

In [None]:
# split the training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(Pname, label, test_size=0.2, random_state=12)

In [None]:
# set the hyper parameter
MAX_LENGTH = 64
BATCH_SIZE = 128
LEARNING_RATE = 2e-5
EPOCH_NUM = 6

# Tokenization

In [None]:
def tokenization(Pname,label):
  input_ids = []
  attention_masks = []

  for name in Pname:
    encoded_dict = tokenizer.encode_plus(
        name,
        add_special_tokens = True,
        max_length = MAX_LENGTH,
        padding = "max_length",
        truncation = True,
        pad_to_max_length = True,
        return_attention_mask = True,
        return_tensors = 'pt',
    )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask
    attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  labels = torch.tensor(label)
  return input_ids,attention_masks,labels


In [None]:
input_ids,attention_masks,labels = tokenization(X_train,y_train)

In [None]:
dataset = TensorDataset(input_ids,attention_masks,labels)
train_dataloader = DataLoader(
    dataset,
    batch_size = BATCH_SIZE,
    shuffle = True
)

In [None]:
# del dataset

# Training

In [None]:
# setting the ckip/bert model
model = AutoModelForSequenceClassification.from_pretrained(
    "ckiplab/bert-base-chinese",
    num_labels = 54,
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# defined the optimizer
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = LEARNING_RATE,
                  eps = 1e-8
                )
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# start training
for epoch in range(EPOCH_NUM):
  count_loss = 0
  model.train()
  print("Epoch:",epoch)
  for batch_num, batch in enumerate(train_dataloader):
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

    optimizer.zero_grad()

    outputs = model(input_ids,attention_mask = attention_mask,labels = labels)
    loss = outputs.loss
    loss.backward()
    count_loss+=loss
    optimizer.step()

  print("Avg loss:",count_loss/(len(X_train) / BATCH_SIZE))
  gc.collect()
  torch.cuda.empty_cache()

Epoch: 0
Avg loss: tensor(1.1087, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 1
Avg loss: tensor(0.4549, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 2
Avg loss: tensor(0.3432, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 3
Avg loss: tensor(0.2828, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 4
Avg loss: tensor(0.2439, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 5
Avg loss: tensor(0.2119, device='cuda:0', grad_fn=<DivBackward0>)


In [None]:
# Evaluation
model.eval()
input_ids_val,attention_masks_val,labels_val = tokenization(X_test,y_test)
val_dataset = TensorDataset(input_ids_val,attention_masks_val,labels_val)
val_dataloader = DataLoader(
    val_dataset,
    batch_size = BATCH_SIZE,
    shuffle = True
)

In [None]:
# split the testing data into batches, so the RAM won't out of memory
predicted_labels = np.array([])
batches = 0
temp_acc = []
temp_pred = []
temp_label = []
with torch.no_grad():
  for batch_num, batch in enumerate(val_dataloader):
    batches += 1
    input_ids_val,attention_masks_val,labels_val = batch
    input_ids_val,attention_masks_val = input_ids_val.to(device),attention_masks_val.to(device)
    logits = model(input_ids_val,attention_mask = attention_masks_val)
    temp_predict = np.argmax(logits.logits.cpu().numpy(),axis = 1)
    labels_val = labels_val.numpy()
    temp_label = np.concatenate((temp_label,labels_val),axis = 0)
    temp_pred = np.concatenate((temp_pred,temp_predict),axis = 0)
    temp_acc.append(accuracy_score(labels_val,temp_predict))



accuracy = np.sum(temp_acc) / batches
print(f'Accuracy:{accuracy:.2f}')

Accuracy:0.93


In [None]:
torch.save(model, '/content/drive/MyDrive/文字探勘/final/model.pt')