In [3]:
import os, re, shutil
import json
import jieba, numpy
import string, gensim
import threading
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec
from collections import Counter
from tqdm import tqdm

In [4]:
def extract_jfull(filename: str):
    with open(os.path.join(dirpath + '/drug_case', filename), "r", encoding = 'UTF-8') as f:
        data = json.load(f)['JFULL']
        data = data.replace(' ', '').replace('　', '')
    target0 = data.find('主文\r\n')
    # 如果案字帶有秩的，代表為社會秩序維護法的案子，並不在我們的管理範圍內
    if "秩" in filename: 
        return ""
    
    # 如果 JFULL 找不到主文，那就代表是 pdf 檔案
    # 暫定不處置
    if target0 != -1:
        target1 = []
        if data.find('事實及理由\r\n') != -1:
            target1.append(data.find('事實及理由\r\n', target0))
        if data.find('犯罪事實\r\n') != -1:
            target1.append(data.find('犯罪事實\r\n', target0))
        if data.find('理由\r\n') != -1:
            target1.append(data.find('理由\r\n', target0))
        if data.find('事實\r\n') != -1:
            target1.append(data.find('事實\r\n', target0))
        if data.find('中華民國') != -1:
            target1.append(data.find('中華民國', target0))
        target1 = min(target1)

        data = data[target1:].replace('\r\n', '')
        data = re.split(r'[，。「」（）『』【】；：、]', data)
        return data
    else:
        return ""

In [47]:
desktop = os.environ['USERPROFILE'] + '/Desktop'
dirpath = desktop + '/drug_dataset'

model = Doc2Vec.load(dirpath + "/drug_model.bin")
filelist = os.listdir(dirpath)
tmp = open(dirpath + '/label.txt', 'r', encoding = 'utf-8').readlines()
label = {}
train = []
train_label = []


for i in tqdm(tmp):
    filename, sencnt = i.split()
    label[filename] = sencnt
    # print(model.infer_vector(extract_jfull(filename)))
    # article = extract_jfull(filename)
    # while len(article) < 300: article = article + article
    # article = article[:300]
    # artvec = []
    # for i in article:
    #     artvec.extend(model.infer_vector([i]))
    train.append(model.infer_vector(extract_jfull(filename)))
    train_label.append(sencnt)

train = np.array(train)
train_label = np.array(train_label)

print(train.shape, train_label.shape)

100%|██████████| 24669/24669 [00:27<00:00, 884.98it/s] 

(24669, 300) (24669,)
['犯罪事實一', '郭權葳基於施用第一', '二級毒品之犯意', '於民國108年11月20日12時許', '在彰化縣鹿港鎮臺17線附近', '於其使用之汽車內', '以將海洛因與甲基安非他命置入玻璃球管內', '再以火焰燒灼玻璃球管', '使其內海洛因與甲基安非他命轉化為煙霧狀後', '以口鼻吸食之方式', '同時施用第一級毒品海洛因與第二級毒品甲基安非他命1次', '嗣於108年11月21日0時5分許', '在彰化縣鹿港鎮永康路與永寧街98巷口', '因毒品通緝案件為警查獲', '並扣得第一級毒品海洛因3包', '驗餘淨重共2.87公克', '', '第二級毒品甲基安非他命1包', '驗餘淨重1.1325公克', '', '殘渣袋1批', '分裝袋1批及塑膠鏟管5支等物', '經警採集其尿液送驗', '結果呈嗎啡', '可待因', '安非他命及甲基安非他命陽性反應', '二', '案經彰化縣警察局鹿港分局報告臺灣彰化地方檢察署檢察官偵查起訴', '理由一', '上開犯罪事實', '業經被告郭權葳於本院準備', '審理程序時坦承不諱', '且有搜索扣押筆錄', '扣押物品目錄表', '房屋租賃契約書', '蒐證照片', '扣案物照片', '毒品初步檢驗報告單', '彰化縣警察局鹿港分局尿液代號與真實姓名對照認證單', '臺灣檢驗科技股份有限公司出具之濫用藥物檢驗報告', '扣案之海洛因3包', '甲基安非他命1包', '塑膠剷管5支', '殘渣袋1組', '分裝袋1組', '法務部調查局濫用藥物實驗室鑑定書', '衛生福利部草屯療養院鑑驗書在卷可以佐證', '足見被告前揭任意性自白與事實相符', '從而', '本案事證明確', '應依法論科', '二', '核被告所為係犯毒品危害防制條例第10條第1項之施用第一級毒品罪及同條第2項之施用第二級毒品罪', '被告為施用毒品而持有第一', '二級毒品', '其持有之低度行為均應為施用之高度行為所吸收', '均不另論罪', '又被告於上開時', '地', '同時施用第一', '二級毒品', '係以一行為觸犯施用第一級毒品及施用第二級毒品2罪', '為想像競合犯', '應從一重論以施用第一級毒品罪', '三', '本件被告已為認罪之表示', '且經檢察官與被告於審判外達成協商之




## Create Dataset

In [6]:
import torch
from torch.utils.data import Dataset

class DrugDataset(Dataset):
    def __init__(self, X, y=None):
        self.data = torch.from_numpy(X).float()
        if y is not None:
            y = y.astype(np.int)
            self.label = torch.LongTensor(y)
        else:
            self.label = None

    def __getitem__(self, idx):
        if self.label is not None:
            return self.data[idx], self.label[idx]
        else:
            return self.data[idx]

    def __len__(self):
        return len(self.data)

Split the labeled data into a training set and a validation set, you can modify the variable `VAL_RATIO` to change the ratio of validation data.

In [8]:
VAL_RATIO = 0.1

percent = int(train.shape[0] * (1 - VAL_RATIO))
train_x, train_y, val_x, val_y = train[:percent], train_label[:percent], train[percent:], train_label[percent:]
print('Size of training set: {}'.format(train_x.shape))
print('Size of validation set: {}'.format(val_x.shape))

Size of training set: (22202, 300)
Size of validation set: (2467, 300)


Create a data loader from the dataset, feel free to tweak the variable `BATCH_SIZE` here.

In [10]:
BATCH_SIZE = 256

from torch.utils.data import DataLoader

train_set = DrugDataset(train_x, train_y)
val_set = DrugDataset(val_x, val_y)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True) #only shuffle the training data
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = y.astype(np.int)


Cleanup the unneeded variables to save memory.<br>

**notes: if you need to use these variables later, then you may remove this block or clean up unneeded variables later<br>the data size is quite huge, so be aware of memory usage in colab**

## Create Model

Define model architecture, you are encouraged to change and experiment with the model architecture.

In [70]:
import torch as t
import torch.nn as nn

class LSTM(nn.Module):
#建立LSTM class
    def __init__(self, input_dim = 300, hidden_dim = 512, layer_dim = 4, output_dim = 1):
        super(LSTM,self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.output_dim = output_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first = True, dropout = 0.5, bidirectional = True)
        self.linear = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        out, _ = self.lstm(x)
        return out

## Training

In [71]:
#check device
def get_device():
    return 'cuda' if torch.cuda.is_available() else 'cpu'
device = get_device()
print(device)

cuda


Feel free to change the training parameters here.

In [73]:
model = LSTM().to(device)
model.device = device
if os.path.exists('best_weight.pth'):
    try:
        model.load_state_dict(torch.load('best_weight.pth'))
    except: 
        print('Failed to load the model weight!')

# For the classification task, we use cross-entropy as the measurement of performance.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4, weight_decay = 1e-5)

# The number of training epochs.
n_epochs = 100
do_semi = True

for epoch in range(n_epochs):
    model.train()
    train_loss = []
    train_accs = []

    # Iterate the training set by batches.
    for batch in tqdm(train_loader):
        imgs, labels = batch
        logits = model(imgs.to(device))
        loss = criterion(logits, labels.to(device))

        # Gradients stored in the parameters in the previous step should be cleared out first.
        optimizer.zero_grad()
        # Compute the gradients for parameters.
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)
        optimizer.step()
        acc = (logits == labels.to(device)).float().mean()

        # Record the loss and accuracy.
        train_loss.append(loss.item())
        train_accs.append(acc)

    # The average loss and accuracy of the training set is the average of the recorded values.
    train_loss = sum(train_loss) / len(train_loss)
    train_acc = sum(train_accs) / len(train_accs)

    # ---------- Validation ----------
    # Make sure the model is in eval mode so that some modules like dropout are disabled and work normally.
    model.eval()

    # These are used to record information in validation.
    valid_loss = []
    valid_accs = []

    # Iterate the validation set by batches.
    for batch in tqdm(val_loader):
        imgs, labels = batch
        with torch.no_grad():
          logits = model(imgs.to(device))
        loss = criterion(logits, labels.to(device))
        acc = (logits.argmax(dim=-1) == labels.to(device)).float().mean()
        valid_loss.append(loss.item())
        valid_accs.append(acc)

    # The average loss and accuracy for entire validation set is the average of the recorded values.
    valid_loss = sum(valid_loss) / len(valid_loss)
    valid_acc = sum(valid_accs) / len(valid_accs)
    best_acc = 0
    
    if valid_acc > best_acc:
        best_model = model
        best_acc = valid_acc
        torch.save(model.state_dict(), "best_weight.pth")

    # Print the information.
    print(f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")
    print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

## Testing

Create a testing dataset, and load model from the saved checkpoint.

In [49]:
# create testing dataset
test_set = DrugDataset(val_x)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

Make prediction and output the result.

In [55]:
predict = []
model.eval() # set the model to evaluation mode
with torch.no_grad():
    for batch in tqdm(test_loader):
        imgs = batch
        with torch.no_grad():
            logits = model(imgs.to(device))
            answer = logits.argmax(dim=-1).cpu().numpy().tolist()
            predict.extend(answer)
            print(answer)


 40%|████      | 4/10 [00:00<00:00,  9.56it/s]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4]
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

100%|██████████| 10/10 [00:00<00:00, 13.83it/s]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4]
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,




In [38]:
# put the result to csv

with open(desktop + '/prediction.csv', 'w') as f:
    f.write('Id,Class\n')
    for i, y in enumerate(predict):
        f.write('{},{}\n'.format(i, y))