In [1]:
import jieba
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os, glob, time, copy, random, zipfile
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F
import torchvision
from torchvision import models, transforms

In [2]:
from os import listdir
from os.path import isfile, join,splitext
t_dir = 'dataTrainComplete'
txt_fnames = [splitext(f)[0] for f in listdir(t_dir) if isfile(join(t_dir, f))]

In [3]:
crop_list = open('Keywords/02crop.list.csv', "r",encoding='UTF-8-sig')
crop = crop_list.read()
crop_line_sep = crop.splitlines()

pest_list = open('Keywords/02pest.list.csv', "r",encoding='UTF-8-sig')
pest = pest_list.read()
pest_line_sep = pest.splitlines()

chem_list = open('Keywords/02chem.list.csv', "r",encoding='UTF-8-sig')
chem = chem_list.read()
chem_line_sep = chem.splitlines()

In [4]:
from itertools import chain
import csv
vector_dict = {}
for idx,line in enumerate(chain(crop_line_sep,pest_line_sep,chem_line_sep)):
    l = line.split(',')
    for word in l:
        if(word == ''):continue
        jieba.add_word(word)
        vector_dict[word] = idx

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Renewrr\AppData\Local\Temp\jieba.cache
Loading model cost 0.688 seconds.
Prefix dict has been built successfully.


In [5]:
vectors = {}
for fname in txt_fnames:
    txt = open(t_dir+'/'+fname+'.txt', "r",encoding="utf-8")
    content = txt.read()
    seg_list = jieba.cut(content, cut_all=True)
    vectors[fname] = [0]*764
    for seg in seg_list:
        if(seg in vector_dict):
            vectors[fname][vector_dict[seg]] += 1

In [6]:
labels = {}
pair_list = []
for fname1 in txt_fnames:
    for fname2 in txt_fnames:
        if(fname1 == fname2):continue
        else:
            labels[(fname1,fname2)] = 0
            pair_list.append((fname1,fname2))
corr_list = open('TrainLabel.csv', "r",encoding='UTF-8-sig')
corr = corr_list.read()
corr_line_sep = corr.splitlines()
for line in corr_line_sep[1:]:
    l = line.split(',')
    labels[(l[0],l[1])] = 1
print(len(labels))
print(sum(labels.values()))
print(len(pair_list))

313040
1383
313040


In [89]:
pos_pair_list = []
for line in corr_line_sep[1:]:
    l = line.split(',')
    pos_pair_list.append((l[0],l[1]))
neg_pair_list = []
for fname1 in txt_fnames:
    for fname2 in txt_fnames:
        if(fname1 == fname2):continue
        if((fname1,fname2) not in pos_labels):
            neg_pair_list.append((fname1,fname2))
print(len(pos_pair_list)+len(neg_pair_list))

313040


In [110]:
# Dataset
import random
#random.sample(range(100), 10)
class PartDataset(data.Dataset):
    
    def __init__(self, pos_pair_list, neg_pair_list, vectors, phase='train'):
        self.l = len(pos_pair_list)
        neg_keys = random.sample(neg_pair_list,self.l)
        self.labels = {}
        for key in neg_keys:self.labels[key] = 0
        for key in pos_pair_list:self.labels[key] = 1
        self.pair_list = pos_pair_list+neg_keys
        self.vectors = vectors
        self.phase = phase
        
    def __len__(self):
        return self.l*2
    
    def __getitem__(self, idx):
        Test,Ref = self.pair_list[idx]
        label = self.labels[(Test,Ref)]
        comb_vector = self.vectors[Test] + self.vectors[Ref]
        return torch.tensor(comb_vector), label

In [111]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [112]:
train_dataset = PartDataset(pos_pair_list, neg_keys, vectors,phase='train')
test_dataset = PartDataset(pos_pair_list, neg_keys, vectors,phase='test')

In [113]:
print('Operation Check')
#print(labels[('3','415')])
#print(pair_list.index(('3','415')))
print(train_dataset.__getitem__(133886)[1])

Operation Check


IndexError: list index out of range

In [115]:
# DataLoader
train_dataloader = data.DataLoader(train_dataset, batch_size=20, shuffle=True)
test_dataloader = data.DataLoader(test_dataset, batch_size=20, shuffle=False)

dataloader_dict = {'train': train_dataloader, 'test': test_dataloader}

# Operation Check
print('Operation Check')
batch_iterator = iter(train_dataloader)
inputs, label = next(batch_iterator)
print(label)

Operation Check
tensor([1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])


In [116]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(2*764, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 20),
            nn.Sigmoid(),
            nn.Linear(20, 2),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [117]:
net = NeuralNetwork()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=net.parameters(), lr=0.001, momentum=0.9)
print(net)

NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=1528, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=20, bias=True)
    (5): Sigmoid()
    (6): Linear(in_features=20, out_features=2, bias=True)
  )
)


In [120]:
import time
def train_model(net, dataloader_dict, criterion, optimizer, num_epoch):
    
    since = time.time()
    best_model_wts = copy.deepcopy(net.state_dict())
    best_acc = 0.0
    best_lost = 100
    net = net.to(device)
    
    for epoch in range(num_epoch):
        print('Epoch {}/{}'.format(epoch + 1, num_epoch))
        print('-'*20)
        
        for phase in ['train', 'test']:
            
            if phase == 'train':
                net.train()
            else:
                net.eval()
                
            epoch_loss = 0.0
            epoch_corrects = 0
            
            for inputs, labels in tqdm(dataloader_dict[phase]):
                inputs = inputs.type(torch.FloatTensor).to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        
                    epoch_loss += loss.item() * inputs.size(0)
                    epoch_corrects += torch.sum(preds == labels.data)
                    
            epoch_loss = epoch_loss / len(dataloader_dict[phase].dataset)
            epoch_acc = epoch_corrects.double() / len(dataloader_dict[phase].dataset)
            
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            
            # deep copy the model
            if phase == 'test' and epoch_acc > best_acc:
            #if phase == 'test' and epoch_loss < best_lost:
                best_acc = epoch_acc
                #best_lost = epoch_loss
                best_model_wts = copy.deepcopy(net.state_dict())
                print(time.time())
                torch.save(net.state_dict(), 'best_checkpoint_last.pth')
                
                
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    net.load_state_dict(best_model_wts)
    return net

In [121]:
num_epoch = 20
net = train_model(net, dataloader_dict, criterion, optimizer, num_epoch)

Epoch 1/20
--------------------


  0%|          | 0/139 [00:00<?, ?it/s]

train Loss: 0.0745 Acc: 0.9812


  0%|          | 0/139 [00:00<?, ?it/s]

test Loss: 0.0617 Acc: 0.9863
1638816773.9479706
Epoch 2/20
--------------------


  0%|          | 0/139 [00:00<?, ?it/s]

train Loss: 0.0665 Acc: 0.9837


  0%|          | 0/139 [00:00<?, ?it/s]

test Loss: 0.0555 Acc: 0.9888
1638816775.092549
Epoch 3/20
--------------------


  0%|          | 0/139 [00:00<?, ?it/s]

train Loss: 0.0604 Acc: 0.9845


  0%|          | 0/139 [00:00<?, ?it/s]

test Loss: 0.0532 Acc: 0.9906
1638816776.2164297
Epoch 4/20
--------------------


  0%|          | 0/139 [00:00<?, ?it/s]

train Loss: 0.0578 Acc: 0.9837


  0%|          | 0/139 [00:00<?, ?it/s]

test Loss: 0.0499 Acc: 0.9888
Epoch 5/20
--------------------


  0%|          | 0/139 [00:00<?, ?it/s]

train Loss: 0.0511 Acc: 0.9884


  0%|          | 0/139 [00:00<?, ?it/s]

test Loss: 0.0416 Acc: 0.9931
1638816778.4328642
Epoch 6/20
--------------------


  0%|          | 0/139 [00:00<?, ?it/s]

train Loss: 0.0465 Acc: 0.9892


  0%|          | 0/139 [00:00<?, ?it/s]

test Loss: 0.0431 Acc: 0.9931
Epoch 7/20
--------------------


  0%|          | 0/139 [00:00<?, ?it/s]

train Loss: 0.0425 Acc: 0.9910


  0%|          | 0/139 [00:00<?, ?it/s]

test Loss: 0.0375 Acc: 0.9917
Epoch 8/20
--------------------


  0%|          | 0/139 [00:00<?, ?it/s]

train Loss: 0.0393 Acc: 0.9924


  0%|          | 0/139 [00:00<?, ?it/s]

test Loss: 0.0372 Acc: 0.9910
Epoch 9/20
--------------------


  0%|          | 0/139 [00:00<?, ?it/s]

train Loss: 0.0376 Acc: 0.9917


  0%|          | 0/139 [00:00<?, ?it/s]

KeyboardInterrupt: 