In [1]:
import jieba
import numpy as np
import os, glob, time, copy, random, zipfile
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F
import torchvision
from torchvision import models, transforms

In [2]:
from os import listdir
from os.path import isfile, join,splitext
t_dir = 'dataTrainComplete'
txt_fnames = [splitext(f)[0] for f in listdir(t_dir) if isfile(join(t_dir, f))] #Article filenames

In [3]:
crop_list = open('Keywords/02crop.list.csv', "r",encoding='UTF-8-sig')
crop = crop_list.read()
crop_line_sep = crop.splitlines()

pest_list = open('Keywords/02pest.list.csv', "r",encoding='UTF-8-sig')
pest = pest_list.read()
pest_line_sep = pest.splitlines()

chem_list = open('Keywords/02chem.list.csv', "r",encoding='UTF-8-sig')
chem = chem_list.read()
chem_line_sep = chem.splitlines()
#Keywords split by lines, keyword with more than one entry will be on the same line

In [4]:
from itertools import chain
import csv
vector_dict = {}
#Keyword lookup with keyword as key and vector index as value
for idx,line in enumerate(chain(crop_line_sep,pest_line_sep,chem_line_sep)):
    l = line.split(',')
    for word in l:
        #Some line will have more than one entry, which should have the same vector index
        if(word == ''):continue
        jieba.add_word(word)#Each keyword is added to jieba
        vector_dict[word] = idx

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Renewrr\AppData\Local\Temp\jieba.cache
Loading model cost 0.626 seconds.
Prefix dict has been built successfully.


In [5]:
vectors = {}
#Vector table with filename as key and vector as value
for fname in txt_fnames:
    txt = open(t_dir+'/'+fname+'.txt', "r",encoding="utf-8")
    content = txt.read()
    seg_list = jieba.cut(content, cut_all=True)
    vectors[fname] = [0]*764
    #Initialize an 0 vector for each file
    for seg in seg_list:
        if(seg in vector_dict):
            vectors[fname][vector_dict[seg]] += 1
            #Count keyword appearence - Bag of words

In [6]:
labels = {}
pair_list = []
#Deprecated
for fname1 in txt_fnames:
    for fname2 in txt_fnames:
        if(fname1 == fname2):continue
        else:
            labels[(fname1,fname2)] = 0
            pair_list.append((fname1,fname2))
corr_list = open('TrainLabel.csv', "r",encoding='UTF-8-sig')
corr = corr_list.read()
corr_line_sep = corr.splitlines()
#Training label
for line in corr_line_sep[1:]:
    l = line.split(',')
    labels[(l[0],l[1])] = 1
print(len(labels))
print(sum(labels.values()))
print(len(pair_list))

313040
1383
313040


In [7]:
pos_pair_list = []
pos_labels = set() #faster lookup to filter out positive pairs, not used elsewhere
#All the associated article pairs given by train label
for line in corr_line_sep[1:]:
    l = line.split(',')
    pos_pair_list.append((l[0],l[1]))
    pos_labels.add((l[0],l[1]))
neg_pair_list = []
#Other pairs with no association
for fname1 in txt_fnames:
    for fname2 in txt_fnames:
        if(fname1 == fname2):continue
        if((fname1,fname2) not in pos_labels): #Filter out positive pairs
            neg_pair_list.append((fname1,fname2))
print(len(pos_pair_list)+len(neg_pair_list))

313040


In [8]:
# Dataset
import random
#Custom dataset, currently generates a 50/50 split of positive and negative sample
#To change the split, change the second variable of random.sample and the __len__ function accordingly
class PartDataset(data.Dataset):
    
    def __init__(self, pos_pair_list, neg_pair_list, vectors, phase='train'):
        self.l = len(pos_pair_list)#Number of positive pairs
        neg_keys = random.sample(neg_pair_list,self.l)#Sample negative pairs, change the second variable to change the split
        self.labels = {}
        for key in neg_keys:self.labels[key] = 0
        for key in pos_pair_list:self.labels[key] = 1
        self.pair_list = pos_pair_list+neg_keys
        self.vectors = vectors
        self.phase = phase
        
    def __len__(self):
        return self.l*2
    
    def __getitem__(self, idx):
        Test,Ref = self.pair_list[idx]
        label = self.labels[(Test,Ref)]
        comb_vector = self.vectors[Test] + self.vectors[Ref]
        return torch.tensor(comb_vector), label

In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
train_dataset = PartDataset(pos_pair_list, neg_pair_list, vectors,phase='train')
test_dataset = PartDataset(pos_pair_list, neg_pair_list, vectors,phase='test')

In [11]:
#Deprecated
#print('Operation Check')
#print(labels[('3','415')])
#print(pair_list.index(('3','415')))
#print(train_dataset.__getitem__(133886)[1])

In [12]:
# DataLoader
train_dataloader = data.DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = data.DataLoader(test_dataset, batch_size=4, shuffle=False)

dataloader_dict = {'train': train_dataloader, 'test': test_dataloader}

# Operation Check
print('Operation Check')
batch_iterator = iter(train_dataloader)
inputs, label = next(batch_iterator)
print(label)

Operation Check
tensor([1, 1, 1, 0])


In [13]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(2*764, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 20),
            nn.Sigmoid(),
            nn.Linear(20, 2),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

In [14]:
net = NeuralNetwork()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=net.parameters(), lr=0.001, momentum=0.9)
#cross entropy loss and stochastic gradient descent
print(net)

NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=1528, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=20, bias=True)
    (5): Sigmoid()
    (6): Linear(in_features=20, out_features=2, bias=True)
  )
)


In [15]:
import time
def train_model(net, dataloader_dict, criterion, optimizer, num_epoch):
    
    since = time.time()
    best_model_wts = copy.deepcopy(net.state_dict())
    best_acc = 0.0
    net = net.to(device)
    
    for epoch in range(num_epoch):
        print('Epoch {}/{}'.format(epoch + 1, num_epoch))
        print('-'*20)
        
        for phase in ['train', 'test']:
            
            if phase == 'train':
                net.train()
            else:
                net.eval()
                
            epoch_loss = 0.0
            epoch_corrects = 0
            
            #tqdm for progress bar
            for inputs, labels in tqdm(dataloader_dict[phase]):
                inputs = inputs.type(torch.FloatTensor).to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        
                    epoch_loss += loss.item() * inputs.size(0)
                    epoch_corrects += torch.sum(preds == labels.data)
                    
            epoch_loss = epoch_loss / len(dataloader_dict[phase].dataset)
            epoch_acc = epoch_corrects.double() / len(dataloader_dict[phase].dataset)
            
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            
            # deep copy the model
            if phase == 'test' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(net.state_dict())
                print(time.time())
                torch.save(net.state_dict(), 'best_checkpoint_last.pth')
                
                
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    net.load_state_dict(best_model_wts)
    return net

In [16]:
num_epoch = 20
net = train_model(net, dataloader_dict, criterion, optimizer, num_epoch)

Epoch 1/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6994 Acc: 0.5043


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6908 Acc: 0.5000
1639181217.5173287
Epoch 2/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6876 Acc: 0.5571


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6866 Acc: 0.5000
Epoch 3/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.6670 Acc: 0.6475


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.6383 Acc: 0.6782
1639181223.9052901
Epoch 4/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.5424 Acc: 0.8033


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.4385 Acc: 0.8185
1639181227.2376535
Epoch 5/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.3591 Acc: 0.8608


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.2902 Acc: 0.8970
1639181230.5325344
Epoch 6/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.2474 Acc: 0.9158


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.2057 Acc: 0.9382
1639181233.1630597
Epoch 7/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.1734 Acc: 0.9443


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.1591 Acc: 0.9505
1639181235.6162887
Epoch 8/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.1348 Acc: 0.9584


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.1640 Acc: 0.9393
Epoch 9/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.1053 Acc: 0.9718


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.1165 Acc: 0.9617
1639181240.5757418
Epoch 10/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.0833 Acc: 0.9772


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.1296 Acc: 0.9613
Epoch 11/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.0729 Acc: 0.9794


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.1994 Acc: 0.9244
Epoch 12/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.0699 Acc: 0.9805


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.1004 Acc: 0.9696
1639181247.8779778
Epoch 13/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.0551 Acc: 0.9855


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.1240 Acc: 0.9606
Epoch 14/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.0540 Acc: 0.9845


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.0938 Acc: 0.9714
1639181252.784392
Epoch 15/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.0446 Acc: 0.9877


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.1000 Acc: 0.9685
Epoch 16/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.0455 Acc: 0.9870


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.0885 Acc: 0.9725
1639181257.7905793
Epoch 17/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.0376 Acc: 0.9892


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.1004 Acc: 0.9693
Epoch 18/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.0386 Acc: 0.9888


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.1065 Acc: 0.9678
Epoch 19/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.0280 Acc: 0.9935


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.0842 Acc: 0.9761
1639181265.895521
Epoch 20/20
--------------------


  0%|          | 0/692 [00:00<?, ?it/s]

train Loss: 0.0241 Acc: 0.9939


  0%|          | 0/692 [00:00<?, ?it/s]

test Loss: 0.1458 Acc: 0.9559
Training complete in 0m 56s
Best val Acc: 0.976139
