In [1]:
import pandas as pd
import collections
import string
from nltk.corpus import stopwords
import torch


In [2]:
train_data = pd.read_csv('hw1_train.csv')
vocabulary = collections.defaultdict(int)
for sen in train_data["UTTERANCE"]:
    words = sen.split()
    for word in words:
        vocabulary[word] += 1
vocabulary = {key : value for key,value in vocabulary.items() if not key.isdigit() and value > 0 and key not in string.punctuation}

word_to_index = {}
i = 0
for k in vocabulary.keys():
    word_to_index[k] = i
    i += 1

labels = train_data["CORE RELATIONS"]
splited_label_to_index = {}
i = 0
for label in labels:
    splited_label = label.split(" ")
    for sl in splited_label:
        if sl not in splited_label_to_index:
            splited_label_to_index[sl] = i
            i += 1


one_label = two_label = three_label = four_label = 0
for label in labels:
    if len(label.split(" ")) == 1:
        one_label += 1
    elif len(label.split(" ")) == 2:
        two_label += 1
    elif len(label.split(" ")) == 3:
        three_label += 1
    else:
        four_label += 1

print(one_label, two_label, three_label, four_label)

vocabulary_size = len(vocabulary)
label_set_size = len(splited_label_to_index)

index_to_label = {}
for k,v in splited_label_to_index.items():
    index_to_label[v] = k            
    
def prepare(sen,dic):
    res = [0]*len(dic)
    for word in sen.split():
        if word in dic:
            res[dic[word]] += 1
    res = torch.FloatTensor(res).view(1,-1).cuda()
    
    return res

def label_vectorize(sen):
    res = [0]*label_set_size
    for label in sen.split(" "):
        if label in splited_label_to_index:
            res[splited_label_to_index[label]] = 1
    res = torch.FloatTensor(res).view(1,-1).cuda()
    
    return res

2943 350 42 3


In [3]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, tagset_size):
        super(MLP, self).__init__()
        
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, tagset_size)
        
    def forward(self, sentence):
        output = F.relu(self.fc1(sentence))
        output = F.relu(self.fc2(output))
        output = self.fc3(output)
        
        return torch.sigmoid(output)

In [4]:
import torch

model = MLP(vocabulary_size, vocabulary_size*2, (vocabulary_size + label_set_size) // 2, label_set_size)
model.to(torch.device("cuda:0"))
loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

for epoch in range(20):
    total_loss = 0
    sentences = train_data["UTTERANCE"]
    label = train_data["CORE RELATIONS"]
    for i in range(len(train_data)):
        model.zero_grad()
        label_vec = label_vectorize(label[i]) 
        input_sen = prepare(sentences[i], word_to_index)
        if len(input_sen) == 0:
            print(sentences[i])
            continue
        prob = model(input_sen)
        loss = loss_function(prob, label_vec)
        total_loss += loss
        loss.backward()
        optimizer.step()
    
    print("epoch=", epoch, "loss=", total_loss/len(sentences))

epoch= 0 loss= tensor(0.0636, device='cuda:0', grad_fn=<DivBackward0>)
epoch= 1 loss= tensor(0.0304, device='cuda:0', grad_fn=<DivBackward0>)
epoch= 2 loss= tensor(0.0161, device='cuda:0', grad_fn=<DivBackward0>)
epoch= 3 loss= tensor(0.0106, device='cuda:0', grad_fn=<DivBackward0>)
epoch= 4 loss= tensor(0.0074, device='cuda:0', grad_fn=<DivBackward0>)
epoch= 5 loss= tensor(0.0063, device='cuda:0', grad_fn=<DivBackward0>)
epoch= 6 loss= tensor(0.0042, device='cuda:0', grad_fn=<DivBackward0>)
epoch= 7 loss= tensor(0.0034, device='cuda:0', grad_fn=<DivBackward0>)
epoch= 8 loss= tensor(0.0031, device='cuda:0', grad_fn=<DivBackward0>)
epoch= 9 loss= tensor(0.0026, device='cuda:0', grad_fn=<DivBackward0>)
epoch= 10 loss= tensor(0.0024, device='cuda:0', grad_fn=<DivBackward0>)
epoch= 11 loss= tensor(0.0023, device='cuda:0', grad_fn=<DivBackward0>)
epoch= 12 loss= tensor(0.0022, device='cuda:0', grad_fn=<DivBackward0>)
epoch= 13 loss= tensor(0.0018, device='cuda:0', grad_fn=<DivBackward0>)
ep

In [7]:
def dot(num, l):
    for i in range(len(l)):
        l[i] *= num
    
    return l

def detcet(output):
    one = two = three = four = 0
    for label in output:
        cnt = 1
        label = str(label)
        for ch in label:
            if ch == " ":
                cnt += 1
        if cnt == 1:
            one += 1
        elif cnt == 2:
            two += 1
        elif cnt == 3:
            three += 1
        else:
            four += 1
            
    return one, two, three, four

test_set = pd.read_csv('hw1_test.csv')
sentences = test_set['UTTERANCE']
output = []
with torch.no_grad():
    for i in range(len(sentences)):
        sentence = prepare(sentences[i], word_to_index)
        if len(sentence) == 0:
            output.append(['NO_REL'])
            continue
        trained_labels = model(sentence)
        trained_labels = list(trained_labels)[0]
        #print(trained_labels)
        res = []
        temp = []
        threshold = max(trained_labels)*0.8
        for i in range(len(trained_labels)):
            if trained_labels[i] >= threshold:
                temp.append((trained_labels[i], i))
        temp = sorted(temp, reverse=True)[:3]
        for x,y in temp:
            res.append(index_to_label[y])
        output.append(res)
print(output)
print(len(output))
print(detcet(output))


[['movie.media', 'movie.directed_by'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['NO_REL'], ['movie.starring.character', 'movie.starring.actor'], ['movie.starring.actor', 'movie.starring.character'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['NO_REL'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.directed_by', 'movie.synopsis'], ['NO_REL'], ['movie.starring.actor'], ['movie.starring.actor'], ['NO_REL'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.starring.actor'], ['movie.estimated_budget'], ['movie.estimated_budget'], ['movi


1084
(945, 129, 10, 0)


In [8]:
import csv
with open('Test.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=' ')
    for val in output:
        writer.writerow(val)

In [11]:
[torch.randn(1, 3) for _ in range(5)] 

[tensor([[-0.8138, -0.2756,  0.6990]]),
 tensor([[ 0.1624, -2.3374, -0.9050]]),
 tensor([[-0.9322,  0.8019,  1.3185]]),
 tensor([[1.0118, 0.6845, 0.3738]]),
 tensor([[-0.4365,  1.4080,  0.5860]])]

In [2]:
import torch

tensor([0.7311, 0.9526, 0.8808, 0.9820, 0.0067, 0.2689, 0.0000])