In [1]:
import os
import numpy as np
# function of read data
def _read_txt_(url):
    file = open(url, 'r', encoding='utf-8')
    seg_list = []
    lines = file.readlines()
    for i in range(len(lines)):
        seg_list.append(lines[i].rstrip("\n").split('\t'))
        pass
    file.close()
    return seg_list

# replace Capital word with <Capital>
# replace SUPPER word with <SUPPER>
# replace digit word, like 1997, with <digit>
# replace connect-word word with <connect>
def _clean_data_(data):
    list_idx_remove = []
    for idx, x in enumerate(data):
        # ignore label
        words = x[1].split(" ")
        words.insert(0, "<s>")
        words.append("<\s>")
        for i, word in enumerate(words):
            if word.isupper():
                words[i] = "<SUPPER>"
                continue
            if word.istitle():
                words[i] = "<Capital>"
                continue
            if word.isdigit():
                words[i] = "<digit>"
                continue
            if len(word) > 1 and word.find("-") != -1:
                words[i] = "<connect>"
        data[idx][1] = words 
    return data
# define unique word list
# This dict is for the test set to define unk
# simply speaking, the word not in this dict, and frequency < 2 should be defined as unk.
def _unique_list_(data):
    unique_word_frequency = {}
    for _, x in enumerate(data):
        words = x[1]
        for _, word in enumerate(words):
            if word not in unique_word_frequency:
                unique_word_frequency[word] = 1
            else:
                num = unique_word_frequency[word]
                unique_word_frequency[word] = num + 1
    return unique_word_frequency

# replace word whose frequency < 2 with <UNK>
def _UNK_(data, refer_dict):
    for idx, x in enumerate(data):
        words = x[1]
        for i, word in enumerate(words):
            if refer_dict[word] < 2:
                words[i] = "<UNK>"
        data[idx][1] = words
    return data
            
train_list = _read_txt_("titles-en-train.labeled")
train_list = _clean_data_(train_list)
unique_word_frequency = _unique_list_(train_list)
train_list = _UNK_(train_list, unique_word_frequency)

In [2]:
# initialize perceptron
# feature is 3-gram
N = 3
weight_dict = {}
# initialize weight_dict
file_name = "perceptron_3_gram_initial.npy"
if os.path.exists(file_name):
    weight_dict = np.load(file_name,allow_pickle=True).item()
else:
    for _, x in enumerate(train_list):
        words = x[1]
        for i in range(len(words)):
            if (i + N) >= len(words) + 1: continue
            windows = words[i:i+N]
            combination = ""
            for j in range(N):
                if j == 0:
                    combination += windows[j]
                else:
                    combination += "|" + windows[j]
            if combination not in weight_dict:
                weight_dict[combination] = 0
    np.save(file_name, weight_dict)

In [3]:
# train perceptron
# feature is 3-gram
N = 3
# weight_dict = {}
# train
file_name = "perceptron_3_gram_trained.npy"
if os.path.exists(file_name):
    weight_dict = np.load(file_name,allow_pickle=True).item()
else:
    for idx, x in enumerate(train_list):
        label = int(x[0])
        words = x[1]
        for i in range(len(words)):
            if (i + N) >= len(words) + 1: continue
            windows = words[i:i+N]
            combination = ""
            for j in range(N):
                if j == 0:
                    combination += windows[j]
                else:
                    combination += "|" + windows[j]
            num = weight_dict[combination]
            weight_dict[combination] = num + label
    np.save(file_name, weight_dict)
    
# log scaling (standardization)
import math
for key in weight_dict.keys():
    num = weight_dict[key]
    if num > 0:
        weight_dict[key] = math.log10(num)
    if num < 0:
        weight_dict[key] = -math.log10(abs(num))

In [4]:
# test
test_list = _read_txt_("titles-en-test.labeled")
test_list = _clean_data_(test_list)
for idx, x in enumerate(test_list):
    words = x[1]
    for i, word in enumerate(words):
        if word not in unique_word_frequency:
            words[i] = "<UNK>"
    test_list[idx][1] = words
    
total = len(test_list)
correct = 0
incorrect_idx_list = []
for idx, x in enumerate(test_list):
    label = int(x[0])
    words = x[1]
    scores = 0
    for i in range(len(words)):
        if (i + N) >= len(words) + 1: continue
        windows = words[i:i+N]
        combination = ""
        for j in range(N):
            if j == 0:
                combination += windows[j]
            else:
                combination += "|" + windows[j]
        if combination in weight_dict:
            scores += weight_dict[combination]
        else:
            scores += -1
    if label == 1:
        if scores > 0:
            correct += 1
        else:
            incorrect_idx_list.append(idx)
    else:
        if scores <= 0:
            correct += 1
        else:
            incorrect_idx_list.append(idx)
            
print("acc: " + str(round(correct/total,3)*100) + "%")

acc: 91.4%


In [5]:
incorrect_idx_list[0:10]

[1, 3, 7, 12, 13, 18, 29, 41, 42, 44]

In [6]:
test_list[1]

['1',
 ['<s>',
  '<Capital>',
  '(',
  'also',
  'called',
  '<Capital>',
  ')',
  'means',
  'a',
  'person',
  'who',
  'was',
  'the',
  'biological',
  'mother',
  'of',
  'an',
  '<Capital>',
  'and',
  'consort',
  'of',
  'the',
  'previous',
  '<Capital>',
  '.',
  '<\\s>']]