In [1]:
import os
import io
import numpy as np
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
import nltk
nltk.download('punkt')
import string
from google.colab import drive
drive.mount('/content/gdrive')
train_data_path = os.path.join(os.getcwd(), "gdrive", "My Drive", "QT", "train.txt")
test_data_path = os.path.join(os.getcwd(), "gdrive", "My Drive", "QT", "test.txt")
file = os.path.join(os.getcwd(), "gdrive", "My Drive", "QT", "Tencent_AILab_ChineseEmbedding.txt")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Mounted at /content/gdrive


In [60]:
import gensim.downloader as api 
glove = api.load('glove-twitter-200')



In [101]:
from gensim.models.keyedvectors import KeyedVectors
wv_chinese = KeyedVectors.load_word2vec_format(file, binary=False)

In [2]:
with io.open(train_data_path, encoding='utf8') as training_f:
        training = training_f.read().split("\n")[0:-1]
with io.open(test_data_path, encoding='utf8') as testing_f:
        testing = testing_f.read().split("\n")[0:-1]

In [125]:
lst = str.split(training[18])
print(len(wv_chinese[lst]),len(lst))

49 49


# 1.Preprocess Data

In [3]:
UNK = "<UNK>"
def make_vocab(sentences):
    """make_vocab creates a set of vocab words that the model knows

    :param data: The list of documents that is used to make the vocabulary
    :type data: List[str]
    :returns: A set of strings corresponding to the vocabulary
    :rtype: Set[str]
    """
    vocab = set()
    for sentence in sentences:
      for word in sentence:
        vocab.add(word)
    return vocab
def make_indices(vocab):
	"""make_indices creates a 1-1 mapping of word and indices for a vocab.

	:param vocab: The strings corresponding to the vocabulary in train data.
	:type vocab: Set[str]
	:returns: A tuple containing the vocab, word2index, and index2word.
		vocab is a set of strings in the vocabulary including <UNK>.
		word2index is a dictionary mapping tokens to its index (0, ..., V-1)
		index2word is a dictionary inverting the mapping of word2index
	:rtype: Tuple[
		Set[str],
		Dict[str, int],
		Dict[int, str],
	]
	"""
	vocab_list = sorted(vocab)
	vocab_list.append(UNK)
	word2index = {}
	index2word = {}
	for index, word in enumerate(vocab_list):
		word2index[word] = index 
		index2word[index] = word 
	vocab.add(UNK)
	return vocab, word2index, index2word 
def remove_punc(text):
    exclude = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in exclude)

In [4]:
def preprocess(training, testing):
  train_size, test_size = round(len(training)/6), round(len(testing)/6)
  train = []
  test = []
  max_src, max_ref, max_can = 0, 0, 0
  for i in range(train_size):
    idx = 6*i
    max_src = max(max_src, len(training[idx]))
    max_ref = max(max_ref, len(remove_punc(training[idx+1])))
    max_ref = max(max_ref, len(remove_punc(training[idx+1])))
    comb = [str.split(training[idx]),word_tokenize(remove_punc(training[idx+1])),word_tokenize(remove_punc(training[idx+2])),training[idx+3],training[idx+4]]
    train.append(comb)
  for i in range(test_size):
    idx = 6*i
    comb = [str.split(testing[idx]),word_tokenize(remove_punc(training[idx+1])),word_tokenize(remove_punc(training[idx+2])),testing[idx+3],testing[idx+4]]
    test.append(comb)
  return train, test

In [5]:
train, test  = preprocess(training,testing)
train_source = [i[0] for i in train]
train_refer = [i[1] for i in train]
train_candi = [i[2] for i in train]

In [None]:
#Create embedding matrix for both Chinese Characters and English Characters
e_vocab = make_vocab(train_refer + train_candi)
e_vocab, e_word2index, e_index2word = make_indices(e_vocab)
c_vocab = make_vocab(train_source)
c_vocab, c_word2index, c_index2word = make_indices(c_vocab)
e_matrix, c_matrix = [], []
e_len , c_len = len(e_vocab), len(c_vocab)
e_matrix = np.zeros((e_len, 200))
c_matrix = np.zeros((c_len,200))
for i, word in enumerate(e_vocab):
  if word in glove:
    e_matrix[i] = glove[word]
  else:
    e_matrix[i] = np.random.normal(scale=0.6, size=(200, ))
for i, word in enumerate(c_vocab):
  if word in wv_chinese:
    c_matrix[i] = wv_chinese[word]
  else:
    c_matrix[i] = np.random.normal(scale=0.6, size=(200, ))

In [9]:
path = F"/content/gdrive/My Drive/QT/"
e_vocab = make_vocab(train_refer + train_candi)
e_vocab, e_word2index, e_index2word = make_indices(e_vocab)
c_vocab = make_vocab(train_source)
c_vocab, c_word2index, c_index2word = make_indices(c_vocab)
e_matrix = np.load(path+"e_matrix.npy")
c_matrix = np.load(path+"c_matrix.npy")

In [10]:
def transform(data):
  x = []
  y = []
  for src, ref, can, score, label, in data:
    vecs_src=[]
    for word in src:
      if word in c_vocab:
        vecs_src.append(c_matrix[c_word2index[word]])
    vecs_src = np.array(vecs_src)
    vec_src = vecs_src.mean(axis=0)
    vecs_ref = []
    for word in ref:
      if word in e_vocab:
        vecs_ref.append(e_matrix[e_word2index[word]])
    vecs_ref = np.array(vecs_ref)
    vec_ref = vecs_ref.mean(axis=0)
    vecs_can = []
    for word in can:
      if word in e_vocab:
        vecs_can.append(e_matrix[e_word2index[word]])
    vecs_can = np.array(vecs_can)
    vec_can = vecs_can.mean(axis=0)
    comb = np.concatenate((vec_src,vec_ref), axis=None)
    comb = np.concatenate((comb,vec_can), axis=None)
    comb = np.concatenate((comb,score), axis=None)
    x.append(comb)
    y.append(1 if label=="H" else 0)
  return x,y


In [11]:
xTrain, yTrain =transform(train)
xTest, yTest = transform(test)

# 2.Model

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

parameters = { 
    'C': [0.01,0.1,1.0, 10,100,1000,10000],
    'gamma': [0.01,0.1,1,10,100, 'auto', 'scale']
}
model = GridSearchCV(SVC(kernel='rbf'), parameters, cv=5, n_jobs=-1).fit(xTrain, yTrain)

In [31]:
yPred = model.predict(xTest)
print(np.count_nonzero(yPred==yTest)/len(yTest))

0.6839080459770115


# 3.Evaluation

In [32]:
from sklearn.metrics import f1_score
print(f1_score(yPred,yTest))

0.7058823529411765
