# Authorship Identification

## Section 3.1: Textual Feature Engineering - Doc2Vec

In [1]:
import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# Read the train split first, then the test split. Both files are parsed
# with lines=True, i.e. one JSON record per line.
data_tran, data_test = (
    pd.read_json(json_path, orient='records', lines=True)
    for json_path in (
        '../data/data2/data_tran.json',
        '../data/data2/data_test.json',
    )
)

# Row counts of each split.
n_tran, n_test = len(data_tran), len(data_test)

### 1. Train Model

In [None]:
# One TaggedDocument per training title: whitespace-split tokens, tagged
# with the row's position in data_tran rendered as a string.
corpus_title = [
    TaggedDocument(title.split(), [str(doc_id)])
    for doc_id, title in enumerate(data_tran["title_text"])
]

# 100-dimensional title model. min_count=1 keeps every token in the
# vocabulary; dm=1 selects the distributed-memory variant (see gensim docs).
model_doc2vec_title = Doc2Vec(
    dm=1,
    vector_size=100,
    min_count=1,
    epochs=100,
    workers=8,
)

# The vocabulary must be built before training; train() then reuses the
# document count and epoch setting already stored on the model.
model_doc2vec_title.build_vocab(corpus_title)
model_doc2vec_title.train(
    corpus_title,
    total_examples=model_doc2vec_title.corpus_count,
    epochs=model_doc2vec_title.epochs,
)

# Persist the trained model so later cells can reload it without retraining.
model_doc2vec_title.save("../model/model_doc2vec/model_doc2vec_title.bin")

In [4]:
# One TaggedDocument per training abstract: whitespace-split tokens, tagged
# with the row's position in data_tran rendered as a string.
corpus_abstract = [
    TaggedDocument(abstract.split(), [str(doc_id)])
    for doc_id, abstract in enumerate(data_tran["abstract_text"])
]

# 200-dimensional abstract model (twice the width of the title model).
# min_count=1 keeps every token; dm=1 selects the distributed-memory
# variant (see gensim docs).
model_doc2vec_abstract = Doc2Vec(
    dm=1,
    vector_size=200,
    min_count=1,
    epochs=100,
    workers=8,
)

# The vocabulary must be built before training; train() then reuses the
# document count and epoch setting already stored on the model.
model_doc2vec_abstract.build_vocab(corpus_abstract)
model_doc2vec_abstract.train(
    corpus_abstract,
    total_examples=model_doc2vec_abstract.corpus_count,
    epochs=model_doc2vec_abstract.epochs,
)

# Persist the trained model so later cells can reload it without retraining.
model_doc2vec_abstract.save("../model/model_doc2vec/model_doc2vec_abstract.bin")

### 2. Load Model

In [5]:
# Reload the saved models from disk (title first, then abstract), so the
# training cells do not have to be re-run.
model_doc2vec_title, model_doc2vec_abstract = map(
    Doc2Vec.load,
    (
        '../model/model_doc2vec/model_doc2vec_title.bin',
        '../model/model_doc2vec/model_doc2vec_abstract.bin',
    ),
)

### 3. Document Embeddings

In [None]:
def _infer_doc_vectors(model, texts):
    """Infer one vector per text with ``model`` and collect them in an array.

    Each text is split on whitespace and the token list is passed to
    ``model.infer_vector``; the per-text results are gathered with
    ``np.array`` in the same order as ``texts``.
    """
    return np.array([model.infer_vector(text.split()) for text in texts])


# Title embeddings for the train and test splits (title model).
x_tran_title_doc2vec = _infer_doc_vectors(model_doc2vec_title, data_tran['title_text'])
x_test_title_doc2vec = _infer_doc_vectors(model_doc2vec_title, data_test['title_text'])

# Abstract embeddings for the train and test splits (abstract model).
x_tran_abstract_doc2vec = _infer_doc_vectors(model_doc2vec_abstract, data_tran['abstract_text'])
x_test_abstract_doc2vec = _infer_doc_vectors(model_doc2vec_abstract, data_test['abstract_text'])

In [None]:
# Write each embedding matrix to its .npy file: title train/test first,
# then abstract train/test.
for _npy_path, _doc_vectors in (
    ('../data/data2/x_tran_title_doc2vec.npy', x_tran_title_doc2vec),
    ('../data/data2/x_test_title_doc2vec.npy', x_test_title_doc2vec),
    ('../data/data2/x_tran_abstract_doc2vec.npy', x_tran_abstract_doc2vec),
    ('../data/data2/x_test_abstract_doc2vec.npy', x_test_abstract_doc2vec),
):
    np.save(_npy_path, _doc_vectors)