# Authorship Identification

## Section 3.2: Textual Feature Engineering - Word2Vec

In [1]:
import json

import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

### 1. Train Model

In [2]:
# Read the training and test splits. `lines=True` tells pandas that each file
# holds one JSON record per line rather than a single JSON array.
data_tran = pd.read_json(
    "../data/data2/data_tran.json",
    orient="records",
    lines=True,
)
data_test = pd.read_json(
    "../data/data2/data_test.json",
    orient="records",
    lines=True,
)

# Number of rows in each split.
n_tran = len(data_tran)
n_test = len(data_test)

In [3]:
# Tokenize every training title on whitespace; each title is one "sentence".
corpus_title = [title.split() for title in data_tran["title_text"]]

# Train a skip-gram (sg=1) Word2Vec model with 100-dimensional vectors.
# min_count=1 means no token is dropped for being rare.
model_word2vec_title = Word2Vec(
    sentences=corpus_title,
    vector_size=100,
    window=5,
    min_count=1,
    workers=6,
    sg=1,
    epochs=100,
)

# Persist the trained model to disk.
model_word2vec_title.save("../model/model_word2vec/model_word2vec_title.bin")

In [4]:
# Tokenize every training abstract on whitespace; each abstract is one "sentence".
corpus_abstrct = [abstract.split() for abstract in data_tran["abstract_text"]]

# Train a skip-gram (sg=1) Word2Vec model with 200-dimensional vectors.
# min_count=1 means no token is dropped for being rare.
model_word2vec_abstrct = Word2Vec(
    sentences=corpus_abstrct,
    vector_size=200,
    window=5,
    min_count=1,
    workers=6,
    sg=1,
    epochs=100,
)

# Persist the trained model to disk.
model_word2vec_abstrct.save("../model/model_word2vec/model_word2vec_abstract.bin")

### 2. Load Model

In [5]:
# Reload both saved Word2Vec models from disk.
model_word2vec_title = Word2Vec.load(
    "../model/model_word2vec/model_word2vec_title.bin"
)
model_word2vec_abstrct = Word2Vec.load(
    "../model/model_word2vec/model_word2vec_abstract.bin"
)

# Build a plain {token: vector} mapping for each model, covering every key
# listed in the model's `wv.index_to_key`.
x_tran_title_word_vectors = dict(
    (token, model_word2vec_title.wv[token])
    for token in model_word2vec_title.wv.index_to_key
)
x_tran_abstract_word_vectors = dict(
    (token, model_word2vec_abstrct.wv[token])
    for token in model_word2vec_abstrct.wv.index_to_key
)

### 3. Word Embeddings

In [6]:
# `json` is imported in the notebook's top import cell rather than here, so
# all dependencies are declared in one place.

# The `json` module cannot serialize the vector objects stored in the
# mappings, so convert each one to a plain Python list with `.tolist()` first.
x_tran_title_word_vectors_json = {
    word: vector.tolist()
    for word, vector in x_tran_title_word_vectors.items()
}
x_tran_abstract_word_vectors_json = {
    word: vector.tolist()
    for word, vector in x_tran_abstract_word_vectors.items()
}

# Write each {word: [floats...]} mapping to its own JSON file.
with open("../data/data2/x_tran_title_word_vectors.json", "w") as f:
    json.dump(x_tran_title_word_vectors_json, f)

with open("../data/data2/x_tran_abstract_word_vectors.json", "w") as f:
    json.dump(x_tran_abstract_word_vectors_json, f)