## Data Processor: Pre-Processing & Representation Learning

### Import Packages

In [3]:
import numpy as np
import pandas as pd
import json
import random
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

### Data Processor Class Object

In [68]:
class AuthorshipData:

    def __init__(self, train_file, test_file):

        # Read training data & testing data
        self.train_data = self.read_data(train_file)
        self.test_data = self.read_data(test_file)

        # Initially process data in one-hot encoding
        self.label = self.process_label(self.train_data)

        self.feature_train = self.process_feature(self.train_data)
        self.feature_test = self.process_feature(self.test_data)

        self.f_coauthor_tn, self.f_text_tn, self.f_year_tn, self.f_venue_tn = self.feature_train
        self.f_coauthor_ts, self.f_text_ts, self.f_year_ts, self.f_venue_ts = self.feature_test

    # Read data in initialization
    def read_data(self, file_name):
        with open(file_name, 'r', encoding='utf8') as data:
            json_data = json.load(data)
        output_data = pd.DataFrame(json_data)
        return output_data

    # Process data in one-hot in initialization
    def process_label(self, data):

        n_author = 101
        threshold_author = 100

        ath_list = []

        for authors in data["authors"]:

            authors = np.array(authors)
            ath_id = authors[authors < threshold_author]

            ath_vector = np.zeros(n_author)
            if len(ath_id) == 0:
                ath_vector[-1] = 1
            else:
                ath_vector[ath_id] = 1
            ath_list.append(ath_vector)

        ath_list = np.array(ath_list)

        return ath_list

    def process_feature(self, data):

        # Coauthor
        if "authors" in data.columns:
            author_column = "authors"
        else:
            author_column = "coauthors"

        n_coauthor = 21247

        cth_list = []

        for authors in data[author_column]:

            authors = np.array(authors)
            cth_id = authors

            cth_vector = np.zeros(n_coauthor)
            if len(cth_id) == 0:
                cth_vector[-1] = 1
            else:
                cth_vector[cth_id] = 1

            cth_list.append(cth_vector)

        cth_list = np.array(cth_list)

        # Title & Abstract
        n_text = 5000

        text_list = []

        for i in range(len(data)):
            title = data["title"][i]
            abstract = data["abstract"][i]
            text = title + abstract

            text_vector = np.zeros(n_text)
            text_vector[text] = 1
            text_list.append(text_vector)

        text_list = np.array(text_list)

        # Year
        n_year = 20
        year_list = []

        for year in data["year"]:
            year_vector = np.zeros(n_year)
            year_vector[year] = 1
            year_list.append(year_vector)

        year_list = np.array(year_list)

        # Vneue
        n_venue = 466
        ven_list = []

        for venue in data["venue"]:
            ven_vector = np.zeros(n_venue)
            if venue == "":
                ven_vector[-1] = 1
            else:
                ven_vector[venue] = 1
            ven_list.append(ven_vector)

        ven_list = np.array(ven_list)

        return cth_list, text_list, year_list, ven_list

    # Extract data
    def extract_data(self, extend_factor=1, center = False, scale = False):

        label = self.label

        feature_train = (self.f_coauthor_tn, self.f_text_tn, self.f_year_tn, self.f_venue_tn)
        feature_test = (self.f_coauthor_ts, self.f_text_ts, self.f_year_ts, self.f_venue_ts)

        feature_train = np.concatenate(feature_train,axis=1)
        feature_test = np.concatenate(feature_test,axis=1)

        n_index = np.where(label[:, -1] == 1)[0]
        p_index = np.where(label[:, -1] != 1)[0]
        x_n = feature_train[n_index]
        x_p = feature_train[p_index]
        y_n = label[n_index]
        y_p = label[p_index]
        x_pos_new = np.repeat(x_p, extend_factor, axis=0)
        y_pos_new = np.repeat(y_p, extend_factor, axis=0)
        label = np.concatenate((y_pos_new, y_n), axis=0)
        feature_train = np.concatenate((x_pos_new, x_n), axis=0)

        id_shuffle = list(range(label.shape[0]))
        random.shuffle(id_shuffle)
        label = label[id_shuffle]
        feature_train = feature_train[id_shuffle]

        if center and scale:
            scaler = StandardScaler(with_mean=True, with_std=True)
            feature_train = scaler.fit_transform(feature_train)
            feature_test = scaler.fit_transform(feature_train)
        elif center and not scale:
            scaler = StandardScaler(with_mean=True, with_std=False)
            feature_train = scaler.fit_transform(feature_train)
            feature_test = scaler.fit_transform(feature_train)
        elif not center and scale:
            scaler = StandardScaler(with_mean=False, with_std=True)
            feature_train = scaler.fit_transform(feature_train)
            feature_test = scaler.fit_transform(feature_train)

        return label, feature_train, feature_test

    # Representation learning methods of feature coauthor
    def transform_coauthor(self, method):
        if method == "rescale":
            self.transform_coauthor_rescale()
        elif method == "pca":
            self.transform_coauthor_rescale()
            self.transform_coauthor_pca()
        elif method == "pe":
            self.transform_coauthor_pe()
        elif method == "onehot":
            pass
        else:
            print("Invalid method name. Transform by default method: one-hot")

    def transform_coauthor_rescale(self):

        n_author = 101
        threshold_author = 100

        cooperation_list = []

        for author in self.train_data["authors"]:
            author = np.array(author)
            prolific_author = author[author < threshold_author]
            co_author = author[author >= threshold_author]
            if prolific_author.size != 0:
                for i in co_author:
                    if i not in cooperation_list:
                        cooperation_list.append(i)

        cooperation_list.sort()

        cth_list_train = []

        for authors in self.train_data["authors"]:
            authors = np.array(authors)
            cth_id = authors[authors >= threshold_author]
            cth_vector = np.zeros(6576)
            if len(cth_id) != 0:
                for j in cth_id:
                    if j in cooperation_list:
                        cth_vector[cooperation_list.index(j)] = 1
                    else:
                        cth_vector[-1] = 1
            else:
                cth_vector[-1] = 1
            cth_list_train.append(cth_vector)

        cth_list_train = np.array(cth_list_train)

        self.f_coauthor_tn = cth_list_train

        cth_list_test = []

        for authors in self.test_data["coauthors"]:
            authors = np.array(authors)
            cth_id = authors[authors >= 100]
            cth_vector = np.zeros(6576)
            if len(cth_id) != 0:
                for j in cth_id:
                    if j in cooperation_list:
                        cth_vector[cooperation_list.index(j)] = 1
                    else:
                        cth_vector[-1] = 1
            else:
                cth_vector[-1] = 1
            cth_list_test.append(cth_vector)

        cth_list_test = np.array(cth_list_test)

        self.f_coauthor_ts = cth_list_test

    def transform_coauthor_pe(self):
        seq_len = 21247
        d = 500
        n = 10000
        P = np.zeros((seq_len, d))
        for k in range(seq_len):
            for i in np.arange(int(d/2)):
                denominator = np.power(n, 2*i/d)
                P[k, 2*i] = np.sin(k/denominator)
                P[k, 2*i+1] = np.cos(k/denominator)
        c_vec = P

        train_data_coauthor = []
        for i in self.train_data["authors"]:
            for j in i:
                if j < 100:
                    i.remove(j)
            if len(i) == 0:
                train_data_coauthor.append([21146])
            else:
                train_data_coauthor.append(i)

        test_data_coauthor = list(self.test_data["coauthors"])

        coauthor_train = []
        for c in train_data_coauthor:
            ca = c_vec[c]
            coauthor_train.append(np.mean(ca, axis=0))
        coauthor_train = np.array(coauthor_train)

        self.f_coauthor_tn = coauthor_train

        coauthor_test = []
        for c in test_data_coauthor:
            ca = c_vec[c]
            coauthor_test.append(np.mean(ca, axis=0))
        coauthor_test = np.array(coauthor_test)

        self.f_coauthor_ts = coauthor_test

    def transform_coauthor_pca(self):
        """Project the co-author features onto leading principal components.

        A first PCA with half as many components as there are input columns
        is fitted on the training matrix to obtain explained-variance
        ratios. The component count becomes the smallest number of leading
        ratios whose sum exceeds 0.9. The last ratio is never scanned, and
        the half-width count is kept when the threshold is not reached. A
        second PCA with one more component than that count is fitted on the
        training matrix and applied to both splits.
        """
        train_matrix = self.f_coauthor_tn

        n_keep = round(train_matrix.shape[1] / 2)
        probe = PCA(n_components=n_keep)
        probe.fit(train_matrix)

        # Accumulate the ratios until the total passes 0.9.
        running = 0
        for count, ratio in enumerate(probe.explained_variance_ratio_[:-1], start=1):
            running += ratio
            if running > 0.9:
                n_keep = count
                break

        reducer = PCA(n_components=n_keep + 1)
        reducer.fit(train_matrix)

        self.f_coauthor_tn = reducer.transform(train_matrix)
        self.f_coauthor_ts = reducer.transform(self.f_coauthor_ts)

    # Representation learning methods of feature text
    def transform_text(self, method):
        if method == "w2v":
            self.transform_text_w2v()
        elif method == "d2v":
            self.transform_text_d2v()
        elif method == "pe":
            self.transform_text_pe()
        elif method == "pca":
            self.transform_text_pca()
        elif method == "onehot":
            pass
        else:
            print("Invalid method name. Transform by default method: one-hot")

    def transform_text_w2v(self):
        """Represent each record by the mean Word2Vec vector of its tokens.

        A 100-dimensional Word2Vec model is trained on the concatenated
        title and abstract tokens of the training records. Every training
        and testing record is then replaced by the mean of the vectors that
        model returns for its tokens.

        Side effects:
            Overwrites ``self.f_text_tn`` and ``self.f_text_ts``.
        """
        n_vector = 100

        def token_lists(frame):
            # Title tokens followed by abstract tokens, one list per record.
            return [
                title + abstract
                for title, abstract in zip(list(frame["title"]), list(frame["abstract"]))
            ]

        train_tokens = token_lists(self.train_data)

        # The vocabulary comes from the training split only.
        model = Word2Vec(sentences=train_tokens, vector_size=n_vector, min_count=1)
        words_vec = model.wv

        def embed(documents):
            return np.array([
                np.mean(np.array(words_vec[tokens]), axis=0) for tokens in documents
            ])

        self.f_text_tn = embed(train_tokens)
        self.f_text_ts = embed(token_lists(self.test_data))

    def transform_text_d2v(self):
        """Represent each record by a Doc2Vec vector inferred from its tokens.

        A 100-dimensional Doc2Vec model is trained on the concatenated title
        and abstract tokens of the training records, one tagged document per
        record. Every training and testing record is then replaced by the
        vector the model infers for its own token list.

        The previous implementation indexed the array of per-document
        vectors with token ids, which mixed unrelated documents (or failed
        when a token id exceeded the number of training records). It also
        trained on the raw tokens while inferring on their string form.

        Side effects:
            Overwrites ``self.f_text_tn`` and ``self.f_text_ts``.
        """
        n_vector = 100

        def token_lists(frame):
            # Use the string form of every token for both training and
            # inference, so inference looks words up under the same keys the
            # model was trained with.
            return [
                [str(token) for token in list(title) + list(abstract)]
                for title, abstract in zip(frame["title"], frame["abstract"])
            ]

        train_docs = token_lists(self.train_data)
        tagged = [TaggedDocument(words, [tag]) for tag, words in enumerate(train_docs)]

        model = Doc2Vec(tagged, vector_size=n_vector, window=2, min_count=1)

        def embed(documents):
            # One inferred vector per document.
            return np.array([model.infer_vector(words) for words in documents])

        self.f_text_tn = embed(train_docs)
        self.f_text_ts = embed(token_lists(self.test_data))

    def transform_text_pe(self):
        seq_len = 5000
        d = 100
        n = 10000
        P = np.zeros((seq_len, d))
        for k in range(seq_len):
            for i in np.arange(int(d/2)):
                denominator = np.power(n, 2*i/d)
                P[k, 2*i] = np.sin(k/denominator)
                P[k, 2*i+1] = np.cos(k/denominator)
        t_vec = P

        train_data_text = []
        for i in range(len(self.train_data)):
            train_data_text.append(list(self.train_data["title"][i]) + list(self.train_data["abstract"][i]))

        test_data_text = []
        for i in range(len(self.test_data)):
            test_data_text.append(list(self.test_data["title"][i]) + list(self.test_data["abstract"][i]))

        text_train = []
        for t in train_data_text:
            tx = t_vec[t]
            text_train.append(np.mean(tx, axis=0))
        text_train = np.array(text_train)


        self.f_text_tn = text_train

        text_test = []
        for t in test_data_text:
            tx = t_vec[t]
            text_test.append(np.mean(tx, axis=0))
        text_test = np.array(text_test)

        self.f_text_ts = text_test

    def transform_text_pca(self):
        """Project the text features onto leading principal components.

        A first PCA with half as many components as there are input columns
        is fitted on the training matrix to obtain explained-variance
        ratios. The component count becomes the smallest number of leading
        ratios whose sum exceeds 0.9. The last ratio is never scanned, and
        the half-width count is kept when the threshold is not reached. A
        second PCA with one more component than that count is fitted on the
        training matrix and applied to both splits.
        """
        train_matrix = self.f_text_tn

        n_keep = round(train_matrix.shape[1] / 2)
        probe = PCA(n_components=n_keep)
        probe.fit(train_matrix)

        # Accumulate the ratios until the total passes 0.9.
        running = 0
        for count, ratio in enumerate(probe.explained_variance_ratio_[:-1], start=1):
            running += ratio
            if running > 0.9:
                n_keep = count
                break

        reducer = PCA(n_components=n_keep + 1)
        reducer.fit(train_matrix)

        self.f_text_tn = reducer.transform(train_matrix)
        self.f_text_ts = reducer.transform(self.f_text_ts)

    # Representation learning methods of feature venue
    def transform_venue(self, method):
        if method == "pe":
            self.transform_venue_pe()
        elif method == "pca":
            self.transform_venue_pca()
        elif method == "onehot":
            pass
        else:
            print("Invalid method name. Transform by default method: one-hot")

    def transform_venue_pe(self):

        seq_len = 466
        d = 2
        n = 10000
        P = np.zeros((seq_len, d))
        for k in range(seq_len):
            for i in np.arange(int(d/2)):
                denominator = np.power(n, 2*i/d)
                P[k, 2*i] = np.sin(k/denominator)
                P[k, 2*i+1] = np.cos(k/denominator)
        v_vec = P

        venue_train = []
        for v in self.train_data["venue"]:
            if v == "":
                vn = v_vec[-1]
            else:
                vn = v_vec[v]
            venue_train.append(vn)
        venue_train = np.array(venue_train)

        self.f_venue_tn = venue_train

        venue_test = []
        for v in self.test_data["venue"]:
            if v == "":
                vn = v_vec[-1]
            else:
                vn = v_vec[v]
            venue_test.append(vn)
        venue_test = np.array(venue_test)

        self.f_venue_ts = venue_test

    def transform_venue_pca(self):
        """Project the venue features onto leading principal components.

        A first PCA with half as many components as there are input columns
        is fitted on the training matrix to obtain explained-variance
        ratios. The component count becomes the smallest number of leading
        ratios whose sum exceeds 0.9. The last ratio is never scanned, and
        the half-width count is kept when the threshold is not reached. A
        second PCA with one more component than that count is fitted on the
        training matrix and applied to both splits.
        """
        train_matrix = self.f_venue_tn

        n_keep = round(train_matrix.shape[1] / 2)
        probe = PCA(n_components=n_keep)
        probe.fit(train_matrix)

        # Accumulate the ratios until the total passes 0.9.
        running = 0
        for count, ratio in enumerate(probe.explained_variance_ratio_[:-1], start=1):
            running += ratio
            if running > 0.9:
                n_keep = count
                break

        reducer = PCA(n_components=n_keep + 1)
        reducer.fit(train_matrix)

        self.f_venue_tn = reducer.transform(self.f_venue_tn)
        self.f_venue_ts = reducer.transform(self.f_venue_ts)

    # Representation learning methods of feature year
    def transform_year(self, method):
        if method == "pe":
            self.transform_year_pe()
        elif method == "pca":
            self.transform_year_pca()
        elif method == "numeric":
            self.transform_year_numeric()
        elif method == "onehot":
            pass
        else:
            print("Invalid method name. Transform by default method: one-hot")

    def transform_year_numeric(self):
        self.f_year_tn = np.array(self.train_data["year"]).reshape(len(self.train_data["year"]),1)
        self.f_year_ts = np.array(self.test_data["year"]).reshape(len(self.test_data["year"]),1)

    def transform_year_pe(self):

        seq_len = 20
        d = 10
        n = 10000
        P = np.zeros((seq_len, d))
        for k in range(seq_len):
            for i in np.arange(int(d/2)):
                denominator = np.power(n, 2*i/d)
                P[k, 2*i] = np.sin(k/denominator)
                P[k, 2*i+1] = np.cos(k/denominator)
        y_vec = P

        year_train = []
        for y in self.train_data["year"]:
            yr = y_vec[y]
            year_train.append(yr)
        year_train = np.array(year_train)

        self.f_year_tn = year_train

        year_test = []
        for y in self.test_data["year"]:
            if y == "":
                yr = y_vec[-1]
            else:
                yr = y_vec[y]
            year_test.append(yr)
        year_test = np.array(year_test)

        self.f_year_ts = year_test

    def transform_year_pca(self):
        """Project the year features onto leading principal components.

        A first PCA with half as many components as there are input columns
        is fitted on the training matrix to obtain explained-variance
        ratios. The component count becomes the smallest number of leading
        ratios whose sum exceeds 0.9. The last ratio is never scanned, and
        the half-width count is kept when the threshold is not reached. A
        second PCA with one more component than that count is fitted on the
        training matrix and applied to both splits.
        """
        train_matrix = self.f_year_tn

        n_keep = round(train_matrix.shape[1] / 2)
        probe = PCA(n_components=n_keep)
        probe.fit(train_matrix)

        # Accumulate the ratios until the total passes 0.9.
        running = 0
        for count, ratio in enumerate(probe.explained_variance_ratio_[:-1], start=1):
            running += ratio
            if running > 0.9:
                n_keep = count
                break

        reducer = PCA(n_components=n_keep + 1)
        reducer.fit(train_matrix)

        self.f_year_tn = reducer.transform(self.f_year_tn)
        self.f_year_ts = reducer.transform(self.f_year_ts)

### Data Processor Object

In [69]:
# Read both JSON splits and build the default one-hot labels and features.
authorship_data = AuthorshipData(train_file="Data/train.json", test_file="Data/test.json")

### Representation Learning

#### Transform feature coauthor: Choose one from "onehot", "pe", "pca", or "rescale"

In [None]:
# "pca" first applies the rescaled co-author encoding, then reduces it with PCA.
authorship_data.transform_coauthor(method="pca")

#### Transform feature text: Choose one from "onehot", "w2v", "d2v", "pe", or "pca"

In [70]:
# "w2v" averages Word2Vec token vectors trained on the training titles and abstracts.
authorship_data.transform_text(method="w2v")

#### Transform feature year: Choose one from "onehot", "pe", "pca", or "numeric"

In [6]:
# "numeric" keeps the raw year value as a single feature column.
authorship_data.transform_year(method="numeric")

#### Transform feature venue: Choose one from "onehot", "pe" or "pca"

In [7]:
# "pe" replaces the venue one-hot with a 2-dimensional sinusoidal encoding.
authorship_data.transform_venue(method="pe")

### Extract & Store Data

In [8]:
# Repeat the positive training records twice, center and scale the features,
# then write the three resulting arrays to .npy files.
y_train, x_train, x_test = authorship_data.extract_data(
    extend_factor=2,
    center=True,
    scale=True,
)
for file_name, array in (
    ("y_train_new.npy", y_train),
    ("x_train_new.npy", x_train),
    ("x_test_new.npy", x_test),
):
    np.save(file_name, array)