# Authorship Identification

## Section 2.5: Basic Feature Engineering - Text b

In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm
from joblib import Parallel, delayed

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the two data splits; lines=True means each file holds one JSON
# record per line.
data_tran = pd.read_json(
    '../data/data2/data_tran.json',
    orient='records',
    lines=True,
)
data_test = pd.read_json(
    '../data/data2/data_test.json',
    orient='records',
    lines=True,
)

# Number of rows in each split.
n_tran = len(data_tran)
n_test = len(data_test)

In [None]:
# Fit a unigram + bigram TF-IDF vocabulary on the `text` column of data_tran.
# `fit` returns the fitted vectorizer itself, so the call can be chained.
tfidf_encoder = TfidfVectorizer(ngram_range=(1, 2)).fit(data_tran['text'])

def get_top_tfidf_words(text, top_n):
    """Return vocabulary ids of the highest-weighted TF-IDF terms in ``text``.

    The text is encoded with the module-level ``tfidf_encoder`` and only the
    terms that actually occur in it (non-zero TF-IDF weight) are ranked.

    Args:
        text: A single document, in the form ``tfidf_encoder`` was fitted on.
        top_n: Maximum number of term ids to return.

    Returns:
        A 1-D integer array of vocabulary ids ordered by decreasing TF-IDF
        weight. It holds at most ``top_n`` ids, and fewer when the text
        contains fewer distinct in-vocabulary terms. It is empty when
        ``top_n <= 0``. The relative order of equally weighted terms is
        unspecified.
    """
    # Guard explicitly: slicing with ``[-0:]`` would select *every* term.
    if top_n <= 0:
        return np.empty(0, dtype=np.intp)

    # Keep the row sparse: densifying a full unigram+bigram vocabulary for
    # every document is wasteful, and ranking the dense row lets arbitrary
    # zero-weight ids leak into the result for short texts.
    row = tfidf_encoder.transform([text]).tocsr()
    weights = row.data
    term_ids = row.indices

    # Drop any explicitly stored zeros so only terms present in the text rank.
    present = weights > 0
    weights = weights[present]
    term_ids = term_ids[present]

    ranking = np.argsort(weights)[::-1][:top_n]
    return term_ids[ranking].astype(np.intp)

In [4]:
def get_word_author_dict(data, top_n, num_authors=21246):
    """Count, per author, how often each term is a top TF-IDF term of their texts.

    For every row of ``data`` the ``top_n`` highest-weighted TF-IDF term ids
    of ``row['text']`` are computed with ``get_top_tfidf_words``; each of
    those ids is then counted once for every non-negative id listed in
    ``row['authors']``.

    Args:
        data: DataFrame with a ``text`` column and an ``authors`` column
            holding an iterable of integer author ids per row.
        top_n: Number of top TF-IDF terms taken from each text.
        num_authors: Number of author ids (``0 .. num_authors - 1``) that get
            a (possibly empty) entry up front. Defaults to the value that was
            previously hard-coded.

    Returns:
        ``{author_id: {term_id: count}}``. Every id below ``num_authors`` is
        present; ids at or above it are added on first sight instead of
        raising ``KeyError``.
    """
    word_author_dict = {author_id: {} for author_id in range(num_authors)}

    # Iterate over the two needed columns directly; this avoids building a
    # Series object for every row as ``iterrows`` does.
    rows = zip(data['text'], data['authors'])
    for text, authors in tqdm(rows, total=len(data)):
        top_words = get_top_tfidf_words(text, top_n)

        for author in authors:
            # Negative ids are skipped (presumably a placeholder for
            # "no/unknown author" -- confirm against the data description).
            if author < 0:
                continue

            word_counts = word_author_dict.setdefault(author, {})
            for word_id in top_words:
                word_counts[word_id] = word_counts.get(word_id, 0) + 1

    return word_author_dict

# Per-author counts of the terms that rank among the 10 strongest TF-IDF
# terms of each text in data_tran.
word_author_dict = get_word_author_dict(data=data_tran, top_n=10)

  1%|          | 82/8460 [00:01<02:34, 54.26it/s]

100%|██████████| 8460/8460 [02:17<00:00, 61.34it/s]


In [5]:
def get_text_vetcor(coauthor_list, word_author_dict, num_main_authors=100):
    """Score each main author by the terms they share with the given coauthors.

    For every coauthor that has an entry in ``word_author_dict`` and every
    main author id in ``range(num_main_authors)`` that has an entry, the main
    author's counts of all terms that also appear in the coauthor's entry are
    added to that main author's score. A coauthor listed several times
    contributes several times.

    Args:
        coauthor_list: Iterable of author ids.
        word_author_dict: ``{author_id: {term_id: count}}`` mapping.
        num_main_authors: Number of main author ids to score; this is also
            the width of the returned vector. Defaults to 100, the value
            that was previously hard-coded.

    Returns:
        A float array of shape ``(1, num_main_authors)``; column ``i`` holds
        the accumulated score of main author ``i``.
    """
    result_array = np.zeros(num_main_authors)

    # The main authors' term tables do not depend on the coauthor, so look
    # them up once instead of once per coauthor.
    main_tables = [
        (main_author, word_author_dict[main_author])
        for main_author in range(num_main_authors)
        if main_author in word_author_dict
    ]

    for coauthor in coauthor_list:
        if coauthor not in word_author_dict:
            continue

        coauthor_words = word_author_dict[coauthor]
        for main_author, main_words in main_tables:
            result_array[main_author] += sum(
                main_words[word] for word in coauthor_words if word in main_words
            )

    return result_array.reshape(1, num_main_authors)

In [6]:
def get_text_matrix(data, word_author_dict):
    """Build the text feature matrix for all rows of ``data``.

    ``get_text_vetcor`` is applied to the ``coauthors`` value of every row
    through joblib (``n_jobs=-1``), with a tqdm progress bar over the rows.
    The per-row results are joined along axis 0 in row order.

    Args:
        data: DataFrame with a ``coauthors`` column.
        word_author_dict: Mapping passed unchanged to ``get_text_vetcor``.

    Returns:
        The array obtained by concatenating the per-row vectors along axis 0.
    """
    build_vector = delayed(get_text_vetcor)
    coauthor_lists = tqdm(data['coauthors'], total=len(data))
    tasks = (
        build_vector(coauthors, word_author_dict)
        for coauthors in coauthor_lists
    )
    vectors_list = Parallel(n_jobs=-1)(tasks)
    return np.concatenate(vectors_list, axis=0)

In [7]:
# Build the text feature matrix of each split from its coauthor lists.
x_tran_text = get_text_matrix(data=data_tran, word_author_dict=word_author_dict)
x_test_text = get_text_matrix(data=data_test, word_author_dict=word_author_dict)

# Save both matrices as .npy files.
np.save(file='../data/data2/x_tran_text_b.npy', arr=x_tran_text)
np.save(file='../data/data2/x_test_text_b.npy', arr=x_test_text)

100%|██████████| 8460/8460 [1:28:47<00:00,  1.59it/s]
100%|██████████| 800/800 [08:03<00:00,  1.66it/s]
