# Authorship Identification

## Section 1: Data Loading & Preprocessing

In [1]:
import math
from pathlib import Path

import numpy as np
import pandas as pd

### 1. Load Data from JSON Files

In [2]:
# Read the raw training and test sets into DataFrames.
# NOTE(review): these are absolute, machine-specific paths — consider a
# configurable data directory so the notebook runs elsewhere.
data_tran = pd.read_json(path_or_buf="/root/COMP90051/data1/train.json")
data_test = pd.read_json(path_or_buf="/root/COMP90051/data1/test.json")

### 2. Data Preprocessing

#### 2.1 Create the 'coauthors' column in training data

In [3]:
# Split every paper's author IDs into two lists:
#   'author'    -> IDs in [0, 100); falls back to [-1] when no such ID is present
#   'coauthors' -> IDs >= 100
data_tran['author'] = data_tran['authors'].apply(
    lambda ids: [a for a in ids if 0 <= a < 100] or [-1]
)
data_tran['coauthors'] = data_tran['authors'].apply(
    lambda ids: [a for a in ids if a >= 100]
)

#### 2.2 Create the string format of title & abstract

In [4]:
def list_to_string(lst):
    """Return the items of `lst` converted with `str` and joined by single spaces."""
    pieces = [str(item) for item in lst]
    return ' '.join(pieces)

In [5]:
# Training frame: space-joined string versions of the token lists.
data_tran['title_text'] = data_tran['title'].map(list_to_string)
data_tran['abstract_text'] = data_tran['abstract'].map(list_to_string)

# Test frame: same conversion.
data_test['title_text'] = data_test['title'].map(list_to_string)
data_test['abstract_text'] = data_test['abstract'].map(list_to_string)

#### 2.3 Create the 'text' column in training and testing data, which is the string format that merges the title and abstract

In [6]:
# 'text' is the title string, one space, then the abstract string.
# The joined strings already exist as 'title_text' / 'abstract_text', so reuse
# them instead of running list_to_string over every row a second time.
data_tran['text'] = data_tran['title_text'] + ' ' + data_tran['abstract_text']
data_test['text'] = data_test['title_text'] + ' ' + data_test['abstract_text']

#### 2.4 Fill NA in venue

In [7]:
def _fill_missing_venue(venue, placeholder=465):
    """Replace empty-string and NA entries of `venue` with `placeholder`."""
    # NOTE(review): 465 is presumably an otherwise unused venue id — confirm.
    return venue.replace('', placeholder).fillna(placeholder).infer_objects(copy=False)


data_tran['venue'] = _fill_missing_venue(data_tran['venue'])
data_test['venue'] = _fill_missing_venue(data_test['venue'])

  data_tran['venue'] = data_tran['venue'].replace('', 465).fillna(465).infer_objects(copy=False)
  data_test['venue'] = data_test['venue'].replace('', 465).fillna(465).infer_objects(copy=False)


#### 2.5 Reduce the training data, since many training samples do not have a main author

In [8]:
def reduce_data_in_author(data_tran, target_count=1000, random_state=42):
    """Down-sample the rows whose 'author' value is the placeholder ``[-1]``.

    Parameters
    ----------
    data_tran : pandas.DataFrame
        Frame with an 'author' column holding one list per row.
    target_count : int, default 1000
        Maximum number of ``[-1]`` rows to keep.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` when rows have to be dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame with every non-``[-1]`` row first (original order), followed
        by the retained ``[-1]`` rows, with a fresh 0..n-1 index. The input
        frame is not modified.
    """
    # Build the mask once and reuse it for both partitions; astype(bool) keeps
    # it a boolean mask even when the frame has no rows.
    is_negative = data_tran['author'].map(lambda authors: authors == [-1]).astype(bool)

    negative_rows = data_tran[is_negative]
    remaining_rows = data_tran[~is_negative]

    # Only sample when there are more placeholder rows than we want to keep.
    if len(negative_rows) > target_count:
        negative_rows = negative_rows.sample(n=target_count, random_state=random_state)

    # Rows with a real author come first, then the retained placeholder rows.
    final_data_tran = pd.concat([remaining_rows, negative_rows], axis=0)

    return final_data_tran.reset_index(drop=True)


In [9]:
# Keep at most 1000 rows whose 'author' is the [-1] placeholder.
data_tran = reduce_data_in_author(data_tran, target_count=1000)

### 3. Present the Preprocessed Data

#### 3.1 Present Training Data

In [10]:
# Preview the first five rows of the preprocessed training frame.
data_tran.head(n=5)

Unnamed: 0,authors,year,abstract,venue,title,author,coauthors,title_text,abstract_text,text
0,"[42, 13720, 36]",9,"[2455, 1858, 2335, 1543, 1800, 1860, 2000, 286...",20,"[41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1...","[42, 36]",[13720],41 1550 1563 1594 1544 1919 1644 37 1539 1715 ...,2455 1858 2335 1543 1800 1860 2000 2867 1546 1...,41 1550 1563 1594 1544 1919 1644 37 1539 1715 ...
1,"[1359, 15881, 45]",15,"[40, 1542, 1691, 2449, 1535, 3616, 2206, 1904,...",2,"[1731, 47, 11, 57, 4624, 1525, 1535, 47, 11, 3...",[45],"[1359, 15881]",1731 47 11 57 4624 1525 1535 47 11 3522 2223 1653,40 1542 1691 2449 1535 3616 2206 1904 1642 154...,1731 47 11 57 4624 1525 1535 47 11 3522 2223 1...
2,[97],10,"[46, 1624, 1547, 56, 1687, 1644, 6, 7, 3386, 1...",4,"[40, 1733, 1735, 1540, 1655, 46, 1624, 1547, 5...",[97],[],40 1733 1735 1540 1655 46 1624 1547 56 1687 16...,46 1624 1547 56 1687 1644 6 7 3386 1542 2654 1...,40 1733 1735 1540 1655 46 1624 1547 56 1687 16...
3,"[19617, 2]",10,"[37, 3709, 3836, 1586, 2151, 1727, 3021, 1860,...",9,"[38, 1592, 2088, 1543, 1574, 1727, 1597, 1813,...",[2],[19617],38 1592 2088 1543 1574 1727 1597 1813 1926 152...,37 3709 3836 1586 2151 1727 3021 1860 1527 181...,38 1592 2088 1543 1574 1727 1597 1813 1926 152...
4,"[9641, 44, 5623, 2]",18,"[1731, 2021, 1543, 11, 1546, 11, 1647, 2163, 1...",0,"[1560, 1694, 11, 1546, 11, 3066, 1728, 47, 160...","[44, 2]","[9641, 5623]",1560 1694 11 1546 11 3066 1728 47 1603 1553 11...,1731 2021 1543 11 1546 11 1647 2163 1542 1546 ...,1560 1694 11 1546 11 3066 1728 47 1603 1553 11...


#### 3.2 Present Testing Data

In [11]:
# Preview the first five rows of the preprocessed test frame.
data_test.head(n=5)

Unnamed: 0,identifier,coauthors,year,abstract,venue,title,title_text,abstract_text,text
0,0,"[16336, 1762, 4357, 12564]",19,"[37, 1662, 3207, 10, 33, 2037, 1738, 1642, 155...",223,"[3207, 24, 1798, 1738, 37, 2375, 1568, 11, 53,...",3207 24 1798 1738 37 2375 1568 11 53 1584 1903...,37 1662 3207 10 33 2037 1738 1642 1553 4917 11...,3207 24 1798 1738 37 2375 1568 11 53 1584 1903...
1,1,"[21189, 14088]",19,"[1731, 2130, 3674, 1705, 1656, 3077, 1546, 367...",223,"[40, 1560, 1536, 1544, 1609, 1705, 1658, 1543,...",40 1560 1536 1544 1609 1705 1658 1543 52 11 33...,1731 2130 3674 1705 1656 3077 1546 3675 2051 2...,40 1560 1536 1544 1609 1705 1658 1543 52 11 33...
2,2,"[3625, 1198, 19889, 794, 2749, 7801]",19,"[1551, 1728, 3920, 1542, 1535, 1656, 1543, 153...",7,"[47, 1574, 1729, 1641, 11, 37, 2533, 2015, 47,...",47 1574 1729 1641 11 37 2533 2015 47 1930 1549...,1551 1728 3920 1542 1535 1656 1543 1530 3053 2...,47 1574 1729 1641 11 37 2533 2015 47 1930 1549...
3,3,"[19810, 15173, 5876, 111]",19,"[51, 1535, 2115, 1543, 1811, 1700, 1657, 1684,...",21,"[1770, 53, 2054, 1549, 1529, 1723, 2796, 1547,...",1770 53 2054 1549 1529 1723 2796 1547 1543 47 ...,51 1535 2115 1543 1811 1700 1657 1684 1549 192...,1770 53 2054 1549 1529 1723 2796 1547 1543 47 ...
4,4,"[10932, 7668, 11907, 19601, 15307, 10492, 1049...",19,"[1775, 1746, 1842, 1525, 33, 2551, 1882, 1542,...",465,"[18, 1924, 23, 1544, 3927, 2686, 1543, 1535, 1...",18 1924 23 1544 3927 2686 1543 1535 1660 1548 ...,1775 1746 1842 1525 33 2551 1882 1542 33 2548 ...,18 1924 23 1544 3927 2686 1543 1535 1660 1548 ...


### 4. Save the Preprocessed Data

#### 4.1 Save the training and testing data as json

In [12]:
# Make sure the output directory exists; to_json does not create it and would
# otherwise fail on a fresh checkout.
Path('data2').mkdir(parents=True, exist_ok=True)

# One JSON record per line (JSON Lines) for each frame.
data_tran.to_json('data2/data_tran.json', orient='records', lines=True)
data_test.to_json('data2/data_test.json', orient='records', lines=True)

#### 4.2 Save the labels as a multi-hot array (.npy)

In [13]:
def process_label(data):
    """Build the multi-hot label matrix from the 'authors' column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must support ``len(data)`` (number of rows) and ``data["authors"]``,
        an iterable yielding one sequence of integer author IDs per row.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(data), 101)``. For each row, columns 0-99
        are set to 1 for every author ID in ``[0, 100)`` found in that row;
        the last column (index 100) is set to 1 when the row has no such ID.
    """
    n_author = 101          # 100 label columns + one trailing "no label" column
    threshold_author = 100  # IDs at or above this value are not labels

    author_list = np.zeros((len(data), n_author))

    for i, authors in enumerate(data["authors"]):
        ids = np.asarray(authors, dtype=np.int64).reshape(-1)

        # The lower bound matters: a negative ID would otherwise be used as a
        # negative column index and silently set the wrong label. This matches
        # the `0 <= a < 100` filter used to build the 'author' column.
        author_id = ids[(ids >= 0) & (ids < threshold_author)]

        if author_id.size == 0:
            author_list[i, -1] = 1
        else:
            author_list[i, author_id] = 1

    return author_list

# Build the label matrix for the (reduced) training frame ...
y_tran = process_label(data_tran)

# ... and store it in NumPy's binary .npy format.
np.save(file='data2/y_tran.npy', arr=y_tran)