In [1]:
import pandas as pd


#### Read in previous labeled set

In [2]:
# Load the previously labelled abstracts (the earlier training/testing set).
# Every column is read as text (dtype=str).
old_labels_path = "../../../data/prd/Digital_abstract_labelled/labelled_abstracts.csv"
df_old = pd.read_csv(old_labels_path, dtype=str)

df_old.shape

(1200, 8)

In [3]:
# Preview the first five rows of the old set as loaded.
df_old.head(n=5)

Unnamed: 0.1,Unnamed: 0,Duplicates,index,PROJECT_ID,ABSTRACT,Is it related to Big-Data,Expert name,label
0,0,0,979,950056,ï»¿ DESCRIPTION (provided by applicant): Adv...,1,Kristian,Big-data
1,1,0,816,1178172,Project Summary/AbstractThe establishment of t...,1,Kristian,Big-data
2,2,0,1096,163037,DESCRIPTION (provided by applicant): The long-...,0,Kristian,Non Big-data
3,3,0,725,880197,DESCRIPTION (provided by applicant): The most ...,0,Kristian,Non Big-data
4,4,0,92,887331,Transsphenoidal surgery (TSS) is the best trea...,0,Kristian,Non Big-data


In [4]:
# Remove the stray "Unnamed: 0" column that came in with the CSV.
# errors="ignore" keeps this cell re-runnable: because df_old is reassigned,
# a second execution would otherwise raise KeyError on the already-removed column.
df_old = df_old.drop(columns=["Unnamed: 0"], errors="ignore")

In [5]:
# Preview the old set again after removing the extra column.
df_old.head(n=5)

Unnamed: 0,Duplicates,index,PROJECT_ID,ABSTRACT,Is it related to Big-Data,Expert name,label
0,0,979,950056,ï»¿ DESCRIPTION (provided by applicant): Adv...,1,Kristian,Big-data
1,0,816,1178172,Project Summary/AbstractThe establishment of t...,1,Kristian,Big-data
2,0,1096,163037,DESCRIPTION (provided by applicant): The long-...,0,Kristian,Non Big-data
3,0,725,880197,DESCRIPTION (provided by applicant): The most ...,0,Kristian,Non Big-data
4,0,92,887331,Transsphenoidal surgery (TSS) is the best trea...,0,Kristian,Non Big-data


In [6]:
# Largest value in the 'index' column once cast from text to integer.
int(df_old["index"].astype('int').max())  # index: between 0-1199

1199

In [7]:
# Per-column count of missing values in the old set.
df_old.isna().sum()  # nothing missing except expert name in 100 rows - OK

Duplicates                     0
index                          0
PROJECT_ID                     0
ABSTRACT                       0
Is it related to Big-Data      0
Expert name                  100
label                          0
dtype: int64

In [8]:
# Tally the distinct values in the 0/1 flag column to spot bad entries.

df_old.loc[:, "Is it related to Big-Data"].value_counts()

0    711
1    487
o      2
Name: Is it related to Big-Data, dtype: int64

In [9]:
# Pull out the rows whose flag was typed as the letter 'o' instead of the digit 0,
# so their abstracts can be read and labelled by hand.

temp = df_old[df_old["Is it related to Big-Data"].eq('o')]

# Manual review result (inspected via temp["ABSTRACT"].iloc[...]):
# 1st abstract - 0, 2nd abstract - 0

In [10]:
# Overwrite the letter 'o' entries in the flag column with the string '0'.
typed_letter_o = df_old["Is it related to Big-Data"] == 'o'
df_old.loc[typed_letter_o, "Is it related to Big-Data"] = '0'

In [11]:
# Re-tally the flag column after the correction.
df_old.loc[:, "Is it related to Big-Data"].value_counts()

0    713
1    487
Name: Is it related to Big-Data, dtype: int64

#### Read in new labeled set and format exactly as old labeled set

In [12]:
# Load the newly labelled abstracts from the Excel workbook, reading every column as text.
new_labels_path = "new_big_data_abstract_labeling.xlsx"
df_new = pd.read_excel(new_labels_path, dtype=str)

print(df_new.shape)

(400, 4)


In [13]:
# Preview the first five rows of the new set as loaded.
df_new.head(n=5)

Unnamed: 0,PROJECT_ID,ABSTRACT,big data (1) or not big data (0),Labeler Name
0,90145,Scientists and supporters of science have long...,1,KJL
1,90761,ENABLING MASSIVE SCIENTIFIC DATABASES THROUGH ...,1,KJL
2,91282,This project addresses an important unsolved p...,0,KJL
3,91610,Intellectual Merit: This project implements P...,0,KJL
4,92217,ANALYSIS OF THE EMPLACEMENT OF THE MARE IMBRIU...,0,KJL


In [14]:
# Per-column count of missing values in the new set.
df_new.isna().sum()  # no abstracts were missed - good

PROJECT_ID                          0
ABSTRACT                            0
big data (1) or not big data (0)    0
Labeler Name                        0
dtype: int64

In [15]:
# Tally the distinct values in the new set's 0/1 flag column.
df_new.loc[:, "big data (1) or not big data (0)"].value_counts()  # all labels good

1    200
0    200
Name: big data (1) or not big data (0), dtype: int64

In [16]:
# Add the 'Duplicates' column that the old set has, with every new row set to '0'.
# Stored as the string '0' rather than the integer 0: the old set is read with dtype=str,
# so an integer here would leave the concatenated column holding mixed str/int values.
# The value written to CSV is the same either way.
df_new['Duplicates'] = '0'

In [17]:
# Number the new rows 0..len(df_new)-1. The length is taken from the frame instead of a
# hard-coded 400, and the values are stored as strings because the old set's 'index'
# column is text (read with dtype=str); the values written to CSV are unchanged.
# NOTE(review): these values overlap the old set's 0-1199 range — confirm that 'index'
# is not expected to be unique once the two sets are concatenated.
df_new['index'] = [str(row_number) for row_number in range(len(df_new))]

In [18]:
# List the new set's column names for comparison with the old set's.
df_new.columns

Index(['PROJECT_ID', 'ABSTRACT', 'big data (1) or not big data (0)',
       'Labeler Name', 'Duplicates', 'index'],
      dtype='object')

In [19]:
# List the old set's column names; this is the layout the new set is reshaped to.
df_old.columns

Index(['Duplicates', 'index', 'PROJECT_ID', 'ABSTRACT',
       'Is it related to Big-Data', 'Expert name', 'label'],
      dtype='object')

In [20]:
# Give the new set's flag and labeller columns the same headers the old set uses.
header_renames = {
    "big data (1) or not big data (0)": "Is it related to Big-Data",
    "Labeler Name": "Expert name",
}
df_new = df_new.rename(columns=header_renames)

In [21]:
# Display the new set after the renames and added columns.
df_new

Unnamed: 0,PROJECT_ID,ABSTRACT,Is it related to Big-Data,Expert name,Duplicates,index
0,90145,Scientists and supporters of science have long...,1,KJL,0,0
1,90761,ENABLING MASSIVE SCIENTIFIC DATABASES THROUGH ...,1,KJL,0,1
2,91282,This project addresses an important unsolved p...,0,KJL,0,2
3,91610,Intellectual Merit: This project implements P...,0,KJL,0,3
4,92217,ANALYSIS OF THE EMPLACEMENT OF THE MARE IMBRIU...,0,KJL,0,4
...,...,...,...,...,...,...
395,1290459,Extracting knowledge from data using statistic...,1,SSS,0,395
396,1290525,As the amount and variety of data available on...,1,SSS,0,396
397,1292937,"By 2030, nearly 146 million connected vehicles...",1,SSS,0,397
398,1293276,High-performance computing (HPC) has enabled s...,1,SSS,0,398


In [22]:
# Derive the text label from the flag column:
# rows flagged "1" get 'Big-data', every other row gets 'Non Big-data'.
df_new['label'] = 'Big-data'
not_flagged = df_new['Is it related to Big-Data'] != "1"
df_new.loc[not_flagged, 'label'] = 'Non Big-data'

In [23]:
# Preview the new set with the derived label column.
df_new.head(n=5)

Unnamed: 0,PROJECT_ID,ABSTRACT,Is it related to Big-Data,Expert name,Duplicates,index,label
0,90145,Scientists and supporters of science have long...,1,KJL,0,0,Big-data
1,90761,ENABLING MASSIVE SCIENTIFIC DATABASES THROUGH ...,1,KJL,0,1,Big-data
2,91282,This project addresses an important unsolved p...,0,KJL,0,2,Non Big-data
3,91610,Intellectual Merit: This project implements P...,0,KJL,0,3,Non Big-data
4,92217,ANALYSIS OF THE EMPLACEMENT OF THE MARE IMBRIU...,0,KJL,0,4,Non Big-data


In [24]:
# Put the new set's columns in the same order as the old set's.
old_set_column_order = [
    'Duplicates',
    'index',
    'PROJECT_ID',
    'ABSTRACT',
    'Is it related to Big-Data',
    'Expert name',
    'label',
]
df_new = df_new.loc[:, old_set_column_order]

In [25]:
# Preview the new set in its final column order.
df_new.head(n=5)

Unnamed: 0,Duplicates,index,PROJECT_ID,ABSTRACT,Is it related to Big-Data,Expert name,label
0,0,0,90145,Scientists and supporters of science have long...,1,KJL,Big-data
1,0,1,90761,ENABLING MASSIVE SCIENTIFIC DATABASES THROUGH ...,1,KJL,Big-data
2,0,2,91282,This project addresses an important unsolved p...,0,KJL,Non Big-data
3,0,3,91610,Intellectual Merit: This project implements P...,0,KJL,Non Big-data
4,0,4,92217,ANALYSIS OF THE EMPLACEMENT OF THE MARE IMBRIU...,0,KJL,Non Big-data


#### Concatenate old and new labeled sets

In [26]:
# Stack the old rows followed by the new rows and renumber the row labels from 0.
frames_to_stack = [df_old, df_new]
df = pd.concat(frames_to_stack, axis=0, ignore_index=True)

In [27]:
# Preview the first five rows of the combined set.
df.head(n=5)

Unnamed: 0,Duplicates,index,PROJECT_ID,ABSTRACT,Is it related to Big-Data,Expert name,label
0,0,979,950056,ï»¿ DESCRIPTION (provided by applicant): Adv...,1,Kristian,Big-data
1,0,816,1178172,Project Summary/AbstractThe establishment of t...,1,Kristian,Big-data
2,0,1096,163037,DESCRIPTION (provided by applicant): The long-...,0,Kristian,Non Big-data
3,0,725,880197,DESCRIPTION (provided by applicant): The most ...,0,Kristian,Non Big-data
4,0,92,887331,Transsphenoidal surgery (TSS) is the best trea...,0,Kristian,Non Big-data


In [28]:
# Per-column count of missing values in the combined set.
df.isna().sum()

Duplicates                     0
index                          0
PROJECT_ID                     0
ABSTRACT                       0
Is it related to Big-Data      0
Expert name                  100
label                          0
dtype: int64

In [29]:
# Print the (rows, columns) size of the combined set.
row_count, column_count = df.shape
print((row_count, column_count))

(1600, 7)


#### Write new labeled dataset

In [30]:
# Write the combined labelled set to CSV.
# index=False stops pandas from writing the row labels as an extra unnamed first column;
# without it the file comes back from read_csv with a spurious "Unnamed: 0" column, which
# is what had to be dropped from the previous labelled file earlier in this notebook.
# NOTE(review): any reader of this file that drops "Unnamed: 0" unconditionally will need
# that step removed or guarded.
df.to_csv('../../../data/prd/Digital_abstract_labelled/DEC-2022-labelled_abstracts.csv', index=False)