#### Create a CSV for the additional training set

In [1]:
import pandas as pd
import random

In [2]:
# pull in data
df = pd.read_pickle("../../../data/prd/Paper/FR_meta_and_final_tokens_23DEC21.pkl")
df.reset_index(inplace = True, drop = True)

print(df.shape)

(1143869, 30)


In [3]:
# pull in training set addition IDs

ids = pd.read_csv("training_addition_ids.csv", dtype=str)

In [4]:
ids

Unnamed: 0,PROJECT_ID
0,90145
1,90761
2,91610
3,92672
4,94151
...,...
406,1290525
407,1292937
408,1293276
409,1293348


In [5]:
# filter df to the 411 ids from the Wiki method (top 0.1%, no "big data" term) 

train_add_df = df[ df['PROJECT_ID'].isin(list(ids["PROJECT_ID"])) ]

In [6]:
train_add_df.head()

Unnamed: 0,PROJECT_ID,ABSTRACT,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,...,BUDGET_END_DATE,CFDA_CODE,FY,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,FY_TOTAL_COST_SUM,NUM_RECORDS,final_tokens
373,90145,Scientists and supporters of science have long...,Address; American Indians; Biological Models; ...,"COLLABORATIVE RESEARCH: AOC: MONITORING, MODEL...",NSF,NSF,,0827333,10/1/2008,9/30/2011,...,,47.075,2008,193905.0,,3,3,1220851.0,3,scientist supporter science long work improve ...
672,90761,ENABLING MASSIVE SCIENTIFIC DATABASES THROUGH ...,Data Set; Databases; design; simulation,''ENABLING MASSIVE SCIENTIFIC DATABASES THROUG...,NASA,NASA,,NNX08AY72G,8/1/2008,1/21/2011,...,,43.001,2008,159359.0,,1,1,159359.0,1,enable massive scientific database automated s...
1681,91610,Intellectual Merit: This project implements P...,Archives; Climate; Communities; Data; Data Ana...,"SEDDB, THE ONLINE INFORMATION SYSTEM FOR SEDIM...",NSF,NSF,,0826197,9/1/2008,8/31/2011,...,,47.05,2008,399123.0,,1,1,399123.0,1,intellectual_merit project implement phase ii ...
2651,92672,This project seeks to develop incremental proc...,Address; base; Communities; Complex; computeri...,"CSR-DMSS, SM: INCREMENTAL WEB-SCALE DATA PROCE...",NSF,NSF,,0834784,9/1/2008,8/31/2009,...,,47.07,2008,100000.0,,1,1,100000.0,1,project seek develop incremental processing ab...
3987,94151,Commercial and government entities now spend a...,Algorithms; Archives; Benchmarking; Computer s...,III-COR-MEDIUM: DESIGN AND IMPLEMENTATION OF T...,NSF,NSF,,0803511,8/1/2008,8/31/2010,...,,47.07,2008,177401.0,,1,1,177401.0,1,commercial government entity spend billion per...


In [34]:
# randomly sample 200 project ids of train_add_df

random.seed(5)
rand_pos_ids = random.sample(list(train_add_df["PROJECT_ID"]), 200)

In [35]:
rand_pos_ids[:10]

['1096641',
 '466894',
 '1285271',
 '797163',
 '1277212',
 '1285232',
 '1154155',
 '997509',
 '102767',
 '903694']

In [26]:
# randomly sample 200 project ids of df-train_add_df (to keep balance in training set) 

not_train_add_df = df[ ~df['PROJECT_ID'].isin(list(ids["PROJECT_ID"])) ]
print(len(not_train_add_df))

1143458


In [36]:
rand_neg_ids = random.sample(list(not_train_add_df["PROJECT_ID"]), 200)

In [37]:
rand_neg_ids[:10]

['848475',
 '253605',
 '735042',
 '865820',
 '647200',
 '1138159',
 '610177',
 '678392',
 '891616',
 '461168']

In [38]:
# create union of random ids

new_ids = rand_pos_ids + rand_neg_ids

In [39]:
# double check
len(new_ids)

400

In [14]:
# check for duplicates already in training and testing set

# pull in previous training and testing IDs

prev_train_ids = pd.read_csv("../../../data/prd/Big_Data/projectid_training.csv", dtype=str)
prev_test_ids = pd.read_csv("../../../data/prd/Big_Data/projectid_testing.csv", dtype=str)

In [15]:
prev_ids = list(prev_train_ids["PROJECT_ID"]) + list(prev_test_ids["PROJECT_ID"])

In [16]:
len(prev_ids)

1200

In [40]:
set(new_ids).intersection(set(prev_ids))

# none of the new_ids were already labeled in the previous training set

set()

In [18]:
temp = df[df["PROJECT_ID"] == "1079688"]

In [23]:
temp["ABSTRACT"].iloc[0]

'PROJECT\xa0\xa0SUMMARY\xa0There\xa0 are\xa0 limited\xa0 formal\xa0 opportunities\xa0 for\xa0 biomedical\xa0 and\xa0 health\xa0 science\xa0 trainees\xa0 to\xa0 acquire\xa0 essential\xa0 data\xa0science\xa0 skills.\xa0 Along\xa0 with\xa0 the\xa0 growth\xa0 in\xa0 biomedical\xa0 and\xa0 health\xa0 data,\xa0 there\xa0 will\xa0 be\xa0 a\xa0 need\xa0 for\xa0 researchers\xa0 to\xa0develop\xa0 approaches\xa0 for\xa0 leveraging\xa0 them\xa0 to\xa0discover\xa0and\xa0validate\xa0hypotheses.\xa0The\xa0Training\xa0and\xa0Teaching\xa0for\xa0Transforming\xa0 Big\xa0 Data\xa0 to\xa0 Knowledge\xa0 (T3BD2K)\xa0 Initiative\xa0 will\xa0 address\xa0 these\xa0 needs\xa0 by:\xa0 (1)\xa0 Developing\xa0 a\xa0ten\xadweek\xa0 short\xa0 course\xa0 to\xa0 teach\xa0 pragmatic\xa0 data\xa0 science\xa0 skills;\xa0 and\xa0 (2)\xa0 Coordinating\xa0 data\xa0 science\xa0 education\xa0across\xa0 training\xa0 programs\xa0 in\xa0 Rhode\xa0 Island.\xa0 The\xa0 T3BD2K\xa0Initiative\xa0will\xa0directly\xa0impact\xa0all\xa0cur

In [41]:
# create CSV of new_id abstracts

new_train_df = df[ df["PROJECT_ID"].isin(new_ids) ]
len(new_train_df)

400

In [42]:
new_train_df.head()

Unnamed: 0,PROJECT_ID,ABSTRACT,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,...,BUDGET_END_DATE,CFDA_CODE,FY,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,FY_TOTAL_COST_SUM,NUM_RECORDS,final_tokens
373,90145,Scientists and supporters of science have long...,Address; American Indians; Biological Models; ...,"COLLABORATIVE RESEARCH: AOC: MONITORING, MODEL...",NSF,NSF,,0827333,10/1/2008,9/30/2011,...,,47.075,2008,193905.0,,3,3,1220851.0,3,scientist supporter science long work improve ...
672,90761,ENABLING MASSIVE SCIENTIFIC DATABASES THROUGH ...,Data Set; Databases; design; simulation,''ENABLING MASSIVE SCIENTIFIC DATABASES THROUG...,NASA,NASA,,NNX08AY72G,8/1/2008,1/21/2011,...,,43.001,2008,159359.0,,1,1,159359.0,1,enable massive scientific database automated s...
1387,91282,This project addresses an important unsolved p...,Address; base; Characteristics; design; Develo...,FINAL MASS LOSS ON THE AGB AND PLANETARY NEBUL...,NSF,NSF,,0806910,8/15/2008,7/31/2012,...,,47.049,2008,256305.0,,1,1,256305.0,1,project address important unsolved problem evo...
1681,91610,Intellectual Merit: This project implements P...,Archives; Climate; Communities; Data; Data Ana...,"SEDDB, THE ONLINE INFORMATION SYSTEM FOR SEDIM...",NSF,NSF,,0826197,9/1/2008,8/31/2011,...,,47.05,2008,399123.0,,1,1,399123.0,1,intellectual_merit project implement phase ii ...
1990,92217,ANALYSIS OF THE EMPLACEMENT OF THE MARE IMBRIU...,Investigation; Modeling; Moon,''ANALYSIS OF THE EMPLACEMENT OF THE MARE IMBR...,NASA,NASA,,NNX08AY98G,11/1/2008,4/25/2011,...,,43.001,2008,32499.0,,1,1,32499.0,1,analysis emplacement mare imbrium lava_flow mo...


In [43]:
csv_df = new_train_df[["PROJECT_ID", "ABSTRACT"]]

In [44]:
csv_df["big data (1) or not big data (0)"] = ""

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [45]:
csv_df["Labeler Name"] = ""

In [46]:
csv_df.head()

Unnamed: 0,PROJECT_ID,ABSTRACT,big data (1) or not big data (0),Labeler Name
373,90145,Scientists and supporters of science have long...,,
672,90761,ENABLING MASSIVE SCIENTIFIC DATABASES THROUGH ...,,
1387,91282,This project addresses an important unsolved p...,,
1681,91610,Intellectual Merit: This project implements P...,,
1990,92217,ANALYSIS OF THE EMPLACEMENT OF THE MARE IMBRIU...,,


In [47]:
csv_df.to_csv("new_big_data_abstract_labeling.csv", index=False)

In [48]:
# save new_ids for training

type(new_ids)

list

In [51]:
new_ids_df = pd.DataFrame(new_ids, columns=["PROJECT_ID"])

In [52]:
new_ids_df.head()

Unnamed: 0,PROJECT_ID
0,1096641
1,466894
2,1285271
3,797163
4,1277212


In [53]:
new_ids_df.to_csv("additional_ids.csv", index=False)