# Lemmatizing working abstracts

The text has already been cleaned.  

**Note**: There is an important change: we switch to spaCy's lemmatizer (instead of using stanza's).  The spaCy lemmatizer is much faster, and in my tests the lemmas from the two packages were almost exactly the same.  The spaCy lemmatizer requires the POS tagger.  We tokenize, POS-tag, lemmatize, and remove stop words in the same function.  

In [1]:
import pandas as pd
import pickle
import numpy as np
import time
import re

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import sys

pd.set_option('display.max_columns', 50)

In [2]:
# only run one time to download spacy model 

#!{sys.executable} -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


### Read in Dataset 

In [3]:
# Load the saved DataFrame; df['working_abstract'] contains the cleaned text.
# The previous index is kept as a regular column named 'original index'.

df = (
    pd.read_pickle("../../../data/prd/Paper/FR_clean_22DEC21.pkl")
    .reset_index()
    .rename(columns={'index': 'original index'})
)

In [4]:
# Preview the first five rows to confirm the load and the renamed index column.
df.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,PROJECT_TERMS,PROJECT_TITLE,DEPARTMENT,AGENCY,IC_CENTER,PROJECT_NUMBER,PROJECT_START_DATE,PROJECT_END_DATE,CONTACT_PI_PROJECT_LEADER,OTHER_PIS,CONGRESSIONAL_DISTRICT,DUNS_NUMBER,ORGANIZATION_NAME,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS,ORG_COUNT,PI_COUNT,FY_TOTAL_COST_SUM,NUM_RECORDS,working_abstract,nchar,Start_Char
0,0,89996,"This is a project to explore Game-based, Metap...",Achievement; analog; base; Cognitive Science; ...,RUI: CYGAMES: CYBER-ENABLED TEACHING AND LEARN...,NSF,NSF,,814512,9/15/2008,8/31/2012,"REESE, DEBBIE D","CARTER, BEVERLY; WOOD, CHARLES; HITT, BEN",1,68719400,WHEELING JESUIT UNIVERSITY,WHEELING,WV,26003-6243,UNITED STATES,,,47.076,2008,1999467.0,,1,1,1999467.0,1,"This is a project to explore Game-based, Metap...",2057,T
1,1,89997,Institution: Franklin Institute Science Museum...,Active Learning; Child; Computer software; des...,ARIEL - AUGMENTED REALITY FOR INTERPRETIVE AND...,NSF,NSF,,741659,9/15/2008,8/31/2012,"SNYDER, STEVEN","ELINICH, KAREN; YOON, SUSAN",2,1741859,FRANKLIN INSTITUTE,PHILADELPHIA,PA,19103-1115,UNITED STATES,,,47.076,2008,1799699.0,,1,1,1799699.0,1,Institution: Franklin Institute Science Museum...,2053,I
2,2,89998,Through programs (including small group conver...,Address; Age; Birth; Brain; Caregivers; Child;...,BRIGHTER FUTURES: PUBLIC DELIBERATION ABOUT TH...,NSF,NSF,,813522,9/15/2008,8/31/2011,"FINK, LAURIE KLEINBAUM","CADIGAN, KAREN; ELLENBOGEN, KIRSTEN",4,61451670,SCIENCE MUSEUM OF MINNESOTA,SAINT PAUL,MN,55102-1202,UNITED STATES,,,47.076,2008,1505858.0,,1,1,1505858.0,1,Through programs (including small group conver...,1154,T
3,3,89999,In partnership with the American Chemical Soci...,Advanced Development; American; Chemicals; Che...,FOSTERING US-INTERNATIONAL COLLABORATIVE PARTN...,NSF,NSF,,838627,8/1/2008,12/31/2010,"JOST, JOHN W","MILLER, BRADLEY; BOWMAN, KATHERINE",4,9059242,INTERNATIONAL UNION OF PURE AND APPLIED CHEMISTRY,DURHAM,NC,27709-3757,UNITED STATES,,,47.049,2008,51000.0,,1,1,51000.0,1,In partnership with the American Chemical Soci...,875,I
4,5,90001,The Center for Molecular Interfacing (CMI) wil...,Address; Architecture; Carbon Nanotubes; Catal...,CCI PHASE I: CENTER FOR MOLECULAR INTERFACING,NSF,NSF,,847926,10/1/2008,9/30/2011,"ABRUNA, HECTOR D",,22,872612445,CORNELL UNIVERSITY ITHACA,ITHACA,NY,14850-2820,UNITED STATES,,,47.049,2008,1519821.0,,1,1,1519821.0,1,The Center for Molecular Interfacing (CMI) wil...,2309,T


In [5]:
# (rows, columns) of the loaded dataset.
df.shape

(1143904, 33)

### Tokenize, find POS, Lemmatize, and remove stop words   

**The actual code was executed in `02-lemmatize.py` so that it could be ported to a SLURM script.**

In [6]:
# Load the en_core_web_sm pipeline with the parser and NER components disabled;
# the cells below only read each token's POS tag, lemma and stop-word flag.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [7]:
# Time lemmatization on the first 1000 abstracts so the cost of processing the
# whole dataset can be extrapolated.
sentences = df['working_abstract'][:1000]

# POS tags whose tokens are kept.  Built once as a set so it is not re-created
# (and linearly scanned) for every single token.
KEEP_POS = frozenset({'NOUN', 'VERB', 'ADJ', 'ADV', 'PROPN', 'INTJ', 'NUM', 'X'})

# perf_counter() is monotonic, so the measurement cannot be skewed by
# system clock adjustments the way time.time() can.
t1 = time.perf_counter()

# One list of lemmas per abstract: keep a token's lemma only when its POS tag
# is in KEEP_POS and spaCy does not flag it as a stop word.
lemmas = [
    [token.lemma_ for token in doc
     if token.pos_ in KEEP_POS and not token.is_stop]
    for doc in nlp.pipe(sentences)
]

t2 = time.perf_counter()
print(t2 - t1)

13.458257913589478


In [8]:
# Extrapolate the sample timing (~13 seconds per 1000 abstracts) to every row
# of df, then convert seconds to hours (/60/60).
(len(df)/1000)*13/60/60


4.130764444444444

In [10]:
# Inspect the lemmas kept for the abstract at position 4 of the sample.
print(lemmas[4])

['Center', 'Molecular', 'Interfacing', 'CMI', 'enable', 'integration', 'control', 'molecular', 'constituent', 'macroscopic', 'system', 'graphene', 'sheet', 'carbon', 'nanotube', 'CNTs', 'achieve', 'molecularly', 'define', 'reproducible', 'robust', 'connection', 'interdisciplinary', 'inter', '-', 'institutional', 'team', 'researcher', '1', 'study', 'electrical', 'opto', 'electronic', 'property', 'graphene', 'molecule', 'graphene', 'cnt', 'molecule', 'cnt', 'device', 'mechanical', 'adjustability', 'electrolytic', 'gating', 'optical', 'access', '2', 'use', 'AFM', 'STM', 'characterize', 'molecule', 'graphene', 'interface', '3', 'use', 'advanced', 'laser', 'microscopy', 'identify', 'excite', 'individual', 'electrically', 'contact', 'molecule', 'work', 'enable', 'development', 'novel', 'experimental', 'platform', 'technique', 'synthesis', 'molecular', 'architecture', 'deliberate', 'design', 'function', 'development', 'theoretical', 'framework', 'fundamental', 'chemical', 'process', 'self', '

In [11]:
# Persist the lemma lists.
# Opening with 'wb' truncates an existing file immediately, even if nothing is
# dumped afterwards, leaving an empty pickle that cannot be loaded.  'xb'
# (exclusive create) writes only when the file does not exist yet, so a lemma
# file that is already on disk (presumably the one produced by the batch
# script -- verify) is never overwritten from this notebook.
try:
    with open("../../../data/prd/Paper/FR_lemmas_22DEC21.pkl", 'xb') as file:
        pickle.dump(lemmas, file)
except FileExistsError:
    print("FR_lemmas_22DEC21.pkl already exists; leaving it untouched.")

In [2]:
# check if it worked

# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files written by this project's own pipeline.
# The context manager closes the file even if unpickling raises.
with open("../../../data/prd/Paper/FR_lemmas_22DEC21.pkl", 'rb') as file:
    lem = pickle.load(file)

In [15]:
# Round-trip check: the list loaded from disk should equal the in-memory lemmas.
lemmas == lem

True

In [4]:
# Show the lemma lists of the first two abstracts.
lem[0:2]

[['project',
  'explore',
  'game',
  'base',
  'Metaphor',
  'Enhanced',
  'GaME',
  'design',
  'game',
  'method',
  'apply',
  'cognitive',
  'science',
  'metaphor',
  'theory',
  'design',
  'computer',
  'mediate',
  'learning',
  'environment',
  'process',
  'use',
  'structure',
  'mapping',
  'theory',
  'design',
  'videogame',
  'world',
  'align',
  'science',
  'concept',
  'rigorous',
  'specification',
  'procedure',
  'map',
  'relational',
  'structure',
  'targeted',
  'concept',
  'game',
  'world',
  'game',
  'design',
  'translate',
  'target',
  'concept',
  'game',
  'system',
  'game',
  'play',
  'game',
  'goal',
  'relational',
  'structure',
  'game',
  'world',
  'design',
  'analog',
  'targeted',
  'conceptual',
  'domain',
  'player',
  'begin',
  'construct',
  'mental',
  'model',
  'target',
  'concept',
  'interactive',
  'gameplay',
  'make',
  'learn',
  'concrete',
  'embody',
  'gameplay',
  'experience',
  'design',
  'guide',
  'learner',
  

In [2]:
# Display the English stop-word set imported from spacy.lang.en.stop_words.
STOP_WORDS

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [4]:
# Check whether 'use' is treated as a stop word; a False result is consistent
# with 'use' still appearing in the lemma lists.
'use' in STOP_WORDS

False