## NLTK

In [1]:
import nltk
from nltk.stem import WordNetLemmatizer

In [2]:
# Single WordNetLemmatizer instance, reused by the NLTK cells below
lemmatizer = WordNetLemmatizer()

In [3]:
# Lemmatize one word without passing a POS; the output below shows "more"
# comes back unchanged
print(lemmatizer.lemmatize("more"))

more


In [4]:
# Sentence used to show lemmatization when no POS information is supplied
sentence = "I ran on Mars."

# Break the sentence into tokens (words and punctuation)
word_list = nltk.word_tokenize(sentence)
print(word_list)

# Lemmatize every token with the lemmatizer's default POS, then re-join with spaces
lemmatized_output = ' '.join(map(lemmatizer.lemmatize, word_list))
print(lemmatized_output)

['I', 'ran', 'on', 'Mars', '.']
I ran on Mars .


In [5]:
# Tag a single word on its own; the output below shows 'jumped' tagged NN
# rather than as a verb
print(nltk.pos_tag(['jumped']))

[('jumped', 'NN')]


In [23]:
# Sentence with a hyphenated token, to see how the tokenizer and tagger treat it
sentence = "I jumped-on Mars."

# Tokenize, then tag all tokens together as one sentence; the output below
# shows 'jumped-on' kept as a single token tagged JJ
print(nltk.pos_tag(nltk.word_tokenize(sentence)))

[('I', 'PRP'), ('jumped-on', 'JJ'), ('Mars', 'NNP'), ('.', '.')]


In [8]:
from nltk.corpus import wordnet

def get_wordnet_pos(word, tag=None):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    Args:
        word: Token to look up. Only used when ``tag`` is not supplied.
        tag: Optional POS tag string for ``word`` that was already computed,
            e.g. taken from ``nltk.pos_tag`` run over the whole tokenized
            sentence. When omitted, ``word`` is tagged on its own.

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``
        according to the first letter of the tag (J, N, V, R); any other tag
        falls back to ``wordnet.NOUN``.
    """
    if tag is None:
        # Tagging a one-element list gives the tagger no sentence context, so
        # callers that already have context-aware tags should pass ``tag``.
        tag = nltk.pos_tag([word])[0][1]

    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    # Slicing (rather than indexing) keeps an empty tag from raising and lets
    # it fall through to the NOUN default.
    return tag_dict.get(tag[:1].upper(), wordnet.NOUN)

In [9]:
# Lemmatize using the POS returned by get_wordnet_pos; the output below is
# still 'jumped', consistent with the isolated word having been tagged NN earlier
word = 'jumped'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

jumped


In [15]:
# Spot-check the helper on a bare word; the output below shows 'run' mapped to 'v'
get_wordnet_pos('run')

'v'

In [25]:
# Lemmatize every token of a sentence, looking up a POS for each word.
# NOTE(review): get_wordnet_pos calls nltk.pos_tag on each word by itself, so
# the tagger never sees the surrounding sentence here. Consider tagging the
# tokenized sentence once and mapping those tags instead.
sentence = "I have plants"
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])

['I', 'have', 'plant']


## STANZA

In [2]:
import stanza

In [3]:
# English Stanza pipeline, run on CPU, with the tokenize, pos and lemma
# processors that token_pos_lemma relies on.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,pos,lemma',
    tokenize_batch_size=500,
    lemma_batch_size=500,
    use_gpu=False,
)

2021-12-17 22:26:56 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |

2021-12-17 22:26:56 INFO: Use device: cpu
2021-12-17 22:26:56 INFO: Loading: tokenize
2021-12-17 22:26:56 INFO: Loading: pos
2021-12-17 22:26:58 INFO: Loading: lemma
2021-12-17 22:26:58 INFO: Done loading processors!


In [4]:
def token_pos_lemma(doc, pretokened=False, keep_numbers=True):
    """Tokenize, POS-tag and lemmatize ``doc`` with the global ``nlp`` pipeline.

    Only a subset of tokens is returned, in document order:

    * NOUN, VERB, ADJ and ADV tokens are kept as their lemma.
    * NUM tokens are kept as their lemma when ``keep_numbers`` is true
      (number detection through the tagger is not perfect).
    * PROPN, X and INTJ tokens are kept as their exact surface text, with no
      attempt at lemmatization, so that names (and hopefully scientific
      terms, if they get one of these tags) are left untouched.
    * Every other token is dropped.

    Args:
        doc: Raw text to process.
        pretokened: Must be false. The function has not been adapted for
            input that was already tokenized by another pipeline.
        keep_numbers: Whether NUM tokens are retained.

    Returns:
        List of kept lemmas / surface strings.

    Raises:
        AssertionError: If ``pretokened`` is true.
    """
    # Pre-tokenized input would not be handled correctly, so refuse it up front
    assert not pretokened

    # Tags whose lemma is kept; NUM joins the group only on request
    lemma_tags = ('NOUN', 'VERB', 'ADJ', 'ADV')
    if keep_numbers:
        lemma_tags = lemma_tags + ('NUM',)

    # Tags whose original text is kept verbatim
    verbatim_tags = ('PROPN', 'X', 'INTJ')

    kept = []

    # The pipeline call does the tokenizing, tagging and lemmatizing
    for sentence in nlp(doc).sentences:
        for token in sentence.words:
            if token.pos in lemma_tags:
                kept.append(token.lemma)
            elif token.pos in verbatim_tags:
                kept.append(token.text)

    return kept

In [43]:
#token_pos_lemma("I am jumping on Mars.")
# Run the Stanza-based helper on a full award abstract. The text is written as
# two adjacent string literals joined by an explicit "\n", because a quoted
# string literal cannot contain a raw line break.
wds = token_pos_lemma(
    "There is a technology trend to consolidate multiple applications onto a shared hardware platform to reduce the size, weight, power, and cost of real-time systems, such as self-driving vehicles and autonomous robots. Furthermore, modern platforms consist of Central Processing Units (CPUs) and Graphics Processing Units (GPUs) with an increasing number of processing cores that share resources.  Moreover, many current applications, such as artificial intelligence applications, have high computation needs and must execute in parallel to satisfy their real-time constraints. These technology trends demand that real-time systems be able to schedule real-time applications upon the shared multiple parallel resources efficiently.This research will investigate new parallel real-time scheduling frameworks for modern platforms with multiple resources. The scheduling problem is classified into two categories: staged-resources scheduling for alternating usage of different types of resources (e.g., alternatively executing on CPUs and GPUs), and vectorized-resources scheduling for simultaneously using multiple types of resources (e.g., running on processing units that share the last-level cache). The project will establish new parallel real-time task models for the two categories of resource usages. Based on the models, novel real-time schedulers and their corresponding analyses will be developed to achieve the goal of efficient utilization of multiple resources. The project will advance the understanding of parallel scheduling in real-time systems and serves as the initial steps of the challenge of efficient parallel real-time systems upon powerful and complex modern platforms. This project can have industrial impact on a wide range of today's artificial intelligence-based real-time systems to improve their responsiveness, efficiency, and scalability. \n"
    "The project includes enriching outreach activities and diversity programs to promote Science, Technology, Engineering and Mathematics (STEM) educational activity and broaden participation in computing and engineering.  Research products generated as part of this project will be retained, managed, and disseminated through resources available at the New Jersey Institute of Technology. The products will be preserved with the goal of storing them for at least three years after the completion of the project or the publication of the corresponding articles, whichever is later. The URL to the project repository is https://git.njit.edu/njit-prt.This award reflects NSF's statutory mission and has been deemed worthy of support through evaluation using the Foundation's intellectual merit and broader impacts review criteria."
)

In [44]:
# Show the tokens kept by the Stanza-based token_pos_lemma
print(wds)

['be', 'technology', 'trend', 'consolidate', 'multiple', 'application', 'share', 'hardware', 'platform', 'reduce', 'size', 'weight', 'power', 'cost', 'real', 'time', 'system', 'such', 'self', 'driving', 'vehicle', 'autonomous', 'robot', 'furthermore', 'modern', 'platform', 'consist', 'Central', 'processing', 'unit', 'cpus', 'graphic', 'processing', 'unit', 'gpus', 'increase', 'number', 'processing', 'core', 'share', 'resource', 'moreover', 'many', 'current', 'application', 'such', 'artificial', 'intelligence', 'application', 'have', 'high', 'computation', 'need', 'execute', 'parallel', 'satisfy', 'real', 'time', 'constraint', 'technology', 'trend', 'demand', 'real', 'time', 'system', 'able', 'schedule', 'real', 'time', 'application', 'share', 'multiple', 'parallel', 'resource', 'efficiently', 'research', 'investigate', 'new', 'parallel', 'real', 'time', 'scheduling', 'framework', 'modern', 'platform', 'multiple', 'resource', 'scheduling', 'problem', 'classify', 'two', 'category', 'stag

## SPACY

In [1]:
import spacy

In [32]:
import sys
# Download the small English spaCy model using the same Python executable
# that is running this kernel
!{sys.executable} -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Load spaCy's small English model with the 'parser' and 'ner' components
# disabled; the comparison below relies on POS tags, lemmas and stop-word flags.
nlp2 = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [45]:
#sentence = "I jumped on Mars."
# Same award abstract that was given to the Stanza helper. The text is written
# as two adjacent string literals joined by an explicit "\n", because a quoted
# string literal cannot contain a raw line break.
sentence = (
    "There is a technology trend to consolidate multiple applications onto a shared hardware platform to reduce the size, weight, power, and cost of real-time systems, such as self-driving vehicles and autonomous robots. Furthermore, modern platforms consist of Central Processing Units (CPUs) and Graphics Processing Units (GPUs) with an increasing number of processing cores that share resources.  Moreover, many current applications, such as artificial intelligence applications, have high computation needs and must execute in parallel to satisfy their real-time constraints. These technology trends demand that real-time systems be able to schedule real-time applications upon the shared multiple parallel resources efficiently.This research will investigate new parallel real-time scheduling frameworks for modern platforms with multiple resources. The scheduling problem is classified into two categories: staged-resources scheduling for alternating usage of different types of resources (e.g., alternatively executing on CPUs and GPUs), and vectorized-resources scheduling for simultaneously using multiple types of resources (e.g., running on processing units that share the last-level cache). The project will establish new parallel real-time task models for the two categories of resource usages. Based on the models, novel real-time schedulers and their corresponding analyses will be developed to achieve the goal of efficient utilization of multiple resources. The project will advance the understanding of parallel scheduling in real-time systems and serves as the initial steps of the challenge of efficient parallel real-time systems upon powerful and complex modern platforms. This project can have industrial impact on a wide range of today's artificial intelligence-based real-time systems to improve their responsiveness, efficiency, and scalability. \n"
    "The project includes enriching outreach activities and diversity programs to promote Science, Technology, Engineering and Mathematics (STEM) educational activity and broaden participation in computing and engineering.  Research products generated as part of this project will be retained, managed, and disseminated through resources available at the New Jersey Institute of Technology. The products will be preserved with the goal of storing them for at least three years after the completion of the project or the publication of the corresponding articles, whichever is later. The URL to the project repository is https://git.njit.edu/njit-prt.This award reflects NSF's statutory mission and has been deemed worthy of support through evaluation using the Foundation's intellectual merit and broader impacts review criteria."
)

# Run the loaded spaCy pipeline object `nlp2` over the text
doc = nlp2(sentence)

# Collect the lemma of every token that has one of the listed POS tags and
# is not flagged as a stop word
tokens = []

for token in doc:
    if token.pos_ not in ['NOUN', 'VERB', 'ADJ', 'ADV', 'PROPN', 'INTJ', 'NUM', 'X'] or token.is_stop:
        continue
    tokens.append(token.lemma_)

print(tokens)

['technology', 'trend', 'consolidate', 'multiple', 'application', 'share', 'hardware', 'platform', 'reduce', 'size', 'weight', 'power', 'cost', 'real', 'time', 'system', 'self', 'drive', 'vehicle', 'autonomous', 'robot', 'furthermore', 'modern', 'platform', 'consist', 'Central', 'Processing', 'Units', 'CPUs', 'Graphics', 'Processing', 'Units', 'gpu', 'increase', 'number', 'processing', 'core', 'share', 'resource', 'current', 'application', 'artificial', 'intelligence', 'application', 'high', 'computation', 'need', 'execute', 'parallel', 'satisfy', 'real', 'time', 'constraint', 'technology', 'trend', 'demand', 'real', 'time', 'system', 'able', 'schedule', 'real', 'time', 'application', 'share', 'multiple', 'parallel', 'resource', 'efficiently', 'research', 'investigate', 'new', 'parallel', 'real', 'time', 'scheduling', 'framework', 'modern', 'platform', 'multiple', 'resource', 'scheduling', 'problem', 'classify', 'category', 'stage', 'resource', 'scheduling', 'alternate', 'usage', 'diff

In [46]:
# Print the Stanza tokens again, directly after the spaCy tokens, for comparison
print(wds)

['be', 'technology', 'trend', 'consolidate', 'multiple', 'application', 'share', 'hardware', 'platform', 'reduce', 'size', 'weight', 'power', 'cost', 'real', 'time', 'system', 'such', 'self', 'driving', 'vehicle', 'autonomous', 'robot', 'furthermore', 'modern', 'platform', 'consist', 'Central', 'processing', 'unit', 'cpus', 'graphic', 'processing', 'unit', 'gpus', 'increase', 'number', 'processing', 'core', 'share', 'resource', 'moreover', 'many', 'current', 'application', 'such', 'artificial', 'intelligence', 'application', 'have', 'high', 'computation', 'need', 'execute', 'parallel', 'satisfy', 'real', 'time', 'constraint', 'technology', 'trend', 'demand', 'real', 'time', 'system', 'able', 'schedule', 'real', 'time', 'application', 'share', 'multiple', 'parallel', 'resource', 'efficiently', 'research', 'investigate', 'new', 'parallel', 'real', 'time', 'scheduling', 'framework', 'modern', 'platform', 'multiple', 'resource', 'scheduling', 'problem', 'classify', 'two', 'category', 'stag

In [3]:
# Run the spaCy pipeline on a single word so its token attributes can be inspected
doc = nlp2("data")

In [4]:
# Number of tokens in the processed doc
len(doc)

1

In [5]:
# Dump a handful of attributes of token i, one per line, in this order:
# text, lemma, POS, fine-grained tag, dependency label, shape,
# alphabetic flag, stop-word flag.
# NOTE(review): the dep_ line prints empty in the output below, presumably
# because nlp2 was loaded with the parser disabled.
i = 0
print(
    *(
        getattr(doc[i], name)
        for name in (
            "text", "lemma_", "pos_", "tag_",
            "dep_", "shape_", "is_alpha", "is_stop",
        )
    ),
    sep="\n",
)

data
datum
NOUN
NNS

xxxx
True
False


In [77]:
# Scratch check of list membership, the same kind of test the POS filters use
'VERB' in ['NOUN', 'VERB', 'ADJ']

True

In [47]:
# Distinct tokens produced by the spaCy cell (tokens) that are absent from the
# Stanza result (wds)
set(tokens) - set(wds)

{'CPUs',
 'Engineering',
 'GPUs',
 'Graphics',
 'Processing',
 'STEM',
 'Science',
 'Units',
 'broaden',
 'cpu',
 'drive',
 'gpu'}

In [48]:
# Distinct tokens produced by the Stanza helper (wds) that are absent from the
# spaCy result (tokens)
set(wds) - set(tokens)

{'Research',
 'at',
 'be',
 'broader',
 'cpus',
 'driving',
 'gpus',
 'graphic',
 'have',
 'last',
 'least',
 'many',
 'moreover',
 'part',
 'science',
 'stem',
 'such',
 'three',
 'two',
 'use'}

In [64]:
from spacy.lang.en.stop_words import STOP_WORDS

In [65]:
# Size of the English stop-word collection imported from spaCy
len(STOP_WORDS)

326

In [68]:
# Show the stop words themselves
print(STOP_WORDS)

{'beforehand', 'four', 'all', 'indeed', 'from', 'very', '’ll', '’ve', '’re', 'wherever', 'another', 'i', 'then', 'alone', 'already', 'take', 'none', 'whether', 'whole', 'elsewhere', 'he', 'beside', 'somewhere', 'its', 'nothing', 'at', 'around', 'eight', 'their', 'latterly', 'down', 'during', 'go', 'only', 'nor', 'did', 'of', 'being', 'becoming', 'never', 'fifteen', 'per', 'together', 'ca', 'above', 'say', 'neither', 'various', 'too', 'seems', 'should', 'when', 'yet', 'really', 'six', 'via', 'you', 'somehow', 'itself', 'formerly', 'used', 'her', 'still', 'much', 'whither', 'your', 'if', 'move', 'whose', 'serious', 'bottom', '‘ll', 'most', 'towards', 'every', 'afterwards', 'may', "n't", 'before', 'just', 'hereupon', 'hereafter', 'each', 'out', 'us', 'twenty', 'thus', 'either', 'moreover', 'also', 'everyone', 'now', 'seem', 'along', 'has', '‘m', 'be', 'see', 'so', 'off', 'his', 'or', 'these', 'here', 'the', 'any', 'two', 'though', 'empty', 'wherein', 'full', 'behind', 'almost', 'although'

In [None]:
# Takeaway: use spaCy.

# A single pipeline call tokenizes, POS-tags and lemmatizes, and each token carries an is_stop flag for stop-word removal.