In [4]:
# Importing the Libraries
import spacy
import pandas as pd

In [5]:
# Loading the English Models
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.4.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
# Creating the Doc Object 
text = """
Quantum computing is the use of quantum-mechanical phenomena such as superposition and entanglement to perform computation. A quantum computer is used to perform such computation, which can be implemented theoretically or physically.
"""
text = text.lower()

doc = nlp(text)

<h2 style="color:blue;">Tokenization</h2>

In [7]:
# Displaying the tokens 
tokens = list()

for token in doc:
    tokens.append(token.text)
    
print(tokens)

['\n', 'quantum', 'computing', 'is', 'the', 'use', 'of', 'quantum', '-', 'mechanical', 'phenomena', 'such', 'as', 'superposition', 'and', 'entanglement', 'to', 'perform', 'computation', '.', 'a', 'quantum', 'computer', 'is', 'used', 'to', 'perform', 'such', 'computation', ',', 'which', 'can', 'be', 'implemented', 'theoretically', 'or', 'physically', '.', '\n']


<h2 style="color:blue;">Sentence Tokenization</h2>

In [8]:
# Now we will be doing Sentence Tokenizatio
sentence_tokens = list()

for sentence_token in doc.sents:
    sentence_tokens.append(sentence_token.text)

print(sentence_tokens)

['\nquantum computing is the use of quantum-mechanical phenomena such as superposition and entanglement to perform computation.', 'a quantum computer is used to perform such computation, which can be implemented theoretically or physically.\n']


<h2 style="color:blue;">Tokenization and Stop Words Removal</h2>

In [9]:
# Filtering stop words and removing the duplicates
filtered_words = list()

for word in doc:
    if word.is_stop==False:
        filtered_words.append(word)
        
filtered_words = list(dict.fromkeys(filtered_words))
print(filtered_words)

[
, quantum, computing, use, quantum, -, mechanical, phenomena, superposition, entanglement, perform, computation, ., quantum, computer, perform, computation, ,, implemented, theoretically, physically, ., 
]


<h2 style="color:blue;">Lemmatization</h2>

In [13]:
# Printing the Lemma for each word
doc = nlp("implemented cooking stopping died played disturbing")

words = list()
lemma_list = list()

for word in doc:
    words.append(word.text)
    lemma_list.append(word.lemma_)

lemma_list

['implement', 'cooking', 'stopping', 'die', 'play', 'disturb']

In [11]:
df = pd.DataFrame({"Word": words,
                  "Lemma": lemma_list})

In [12]:
df

Unnamed: 0,Word,Lemma
0,implemented,implement
1,cooking,cooking
2,stopping,stopping
3,died,die
4,played,play
5,disturbing,disturb
