# Tokenization

In [43]:
# One-time environment setup: uncomment and run these lines if the packages or
# the English model are not installed yet.
# Install the libraries first; the model download is run through the spacy
# module, so spaCy has to be present before it can work.
# %pip install spacy nltk
# !python -m spacy download en_core_web_sm

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     - -------------------------------------- 0.0/1.5 MB 991.0 kB/s eta 0:00:02
     -- ------------------------------------- 0.1/1.5 MB 1.5 MB/s eta 0:00:01
     ---- ----------------------------------- 0.2/1.5 MB 1.5 MB/s eta 0:00:01
     ---- ----------------------------------- 0.2/1.5 MB 1.5 MB/s eta 0:00:01
     ------- -------------------------------- 0.3/1.5 MB 1.4 MB/s eta 0:00:01
     ---------- ----------------------------- 0.4/1.5 MB 1.5 MB/s eta 0:00:01
     ------------ --------------------------- 0.5/1.5 MB 1.5 MB/s eta 0:00:01
     ------------ --------------------------- 0.5/1.5 MB 1.5 MB/s eta 0:00:01
     ----------------- ---------------------- 0.6/1.5 MB 1.6 MB/s eta 0:00:01
     ------------------ --------------------- 0.7/1.5 MB 1.5 MB/s eta 0:00:01
     --------------------- ------------------ 0.8/1.5 MB 1.6 MB/s eta 0:00:01
  

In [31]:
import spacy
from spacy import displacy
# Load the en_core_web_sm pipeline; the tokenization, NER and lemma cells
# below all call this `nlp` object.
model_name = "en_core_web_sm"
nlp = spacy.load(model_name)


In [25]:
# Sample sentences for the demos below: index 0 feeds the token / entity cell,
# index 1 feeds the noun-chunk cell.
entity_sentence = "Apple to bring a hong Kong factory for $8 million"
noun_chunk_sentence = "Autonomous cars shift insurance liability towards manufactures"
strings = [entity_sentence, noun_chunk_sentence]

In [28]:
# Tokenize the first sample sentence and list the named entities found in it.
doc = nlp(strings[0])
print("sentence: ", strings[0])

# Every token is written as "\t <text>|" on one line; no newline is emitted,
# so the "Entities" header printed next continues on that same line.
token_line = "".join(f"\t {token.text}|" for token in doc)
print(token_line, end="")
print("\t\tEntities")

# One row per entity: span text, entity label, and spaCy's description of the label.
for ent in doc.ents:
    label = ent.label_
    print(f"\t<{ent}, {label}>", spacy.explain(label))

sentence:  Apple to bring a hong Kong factory for $8 million
	 Apple|	 to|	 bring|	 a|	 hong|	 Kong|	 factory|	 for|	 $|	 8|	 million|		Entities
	<Apple, ORG> Companies, agencies, institutions, etc.
	<hong Kong, GPE> Countries, cities, states
	<$8 million, MONEY> Monetary values, including unit


In [23]:
# List each named entity of the current `doc` with its label and the
# description spaCy gives for that label.
for ent in doc.ents:
    tag = ent.label_
    print(f"<{ent}, {tag}>", spacy.explain(tag))

<Apple, ORG> Companies, agencies, institutions, etc.
<hong Kong, GPE> Countries, cities, states
<$8 million, MONEY> Monetary values, including unit


In [30]:
# Parse the second sample sentence, echo it, then list its noun chunks.
doc = nlp(strings[1])
print(doc)
# Each chunk is followed by "|"; nothing (not even a newline) is written
# when the document has no noun chunks.
chunk_line = "".join(f"{chunk}|" for chunk in doc.noun_chunks)
print(chunk_line, end="")

Autonomous cars shift insurance liability towards manufactures
Autonomous cars|insurance liability|manufactures|

In [40]:
# Draw the dependency parse of the current `doc` inline in the notebook.
render_options = {"distance": 90}
displacy.render(doc, style="dep", jupyter=True, options=render_options)

In [41]:
# Sample sentence for the entity visualisation in the next cell. It is written
# with correct spelling and capitalised proper nouns so the NER model is shown
# well-formed text.
doc = nlp("Over the last quarter Apple sold nearly 40 thousand iPods for a profit of $60 million")

In [42]:
# Highlight the named entities of the current `doc` inline in the notebook.
displacy.render(doc, jupyter=True, style="ent")

# Stemming

In [50]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [53]:
# Two NLTK stemmers, created side by side so their results can be compared
# word by word in the loop below.
s_stemmer = SnowballStemmer("english")
p_stemmer = PorterStemmer()

In [57]:
# Related word forms used to compare the two stemmers.
words = "run runner ran runs easily fairly fairness".split()

In [58]:
# Print "<word>--><Porter stem>||<Snowball stem>" for every sample word.
for word in words:
    porter_stem = p_stemmer.stem(word)
    snowball_stem = s_stemmer.stem(word)
    print(f"{word}-->{porter_stem}||{snowball_stem}")

run-->run||run
runner-->runner||runner
ran-->ran||ran
runs-->run||run
easily-->easili||easili
fairly-->fairli||fair
fairness-->fair||fair


# Lemmatization

In [62]:
# Sentence containing several forms of "run", used to inspect the lemmas in the next cell.
lemma_sentence = "I am a runner running in a race because I love to run since I ran today"
doc = nlp(lemma_sentence)


In [63]:
# Tab-separated columns: token text, POS tag, lemma as an integer ID
# (`lemma`), and lemma as readable text (`lemma_`).
for tok in doc:
    print(tok.text, tok.pos_, tok.lemma, tok.lemma_, sep="\t")

I	PRON	4690420944186131903	I
am	AUX	10382539506755952630	be
a	DET	11901859001352538922	a
runner	NOUN	12640964157389618806	runner
running	VERB	12767647472892411841	run
in	ADP	3002984154512732771	in
a	DET	11901859001352538922	a
race	NOUN	8048469955494714898	race
because	SCONJ	16950148841647037698	because
I	PRON	4690420944186131903	I
love	VERB	3702023516439754181	love
to	PART	3791531372978436496	to
run	VERB	12767647472892411841	run
since	SCONJ	10066841407251338481	since
I	PRON	4690420944186131903	I
ran	VERB	12767647472892411841	run
today	NOUN	11042482332948150395	today


# Phrase matching and pattern matching  
[Examples](https://spacy.pythonhumanities.com/02_02_matcher.html)

In [64]:
from spacy.matcher import Matcher

# Rule-based matching: build a Matcher on the pipeline's vocabulary and
# register two single-token patterns under one key.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

email_rule = [{"LIKE_EMAIL": True}]   # a token flagged as looking like an e-mail address
proper_noun_rule = [{"POS": "PROPN"}]  # a token tagged as a proper noun
pattern = [email_rule, proper_noun_rule]
matcher.add("My_Pattern", pattern)

text = "You can contact Data Science Learner through email address contact@datasciencelearner.com"
doc = nlp(text)
matches = matcher(doc)

# Each match is (match_id, start, end); print the matched token span.
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span)

Data
Science
Learner
contact@datasciencelearner.com


# Word Normalization

In [65]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

In [66]:

# Download the NLTK resources used by the cells below.
# "punkt" backs nltk.word_tokenize on NLTK 3.8.1 (the version shown in the
# install log at the top); later NLTK releases look for "punkt_tab" instead,
# so both are fetched to keep the notebook runnable after an unpinned install.
# "stopwords" and "wordnet" back the stop-word filter and the WordNet lemmatizer.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vk001\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vk001\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vk001\AppData\Roaming\nltk_data...


True

In [67]:

# Text normalization walkthrough: the sentence is printed after every step.
raw_text = "The quick brown fox, jumps over the lazy dog!"
print(raw_text)

# Step 1: lowercase everything.
lowered_text = raw_text.lower()
print(lowered_text)

# Step 2: drop every character that is neither a word character nor whitespace.
# The result is bound to `text`, which the stemming cell tokenizes.
text = re.sub(r'[^\w\s]', '', lowered_text)
print(text)

The quick brown fox, jumps over the lazy dog!
the quick brown fox, jumps over the lazy dog!
the quick brown fox jumps over the lazy dog


In [68]:

# Perform stemming
# Split the normalized text into word tokens, then apply the Porter stemmer to each.
stemmer = PorterStemmer()
tokens = nltk.word_tokenize(text)
stemmed_tokens = list(map(stemmer.stem, tokens))
print("stemmed tokens \n", stemmed_tokens)

stemmed tokens 
 ['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog']


In [69]:

# Perform lemmatization
# Run every token from the tokenization step through the WordNet lemmatizer.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))
print(lemmatized_tokens)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog']


In [70]:

# Remove stop words
# The English stop-word list is turned into a set once, so each membership
# test in the comprehension is a set lookup rather than a list scan.
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]

print(filtered_tokens)


['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']
