This cell helps us with some SSL issues with the nltk library.

In [17]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## Tokenization

In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize
text = "Hello! This is an example sentence. Let's see how it's tokenized."

# Sentence Tokenization
sentences = sent_tokenize(text)
print("Sentences:", sentences)

# Word Tokenization
words = word_tokenize(text)
print("Words:", words)

Sentences: ['Hello!', 'This is an example sentence.', "Let's see how it's tokenized."]
Words: ['Hello', '!', 'This', 'is', 'an', 'example', 'sentence', '.', 'Let', "'s", 'see', 'how', 'it', "'s", 'tokenized', '.']


## Stop Words Removal

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

stop_words = set(stopwords.words('english'))

text = "Wow! This is an example showing off stop word filtration, removing irrelevant words 1 by 1."
word_tokens = word_tokenize(text)

# Remove punctuation from each word
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in word_tokens]

# Optionally, filter out remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]

filtered_sentence = [w.lower() for w in words if not w.lower() in stop_words]


print(words)

print("Filtered Sentence:", filtered_sentence)

['Wow', 'This', 'is', 'an', 'example', 'showing', 'off', 'stop', 'word', 'filtration', 'removing', 'irrelevant', 'words', 'by']
Filtered Sentence: ['wow', 'example', 'showing', 'stop', 'word', 'filtration', 'removing', 'irrelevant', 'words']


## Stemming and Lemmatization

In [3]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

word = "are"
print("Stemmed:", stemmer.stem(word))
print("Lemmatized:", lemmatizer.lemmatize(word, pos="v"))

Stemmed: are
Lemmatized: be


## Part of speech tagging

In [31]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize

text = "Natural language processing is fun and has many applications."
words = word_tokenize(text)

print("POS Tags:", pos_tag(words))

POS Tags: [('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('is', 'VBZ'), ('fun', 'NN'), ('and', 'CC'), ('has', 'VBZ'), ('many', 'JJ'), ('applications', 'NNS'), ('.', '.')]


In [32]:
import spacy 
nlp = spacy.load("en_core_web_sm")

# Process the text
text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)

header = ["Token", "Lemma", "POS", "Tag", "Dependency", "Shape", "Is Alpha", "Is Stop"]
print("{:<10} {:<10} {:<7} {:<5} {:<10} {:<7} {:<9} {:<8}".format(*header))
print("="*75)

for token in doc:
    print("{:<10} {:<10} {:<7} {:<5} {:<10} {:<7} {:<9} {:<8}".format(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop))
    
# You can also get explanations for the tags
print("\nExplanation for 'NNP':", spacy.explain('NNP'))
print("Explanation for 'VBG':", spacy.explain('VBG'))
print("Explanation for 'IN':", spacy.explain('IN'))

Token      Lemma      POS     Tag   Dependency Shape   Is Alpha  Is Stop 
Apple      Apple      PROPN   NNP   nsubj      Xxxxx   1         0       
is         be         AUX     VBZ   aux        xx      1         1       
looking    look       VERB    VBG   ROOT       xxxx    1         0       
at         at         ADP     IN    prep       xx      1         1       
buying     buy        VERB    VBG   pcomp      xxxx    1         0       
U.K.       U.K.       PROPN   NNP   dobj       X.X.    0         0       
startup    startup    NOUN    NN    dep        xxxx    1         0       
for        for        ADP     IN    prep       xxx     1         1       
$          $          SYM     $     quantmod   $       0         0       
1          1          NUM     CD    compound   d       0         0       
billion    billion    NUM     CD    pobj       xxxx    1         0       

Explanation for 'NNP': noun, proper singular
Explanation for 'VBG': verb, gerund or present participle
Explanat

In [22]:
from spacy import displacy

# Load the English model
nlp = spacy.load("en_core_web_sm")

# Process your text
text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)

# Visualize POS tags
displacy.render(doc, style="dep", jupyter=True, options={'distance': 90})

## Named entity recognition

In [33]:
from nltk import ne_chunk, pos_tag
from nltk.tokenize import word_tokenize
from nltk.tree import Tree

def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    return continuous_chunk

text = "Apple Inc. is an American multinational technology company headquartered in Cupertino, California."
print("Named Entities:", get_continuous_chunks(text))

Named Entities: ['Apple Inc.', 'American', 'Cupertino', 'California']


In [34]:
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
text = "Apple is looking at buying U.K. startup for $1 billion in June 2023."

doc = nlp(text)

colors = {
    "ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",  # Organizations
    "GPE": "linear-gradient(90deg, #fc9c9c, #fcaaaa)",  # Geopolitical entities
    "MONEY": "linear-gradient(90deg, #ffd700, #ffa500)",  # Money
    "DATE": "linear-gradient(90deg, #32cd32, #98fb98)"  # Dates
}

options = {
    "ents": ["ORG", "GPE", "MONEY", "DATE"],
    "colors": colors
}

displacy.render(doc, style="ent", options=options, jupyter=True)


See more on this also [here](https://demos.explosion.ai/displacy-ent).

## Sentiment Analysis

The sentiment property returns a namedtuple of the form Sentiment(polarity, subjectivity). The polarity score is a float within the range [-1.0, 1.0]. The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.

In [36]:
from textblob import TextBlob

text = "I hate natural language processing, because it's fascinating."
testimonial = TextBlob(text)
print("Polarity:", testimonial.sentiment.polarity)
print("Subjectivity:", testimonial.sentiment.subjectivity)

Polarity: -3.700743415417188e-17
Subjectivity: 0.7166666666666668
