# IMPORTING NECESSARY LIBRARIES

In [1]:
# run the next line only once if needed 
# This model includes word vectors and is useful for various natural language processing tasks.
!python -m spacy download en_core_web_lg 

# Importing the spaCy library
import spacy

# Loading the downloaded English language model for spaCy
nlp = spacy.load("en_core_web_lg")

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     -------------------------------------- 0.0/587.7 MB 325.1 kB/s eta 0:30:08
     ---------------------------------------- 0.2/587.7 MB 1.3 MB/s eta 0:07:31
     ---------------------------------------- 0.7/587.7 MB 4.4 MB/s eta 0:02:15
     ---------------------------------------- 1.4/587.7 MB 6.7 MB/s eta 0:01:28
     ---------------------------------------- 2.0/587.7 MB 8.5 MB/s eta 0:01:10
     ---------------------------------------- 2.8/587.7 MB 9.8 MB/s eta 0:01:00
     --------------------------------------- 3.5/587.7 MB 10.5 MB/s eta 0:00:56
     --------------------------------------- 4.1/587.7 MB 10.8 MB/s eta 0:00:54
     --------------------------------

# TOKENIZATION ON SMALL TEXT

In [2]:
# Define a text string containing a sentence.
text = "My best friend Ryan Peters likes fancy adventure games."

# Process the text using the spaCy language model loaded previously.
doc = nlp(text)

# Iterate through each token in the processed document and print them with a delimiter "|".
for token in doc:
    print(token, end=' | ')

My | best | friend | Ryan | Peters | likes | fancy | adventure | games | . | 

# THE ATTRIBUTES THE SPACY ADDS

In [3]:
# Import the pandas library with alias pd for data manipulation and analysis.
import pandas as pd

# Define a function display_nlp that takes a spaCy document (doc) and an optional flag (include_punct) 
# to decide whether to include punctuation tokens in the output DataFrame.
def display_nlp(doc, include_punct=False):
    """
    Generate a DataFrame for visualization of spaCy tokens.
    
    Parameters:
        doc (spacy.Doc): The processed spaCy document.
        include_punct (bool): Flag to include punctuation tokens. Default is False.

    Returns:
        pd.DataFrame: DataFrame containing token information.
    """
    # Initialize an empty list to store rows of token information.
    rows = []
    
    # Iterate through each token in the spaCy document.
    for i, t in enumerate(doc):
        # Check if the token is not punctuation or if include_punct flag is True.
        if not t.is_punct or include_punct:
            # Create a dictionary containing token information.
            row = {'token': i,  'text': t.text, 'lemma_': t.lemma_, 
                   'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
                   'pos_': t.pos_, 'dep_': t.dep_, 
                   'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
            # Append the dictionary to the rows list.
            rows.append(row)
    
    # Create a DataFrame from the list of token information rows.
    df = pd.DataFrame(rows).set_index('token')
    
    # Set the index name to None for cleaner display.
    df.index.name = None
    
    # Return the DataFrame.
    return df

# Call the display_nlp function with the spaCy document (doc) to generate a DataFrame
# for visualization of spaCy tokens.
display_nlp(doc)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,Ryan,Ryan,False,True,PROPN,compound,PERSON,B
4,Peters,Peters,False,True,PROPN,appos,PERSON,I
5,likes,like,False,True,VERB,ROOT,,O
6,fancy,fancy,False,True,ADJ,amod,,O
7,adventure,adventure,False,True,NOUN,compound,,O
8,games,game,False,True,NOUN,dobj,,O


# REMOVING STOPWORS

In [4]:
# Define a text string containing a sentence.
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"

# Process the text using the spaCy language model loaded previously.
doc = nlp(text)

# Create a list comprehension to filter out tokens that are neither stop words nor punctuation.
non_stop = [t for t in doc if not t.is_stop and not t.is_punct]

# Print the list of non-stopword and non-punctuation tokens.
print(non_stop)

[Dear, Ryan, need, sit, talk, Regards, Pete]


# FINDING ALL NOUNS

In [5]:
# Define a text string containing a sentence.
text = "My best friend Ryan Peters likes fancy adventure games."

# Process the text using the spaCy language model loaded previously.
doc = nlp(text)

# Create a list comprehension to filter out tokens that are nouns or proper nouns.
nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]

# Print the list of tokens identified as nouns or proper nouns.
print(nouns)

[friend, Ryan, Peters, adventure, games]


# NAMED ENTITY RECOGNITION

In [6]:
# Define a text string containing a sentence.
text = "My best friend Ryan Peters likes fancy adventure games."

# Process the text using the spaCy language model loaded previously.
doc = nlp(text)

# Iterate through each named entity in the processed document and print its text and label.
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(Ryan Peters, PERSON) 

# TRYING HARDER ONE

In [7]:
# Define a text string containing a sentence.
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco."

# Process the text using the spaCy language model loaded previously.
doc = nlp(text)

# Iterate through each named entity in the processed document and print its text and label.
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

# VISUALIZING NERS

In [8]:
# Import the displacy module from spaCy for entity visualization.
from spacy import displacy

# Render the spaCy document (doc) with entity visualization style and display it in Jupyter Notebook.
displacy.render(doc, style='ent', jupyter=True)

# INSTALLING NECCESSARY LIBRARIES

In [9]:
!pip install html5lib
!pip install --upgrade pip setuptools
!pip install beautifulsoup4

Collecting pip
  Obtaining dependency information for pip from https://files.pythonhosted.org/packages/8a/6a/19e9fe04fca059ccf770861c7d5721ab4c2aebc539889e97c7977528a53b/pip-24.0-py3-none-any.whl.metadata
  Using cached pip-24.0-py3-none-any.whl.metadata (3.6 kB)
Collecting setuptools
  Obtaining dependency information for setuptools from https://files.pythonhosted.org/packages/92/e1/1c8bb3420105e70bdf357d57dd5567202b4ef8d27f810e98bb962d950834/setuptools-69.2.0-py3-none-any.whl.metadata
  Using cached setuptools-69.2.0-py3-none-any.whl.metadata (6.3 kB)
Using cached pip-24.0-py3-none-any.whl (2.1 MB)
Using cached setuptools-69.2.0-py3-none-any.whl (821 kB)


ERROR: To modify pip, please run the following command:
C:\Users\DELL\anaconda3\python.exe -m pip install --upgrade pip setuptools




# LOADING DATASET

In [10]:
# Import necessary libraries
from bs4 import BeautifulSoup
import requests
import re

# Define a function to extract text content from a given URL
def url_to_string(url):
    # Send a GET request to the URL
    res = requests.get(url)
    # Extract the HTML content
    html = res.text
    # Create a BeautifulSoup object to parse the HTML
    soup = BeautifulSoup(html, 'html5lib')
    # Remove unwanted elements like scripts, styles, and asides
    for script in soup(["script", "style", 'aside']):
        script.extract()
    # Extract the text content and join it into a single string
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Use the defined function to fetch content from a given URL (in this case, the New York Times trending page)
ny_bb = url_to_string('https://www.nytimes.com/trending/')

# Process the extracted text using the spaCy language model loaded previously
article = nlp(ny_bb)

# Calculate the number of named entities in the processed text
len(article.ents)

117

# HAVE A LOOK AT THE NERS

In [11]:
# Import the displacy module from spaCy for entity visualization.
from spacy import displacy

# Render the spaCy document (article) with entity visualization style and display it in Jupyter Notebook.
displacy.render(article, style='ent', jupyter=True)

# MOST POPULAR NER TYPES

In [12]:
# Download the large English language model for spaCy if it hasn't been downloaded already.
!python -m spacy download en_core_web_lg 

# Import the spaCy library
import spacy

# Load the downloaded English language model for spaCy
nlp = spacy.load("en_core_web_lg")

# Import Counter from collections module to count occurrences of each element
from collections import Counter

# Extract the labels of named entities from the processed text and count their occurrences
labels = [x.label_ for x in article.ents]
counter = Counter(labels)

# Print the count of each entity label
print(counter)

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     -------------------------------------- 0.0/587.7 MB 325.1 kB/s eta 0:30:08
     ---------------------------------------- 0.2/587.7 MB 1.3 MB/s eta 0:07:29
     ---------------------------------------- 0.7/587.7 MB 4.5 MB/s eta 0:02:11
     ---------------------------------------- 1.5/587.7 MB 6.6 MB/s eta 0:01:29
     ---------------------------------------- 2.2/587.7 MB 8.2 MB/s eta 0:01:12
     ---------------------------------------- 3.0/587.7 MB 9.5 MB/s eta 0:01:02
     --------------------------------------- 3.7/587.7 MB 10.3 MB/s eta 0:00:57
     --------------------------------------- 4.4/587.7 MB 10.7 MB/s eta 0:00:55
     --------------------------------

Counter({'ORG': 37, 'PERSON': 19, 'GPE': 18, 'CARDINAL': 12, 'DATE': 9, 'MONEY': 7, 'PRODUCT': 4, 'WORK_OF_ART': 4, 'NORP': 4, 'TIME': 1, 'ORDINAL': 1, 'FAC': 1})


# MOST POPULAR NER

In [13]:
# Extract the text of named entities from the processed text and count their occurrences
items = [x.text for x in article.ents]
counter_items = Counter(items)

# Get the five most common named entities and their counts
most_common_entities = counter_items.most_common(5)

# Print the result
print(most_common_entities)

[('New York', 3), ('4', 3), ('Idaho', 2), ('56 Pounds', 2), ('Italian', 2)]


# PRINTING SENTENCES

In [14]:
# Import the spaCy library
import spacy

# Load the English language model for spaCy
nlp = spacy.load("en_core_web_lg")

# Extract sentences containing named entities from the processed text
sentences = [x for x in article.ents]

# Print the 21st sentence containing named entities
print(sentences[30])

2:27Key


# NER TAGS

In [15]:
# Import the displacy module from spaCy for entity visualization.
from spacy import displacy

# Render the 21st sentence containing named entities with entity visualization style
displacy.render(nlp(str(sentences[30])), jupyter=True, style='ent')

# TYPES OF WORDS IN THE SENTENCE

In [16]:
# Extract non-stopword and non-punctuation tokens from the 21st sentence containing named entities
tokens_info = [(x.orth_, x.pos_, x.lemma_) for x in [y for y in nlp(str(sentences[30])) if not y.is_stop and y.pos_ != 'PUNCT']]

# Print the tokens' orthographic form, part-of-speech, and lemma
print(tokens_info)

[('2:27Key', 'NUM', '2:27key')]


# SENTENCE DEPENDENCY TREE

In [17]:
# Import the displacy module from spaCy for dependency visualization.
from spacy import displacy

# Render the dependency parse of the 21st sentence containing named entities and display it in Jupyter Notebook.
# Adjust the distance between tokens for better visualization.
displacy.render(nlp(str(sentences[30])), style='dep', jupyter=True, options={'distance': 120})