<a href="https://colab.research.google.com/github/vigneshpatel14/NLP/blob/main/NLP_Live_Session_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""A Simple Script for Extracting Data from a Webpage
This script allows the user to extract data from a webpage and then export the data to a CSV file with column(s).
"""
# libraries
import urllib.request
from bs4 import BeautifulSoup
import csv




# Put your URL here
url = 'https://www.nytimes.com/books/best-sellers/combined-print-and-e-book-nonfiction/'

# Fetching the html.
# The response is used as a context manager so the underlying connection is
# closed as soon as the body has been read.
request = urllib.request.Request(url)
with urllib.request.urlopen(request) as response:
  content = response.read()
# Parsing the html
parse = BeautifulSoup(content, 'html.parser')

# Dump the parsed page so the tags / CSS classes to target can be looked up
print(parse)

# Provide html elements' attributes to extract the data
# NOTE(review): these class names look auto-generated and may change when the
# site is updated; if the CSV comes out empty, re-check them against the dump.
text1 = parse.find_all('h3', attrs={'class': 'css-5pe77f'})
text2 = parse.find_all('p', attrs={'class': 'css-hjukut'})

# Writing extracted data in a csv file
# - 'w' instead of 'a': re-running the cell rewrites the export rather than
#   appending a second header row and duplicate data rows to the old file.
# - newline='' is what the csv module expects; without it extra blank rows
#   can appear on platforms that translate line endings.
# - explicit utf-8 so titles with non-ASCII characters are written the same
#   way on every platform.
with open('index.csv', 'w', newline='', encoding='utf-8') as csv_file:
  writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
  writer.writerow(['Title','Author'])
  # zip() stops at the shorter list, so an unmatched title/author is dropped
  for col1,col2 in zip(text1, text2):
    writer.writerow([col1.get_text().strip(), col2.get_text().strip()])

<!DOCTYPE html>

<html lang="en" xmlns:og="http://opengraphprotocol.org/schema/">
<head>
<meta charset="utf-8"/>
<title data-rh="true">Combined Print &amp; E-Book Nonfiction - Best Sellers - Books</title>
<meta content="The New York Times Best Sellers are up-to-date and authoritative lists of the most popular books in the United States, based on sales in the past week, including fiction, non-fiction, paperbacks, children’s books, audiobooks, graphic books and more." data-rh="true" name="description"/><meta content="noarchive" data-rh="true" name="robots"/><meta content="Combined Print &amp; E-Book Nonfiction - Best Sellers - Books - The New York Times" data-rh="true" property="twitter:title"/><meta content="https://www.nytimes.com/books/best-sellers/combined-print-and-e-book-nonfiction/" data-rh="true" property="twitter:url"/><meta content="The New York Times Best Sellers are up-to-date and authoritative lists of the most popular books in the United States, based on sales in the past w

Tokenization and Tokenizers

In [None]:
# Install NLTK and spaCy, the two NLP libraries imported in the next cell.
!pip install nltk spacy



In [None]:
#nltk setup
import nltk
# Tokenizer and tagger data.
# Recent NLTK releases (3.9+) look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the older pickled 'punkt' /
# 'averaged_perceptron_tagger' packages. The pip install above is not pinned,
# so both generations are fetched to keep sent_tokenize / word_tokenize /
# pos_tag working whichever NLTK version gets installed.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tokenize import sent_tokenize, word_tokenize

#spaCy setup
import spacy
# Load the small English pipeline once; `nlp` is reused by the spaCy cells below.
nlp = spacy.load('en_core_web_sm')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Basic NLTK examples

# Sample paragraph shared by the NLTK cells that follow
text = "Mary had a little lamb. Her fleece was white as snow. Hello everyone this is a lecture on NLP"

# 1. Sentence tokenizer: split the paragraph into individual sentences
sentences = sent_tokenize(text)

# Heading first, then one line per sentence (numbering starts at 0)
print("Sentences:")
for number, sentence in enumerate(sentences):
  print(f"Sentence {number}, {sentence}")

Sentences:
Sentence 0, Mary had a little lamb.
Sentence 1, Her fleece was white as snow.
Sentence 2, Hello everyone this is a lecture on NLP


In [None]:
# Word tokenization: break the same text into word and punctuation tokens
words = word_tokenize(text)

# A single call prints the heading and the token list on separate lines
print("Words:", words, sep="\n")

Words:
['Mary', 'had', 'a', 'little', 'lamb', '.', 'Her', 'fleece', 'was', 'white', 'as', 'snow', '.', 'Hello', 'everyone', 'this', 'is', 'a', 'lecture', 'on', 'NLP']


In [None]:
# Analyzing the tokens

from nltk import pos_tag

# Part-of-speech tagging: pair every token from `words` with its tag
tagged = pos_tag(words)

print("\n Tokens with POS Tags:")

# One "token - tag" line per pair
for word, pos in tagged:
  print(f"{word} - {pos}")


 Tokens with POS Tags:
Mary - NNP
had - VBD
a - DT
little - JJ
lamb - NN
. - .
Her - PRP$
fleece - NN
was - VBD
white - JJ
as - IN
snow - NN
. - .
Hello - NNP
everyone - NN
this - DT
is - VBZ
a - DT
lecture - NN
on - IN
NLP - NNP


In [None]:
#4 working with specific patterns
# Checks how word_tokenize treats numbers, decimals and unit symbols.

text_with_numbers = "The year 2024 saw temperatures of 42.5°C in summer."

number_tokens = word_tokenize(text_with_numbers)

# In the recorded output '2024' and '42.5°C' each stay a single token.
print("Tokenization with numbers: ", number_tokens)

Tokenization with numbers:  ['The', 'year', '2024', 'saw', 'temperatures', 'of', '42.5°C', 'in', 'summer', '.']


In [None]:
# Basic spaCy example

text = """Hello everyone! We're learning about tokenization.
Mr. Smith bought a car, for $60,000.50 and paid in full."""

# Run the pipeline once; the resulting Doc is reused by the next cell
doc = nlp(text)

# Sentence segmentation, numbered from 1
print("Sentences:")
for number, sentence in enumerate(doc.sents, start=1):
  print(f"Sentence {number}, {sentence}")

Sentences:
Sentence 1, Hello everyone!
Sentence 2, We're learning about tokenization. 

Sentence 3, Mr. Smith bought a car, for $60,000.50 and paid in full.


In [None]:
#accessing tokens
# Walks over every token of the spaCy `doc` built in the previous cell and
# prints a handful of per-token attributes.

print("Tokens with attributes:")

for token in doc:
  # The blank lines and indentation inside the triple-quoted f-string are
  # printed as written, which is why each token's block in the output is
  # surrounded by empty lines. The line break in the input text shows up as
  # its own token (POS SPACE, "Is space: True") in the recorded output.
  print(f"""

  Token: {token}
  POS: {token.pos_}
  Tag: {token.tag_}
  Is number: {token.like_num}
  Is space: {token.is_space}
  Is punct: {token.is_punct}
  """)

# Finally print the Doc object itself
print(doc)

Tokens with attributes:

  
  Token: Hello
  POS: INTJ
  Tag: UH
  Is number: False
  Is space: False
  Is punct: False
  

  
  Token: everyone
  POS: PRON
  Tag: NN
  Is number: False
  Is space: False
  Is punct: False
  

  
  Token: !
  POS: PUNCT
  Tag: .
  Is number: False
  Is space: False
  Is punct: True
  

  
  Token: We
  POS: PRON
  Tag: PRP
  Is number: False
  Is space: False
  Is punct: False
  

  
  Token: 're
  POS: AUX
  Tag: VBP
  Is number: False
  Is space: False
  Is punct: False
  

  
  Token: learning
  POS: VERB
  Tag: VBG
  Is number: False
  Is space: False
  Is punct: False
  

  
  Token: about
  POS: ADP
  Tag: IN
  Is number: False
  Is space: False
  Is punct: False
  

  
  Token: tokenization
  POS: NOUN
  Tag: NN
  Is number: False
  Is space: False
  Is punct: False
  

  
  Token: .
  POS: PUNCT
  Tag: .
  Is number: False
  Is space: False
  Is punct: True
  

  
  Token: 

  POS: SPACE
  Tag: _SP
  Is number: False
  Is space: True
  Is punct:

In [None]:
# Advanced examples
from nltk.tokenize import RegexpTokenizer

# Sample containing an e-mail address, a price and trailing punctuation
text = "User@example.com paid $50.00 for the item!"

# Custom tokenizer. The alternatives are tried left to right at each position:
#   \w+       a run of word characters
#   [$€¥.]+   a run of currency symbols / periods
#   \S+       otherwise, whatever non-space characters come next
# That is why '@example.com' comes out as one token while '$50.00' is split
# into '$', '50', '.', '00'.
custom_tokenizer = RegexpTokenizer(r'\w+|[$€¥.]+|\S+')
custom_tokens = custom_tokenizer.tokenize(text)

print(custom_tokens)

['User', '@example.com', 'paid', '$', '50', '.', '00', 'for', 'the', 'item', '!']


In [None]:
# Tweet tokenization
from nltk.tokenize import TweetTokenizer

# Sample tweet with a handle, elongated words, hashtags and a URL
tweet = "@user This is sooo coooool!!! #NLP #Python https://example.com"

# strip_handles drops the @mention and reduce_len shortens long character
# repeats: in the recorded output '@user' is gone and 'coooool' becomes 'coool'.
tweet_tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
tweet_tokens = tweet_tokenizer.tokenize(tweet)

print(tweet_tokens)

['This', 'is', 'sooo', 'coool', '!', '!', '!', '#NLP', '#Python', 'https://example.com']
