In [1]:
"""A Simple Script for Extracting Data from a Webpage
This script allows the user to extract data from a webpage and then export the data to a CSV file with one or more columns.
"""
# libraries
import urllib.request
from bs4 import BeautifulSoup
import csv




# Put your URL here
url = 'https://www.nytimes.com/books/best-sellers/combined-print-and-e-book-nonfiction/'

# Fetching the html. The response is used as a context manager so the
# connection is released as soon as the page has been parsed.
request = urllib.request.Request(url)
with urllib.request.urlopen(request) as content:
  # Parsing the html
  parse = BeautifulSoup(content, 'html.parser')

# Provide html elements' attributes to extract the data.
# The whole document is printed so the tag names / class attributes used
# below can be looked up in the output.
print(parse)

text1 = parse.find_all('h3', attrs={'class': 'css-5pe77f'})
text2 = parse.find_all('p', attrs={'class': 'css-hjukut'})

# zip() stops at the shorter list, so a mismatch (or no match at all) would
# otherwise silently drop rows or produce a header-only file.
if not text1 or len(text1) != len(text2):
  print(f'Warning: matched {len(text1)} title(s) and {len(text2)} author(s); '
        'check the tag names and class attributes used above.')

# Writing extracted data in a csv file.
# - 'w' rather than 'a': re-running the cell must not append a second header
#   row and a duplicate copy of the data to an existing index.csv.
# - newline='': required by the csv module so no blank rows appear on Windows.
# - encoding='utf-8': the scraped text may contain non-ASCII characters.
with open('index.csv', 'w', newline='', encoding='utf-8') as csv_file:
  writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
  writer.writerow(['Title','Author'])
  for col1,col2 in zip(text1, text2):
    writer.writerow([col1.get_text().strip(), col2.get_text().strip()])

<!DOCTYPE html>

<html lang="en" xmlns:og="http://opengraphprotocol.org/schema/">
<head>
<meta charset="utf-8"/>
<title data-rh="true">Combined Print &amp; E-Book Nonfiction - Best Sellers - Books</title>
<meta content="The New York Times Best Sellers are up-to-date and authoritative lists of the most popular books in the United States, based on sales in the past week, including fiction, non-fiction, paperbacks, children’s books, audiobooks, graphic books and more." data-rh="true" name="description"/><meta content="noarchive" data-rh="true" name="robots"/><meta content="Combined Print &amp; E-Book Nonfiction - Best Sellers - Books - The New York Times" data-rh="true" property="twitter:title"/><meta content="https://www.nytimes.com/books/best-sellers/combined-print-and-e-book-nonfiction/" data-rh="true" property="twitter:url"/><meta content="The New York Times Best Sellers are up-to-date and authoritative lists of the most popular books in the United States, based on sales in the past w

In [2]:
!pip install nlTk spacy



In [3]:
import nltk

# Fetch the NLTK resources used below: the 'punkt' tokenizer data and both
# variants of the averaged-perceptron tagger.
for resource in ('punkt', 'averaged_perceptron_tagger_eng', 'averaged_perceptron_tagger'):
  nltk.download(resource)

from nltk.tokenize import sent_tokenize, word_tokenize

# spaCy: load the small English pipeline once; later cells reuse `nlp`.
import spacy

nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [4]:
# Also fetch the 'punkt_tab' tokenizer data; the call's return value is
# shown as the cell output.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
# Sample input: two sentences (note the space left before the first full stop).
text = "The quick brown fox jumps over the lazy dog . This is a text from NLP"

# 1. Sentence tokenizer: split the raw string into sentences.
sentences = sent_tokenize(text)
print("sentences")

# Report every sentence together with its 1-based position.
for position, sentence in enumerate(sentences, start=1):
  print(f'Sentence {position} : {sentence}')

sentences
Sentence 1 : The quick brown fox jumps over the lazy dog .
Sentence 2 : This is a text from NLP


In [6]:
# Word tokenizer: split the same sample text into individual tokens.
words = word_tokenize(text)
print("words")

# Report every token together with its 1-based position.
for position, token_text in enumerate(words, start=1):
  print(f'Word {position} : {token_text}')

words
Word 1 : The
Word 2 : quick
Word 3 : brown
Word 4 : fox
Word 5 : jumps
Word 6 : over
Word 7 : the
Word 8 : lazy
Word 9 : dog
Word 10 : .
Word 11 : This
Word 12 : is
Word 13 : a
Word 14 : text
Word 15 : from
Word 16 : NLP


In [7]:
from nltk import pos_tag

# Attach a part-of-speech tag to each token in `words`; the result is a
# list of (token, tag) pairs that is printed in full.
tagged = pos_tag(words)

print(tagged)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.'), ('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('text', 'NN'), ('from', 'IN'), ('NLP', 'NNP')]


In [8]:
# Print one "token -- tag" line for every tagged pair.
for word, pos in tagged:
  print(word, pos, sep=' -- ')

The -- DT
quick -- JJ
brown -- NN
fox -- NN
jumps -- VBZ
over -- IN
the -- DT
lazy -- JJ
dog -- NN
. -- .
This -- DT
is -- VBZ
a -- DT
text -- NN
from -- IN
NLP -- NNP


In [9]:
# Identify specific patterns: see how the word tokenizer treats a year and
# a decimal number with a unit suffix.
text_with_numbers = "The year 2024 saw temperature of 42.5c in summer."

words_with_numbers = word_tokenize(text_with_numbers)

print(words_with_numbers)

['The', 'year', '2024', 'saw', 'temperature', 'of', '42.5c', 'in', 'summer', '.']


In [10]:
# spaCy: sentence segmentation on a two-line sample text.

text = """Hello everyone! We're learning about tokenization.
Mr. Smith bought a car, for $60,000.50 and paid in full."""

# Run the loaded pipeline over the text; `doc` is reused by later cells.
doc = nlp(text)

# Number the sentences from 1 rather than 0.
for number, sentence in enumerate(doc.sents, start=1):
  print(f'Sentence {number} : {sentence}')

Sentence 1 : Hello everyone!
Sentence 2 : We're learning about tokenization.

Sentence 3 : Mr. Smith bought a car, for $60,000.50 and paid in full.


In [11]:
print("Tokens with attributes:")

# For every token in `doc`, print a small report: its text, coarse and
# fine-grained POS tags, and the number / whitespace / punctuation flags.
for token in doc:
  token_report = f"""

  Token: {token}
  POS: {token.pos_}
  Tag: {token.tag_}
  Is number: {token.like_num}
  Is space: {token.is_space}
  Is punct: {token.is_punct}
  """
  print(token_report)

# Finally print the document itself.
print(doc)

Tokens with attributes:


  Token: Hello
  POS: INTJ
  Tag: UH
  Is number: False
  Is space: False
  Is punct: False
  


  Token: everyone
  POS: PRON
  Tag: NN
  Is number: False
  Is space: False
  Is punct: False
  


  Token: !
  POS: PUNCT
  Tag: .
  Is number: False
  Is space: False
  Is punct: True
  


  Token: We
  POS: PRON
  Tag: PRP
  Is number: False
  Is space: False
  Is punct: False
  


  Token: 're
  POS: AUX
  Tag: VBP
  Is number: False
  Is space: False
  Is punct: False
  


  Token: learning
  POS: VERB
  Tag: VBG
  Is number: False
  Is space: False
  Is punct: False
  


  Token: about
  POS: ADP
  Tag: IN
  Is number: False
  Is space: False
  Is punct: False
  


  Token: tokenization
  POS: NOUN
  Tag: NN
  Is number: False
  Is space: False
  Is punct: False
  


  Token: .
  POS: PUNCT
  Tag: .
  Is number: False
  Is space: False
  Is punct: True
  


  Token: 

  POS: SPACE
  Tag: _SP
  Is number: False
  Is space: True
  Is punct: False
  


  Token:

In [12]:
# Print each sentence of `doc` on its own.
for sentence in doc.sents:
  print(sentence)

Hello everyone!
We're learning about tokenization.

Mr. Smith bought a car, for $60,000.50 and paid in full.


In [13]:
from nltk.tokenize import RegexpTokenizer

# Custom tokenizer. The alternatives are tried left to right: a run of word
# characters, then a run of currency symbols / dots, then any other run of
# non-whitespace characters.
pattern = r'\w+|[$€¥.]+|\S+'
custom_tokenizer = RegexpTokenizer(pattern)

text = "User@example.com paid $50.00 for the item!"

custom_tokens = custom_tokenizer.tokenize(text)
print(custom_tokens)

['User', '@example.com', 'paid', '$', '50', '.', '00', 'for', 'the', 'item', '!']


In [15]:
# Tweet tokenization

from nltk.tokenize import TweetTokenizer

# strip_handles drops @mentions; reduce_len shortens long runs of a
# repeated character (see the printed tokens).
tweet_tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)

tweet = "@user This is sooo coooool!!! #NLP #Python https://example.com"

tweet_tokens = tweet_tokenizer.tokenize(tweet)
print(tweet_tokens)

['This', 'is', 'sooo', 'coool', '!', '!', '!', '#NLP', '#Python', 'https://example.com']
