In [1]:
import nltk

nltk.download('punkt')                         # For tokenizing sentences/words (A pretrained unsupervised sentence tokenizer model)
nltk.download('stopwords')                     # List of common stopwords 
nltk.download('averaged_perceptron_tagger')    # POS tagger
nltk.download('wordnet')                       # Lexical database for lemmatization
nltk.download('maxent_ne_chunker')             # Named Entity Recognition (NER) chunker
nltk.download('words')                         # List of English words (used in NER)
nltk.download('treebank')                      # Penn Treebank corpus: tagged & parsed sentences

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sadin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sadin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sadin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sadin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\sadin\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\sadin\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-

True

## Web Scrapping

In [4]:
!pip install requests beautifulsoup4 nltk matplotlib



In [5]:
url = "https://en.wikipedia.org/wiki/Natural_language_processing"

In [8]:
import requests
from bs4 import BeautifulSoup

In [9]:
response = requests.get(url)

In [10]:
soup = BeautifulSoup(response.content, 'html.parser')

In [15]:
text=" ".join([p.text.strip() for p in soup.find_all('p')])

In [19]:
text[:1000]

'Natural language processing (NLP) is the processing of natural language information by a computer. The study of NLP, a subfield of computer science, is generally associated with artificial intelligence. NLP is related to information retrieval, knowledge representation, computational linguistics, and more broadly with linguistics.[1]\n Major processing tasks in an NLP system include: speech recognition, text classification, natural language understanding, and natural language generation.\n Natural language processing has its roots in the 1950s.[2] Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence. The proposed test includes a task that involves the automated interpretation and generation of natural language.\n The premise of symbolic NLP is well-summarized by John Se

## Text preprocessing

In [20]:
# Import required libraries
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [21]:
# Preprocess text - tokenize,lowercase, remove stopwords, lemmatize
tokens = word_tokenize(text.lower())
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [30]:
tokens_without_stop=[i for i in tokens if i not in stop_words and i.isalpha()]

In [31]:
filtered_tokens=[lemmatizer.lemmatize(i) for i in tokens_without_stop]

In [32]:
filtered_tokens

['natural',
 'language',
 'processing',
 'nlp',
 'processing',
 'natural',
 'language',
 'information',
 'computer',
 'study',
 'nlp',
 'subfield',
 'computer',
 'science',
 'generally',
 'associated',
 'artificial',
 'intelligence',
 'nlp',
 'related',
 'information',
 'retrieval',
 'knowledge',
 'representation',
 'computational',
 'linguistics',
 'broadly',
 'linguistics',
 'major',
 'processing',
 'task',
 'nlp',
 'system',
 'include',
 'speech',
 'recognition',
 'text',
 'classification',
 'natural',
 'language',
 'understanding',
 'natural',
 'language',
 'generation',
 'natural',
 'language',
 'processing',
 'root',
 'already',
 'alan',
 'turing',
 'published',
 'article',
 'titled',
 'computing',
 'machinery',
 'intelligence',
 'proposed',
 'called',
 'turing',
 'test',
 'criterion',
 'intelligence',
 'though',
 'time',
 'articulated',
 'problem',
 'separate',
 'artificial',
 'intelligence',
 'proposed',
 'test',
 'includes',
 'task',
 'involves',
 'automated',
 'interpretation

In [33]:
len(filtered_tokens)

512

In [34]:
len(set(filtered_tokens))

310

## POS Tagging

In [38]:
import random
from nltk.corpus import treebank

# Load treebank corpus and convert to list for shuffling
tagged_sentences = list(treebank.tagged_sents())
random.shuffle(tagged_sentences)
spilt = int(len(tagged_sentences) * 0.8)
train_data = tagged_sentences[:spilt]
test_data = tagged_sentences[spilt:]

print(f"Total sentences: {len(tagged_sentences)}")
print(f"Train sentences: {len(train_data)}")
print(f"Test sentences: {len(test_data)}")

# Optional: Show a sample sentence
print(f"\nSample tagged sentence: {train_data[0]}")

Total sentences: 3914
Train sentences: 3131
Test sentences: 783

Sample tagged sentence: [('Both', 'DT'), ('Newsweek', 'NNP'), ('and', 'CC'), ('U.S.', 'NNP'), ('News', 'NNP'), ('have', 'VBP'), ('been', 'VBN'), ('gaining', 'VBG'), ('circulation', 'NN'), ('in', 'IN'), ('recent', 'JJ'), ('years', 'NNS'), ('without', 'IN'), ('heavy', 'JJ'), ('use', 'NN'), ('of', 'IN'), ('electronic', 'JJ'), ('giveaways', 'NNS'), ('to', 'TO'), ('subscribers', 'NNS'), (',', ','), ('such', 'JJ'), ('as', 'IN'), ('telephones', 'NNS'), ('or', 'CC'), ('watches', 'NNS'), ('.', '.')]
