In [None]:
#1. NLP Preprocessing Techniques Implementation
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Sample text
text = "Natural Language Processing (NLP) is an exciting field of study!"

# 1. Tokenization
tokens = word_tokenize(text)
print("Tokens:", tokens)

# 2. Punctuation Removal
tokens = [word for word in tokens if word not in string.punctuation]
print("Tokens after punctuation removal:", tokens)

# 3. Stop Words Removal
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word.lower() not in stop_words]
print("Tokens after stop words removal:", tokens)

# 4. Stemming
ps = PorterStemmer()
stemmed_tokens = [ps.stem(word) for word in tokens]
print("Stemmed Tokens:", stemmed_tokens)

# 5. Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
print("Lemmatized Tokens:", lemmatized_tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'an', 'exciting', 'field', 'of', 'study', '!']
Tokens after punctuation removal: ['Natural', 'Language', 'Processing', 'NLP', 'is', 'an', 'exciting', 'field', 'of', 'study']
Tokens after stop words removal: ['Natural', 'Language', 'Processing', 'NLP', 'exciting', 'field', 'study']
Stemmed Tokens: ['natur', 'languag', 'process', 'nlp', 'excit', 'field', 'studi']
Lemmatized Tokens: ['Natural', 'Language', 'Processing', 'NLP', 'exciting', 'field', 'study']


In [None]:
#2. Use of Named Entity Recognition (NER) Information Extraction Technique
 !pip install spacy
!python -m spacy download en_core_web_sm

import spacy

# Load the English NLP model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "Barack Obama was the 44th President of the United States. He was born in Hawaii."

# Process the text with spaCy
doc = nlp(text)

# Extract named entities
print("Named Entities, Phrases, and Concepts:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m73.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Named Entities, Phrases, and Concepts:
Barack Obama (PERSON)
44th (ORDINAL)
the United States (GPE)
Hawaii (GPE)


In [1]:
#3. Implement POS Tagging
import nltk
from nltk.tokenize import word_tokenize

# Download required NLTK resources
nltk.download('punkt')  # For tokenization
nltk.download('averaged_perceptron_tagger')  # For POS tagging

# Sample text for POS tagging
text = "The quick brown fox jumps over the lazy dog."

# Step 1: Tokenization
# Tokenizing the text into words
tokens = word_tokenize(text)

# Step 2: POS Tagging
# Performing POS tagging on the tokenized words
pos_tags = nltk.pos_tag(tokens)

# Step 3: Displaying the POS tags
print("Part-of-Speech Tags:")
for word, tag in pos_tags:
    print(f"{word}: {tag}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Part-of-Speech Tags:
The: DT
quick: JJ
brown: NN
fox: NN
jumps: VBZ
over: IN
the: DT
lazy: JJ
dog: NN
.: .


In [None]:
#4. Implement N-Gram Model (Virtual Lab)
import nltk
from nltk import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter

# Download required NLTK resources
nltk.download('punkt')

# Sample text for N-gram modeling
text = "Natural language processing is an exciting field. Natural language understanding is a part of NLP."

# Step 1: Tokenization
# Tokenizing the text into words and converting to lowercase
tokens = word_tokenize(text.lower())  # Lowercasing for uniformity

# Step 2: Create Bigrams
# Generating bigrams from the tokenized words
bigrams = ngrams(tokens, 2)

# Step 3: Count Frequency of Bigrams
# Using Counter to count occurrences of each bigram
bigram_freq = Counter(bigrams)

# Step 4: Display Bigrams and their Frequencies
print("Bigrams and their Frequencies:")
for bigram, freq in bigram_freq.items():
    print(f"{bigram}: {freq}")


Bigrams and their Frequencies:
('natural', 'language'): 2
('language', 'processing'): 1
('processing', 'is'): 1
('is', 'an'): 1
('an', 'exciting'): 1
('exciting', 'field'): 1
('field', '.'): 1
('.', 'natural'): 1
('language', 'understanding'): 1
('understanding', 'is'): 1
('is', 'a'): 1
('a', 'part'): 1
('part', 'of'): 1
('of', 'nlp'): 1
('nlp', '.'): 1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#5. Implement a Code for Aspect Mining
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd

# Download required NLTK resources
nltk.download('vader_lexicon')

# Sample reviews data
reviews = [
    "The food was amazing but the service was slow.",
    "I love the ambiance, but the food was overpriced.",
    "Great service and delicious food!",
    "The restaurant is beautiful, but the wait was too long."
]

# Step 1: Define aspects
aspects = ["food", "service", "ambiance", "wait"]

# Step 2: Initialize Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

# Step 3: Extract aspects and determine sentiment
aspect_sentiment = {}

for review in reviews:
    for aspect in aspects:
        if aspect in review.lower():
            sentiment_score = sia.polarity_scores(review)["compound"]
            sentiment = "positive" if sentiment_score > 0 else "negative" if sentiment_score < 0 else "neutral"
            if aspect not in aspect_sentiment:
                aspect_sentiment[aspect] = []
            aspect_sentiment[aspect].append((review, sentiment))

# Step 4: Display results
for aspect, sentiments in aspect_sentiment.items():
    print(f"Aspects: {aspect}")
    for review, sentiment in sentiments:
        print(f" - Review: '{review}' | Sentiment: {sentiment}")


Aspects: food
 - Review: 'The food was amazing but the service was slow.' | Sentiment: positive
 - Review: 'I love the ambiance, but the food was overpriced.' | Sentiment: positive
 - Review: 'Great service and delicious food!' | Sentiment: positive
Aspects: service
 - Review: 'The food was amazing but the service was slow.' | Sentiment: positive
 - Review: 'Great service and delicious food!' | Sentiment: positive
Aspects: ambiance
 - Review: 'I love the ambiance, but the food was overpriced.' | Sentiment: positive
Aspects: wait
 - Review: 'The restaurant is beautiful, but the wait was too long.' | Sentiment: positive


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [2]:
#6.a) Use of NLP Techniques for Text Summarization
!pip install sumy

import nltk
nltk.download('punkt')

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

# Sample text
text = """
Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction
between computers and humans through natural language. The ultimate goal of NLP is to enable computers to
understand, interpret, and generate human language in a valuable way. NLP has a wide range of applications,
including text analysis, machine translation, sentiment analysis, and chatbots.
"""

parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = LsaSummarizer()

# Generate summary
summary = summarizer(parser.document, 2)  # Number of sentences in summary

print("Original Text:")
print(text)
print("\nSummarized Text:")
for sentence in summary:
    print(sentence)


Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: breadability, docopt
  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Created wheel for breadability: filename=brea

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Original Text:

Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction
between computers and humans through natural language. The ultimate goal of NLP is to enable computers to
understand, interpret, and generate human language in a valuable way. NLP has a wide range of applications,
including text analysis, machine translation, sentiment analysis, and chatbots.


Summarized Text:
Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language.
The ultimate goal of NLP is to enable computers to understand, interpret, and generate human language in a valuable way.


In [None]:
#6.b) Use of NLP Techniques for Text Classification
!pip install scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Sample dataset for classification
data = [
    ("I love programming in Python", "positive"),
    ("Python is great for data science", "positive"),
    ("I dislike bugs in the code", "negative"),
    ("Debugging is frustrating", "negative"),
    ("The syntax of Python is easy to learn", "positive"),
    ("I hate when the code doesn't work", "negative"),
]

# Step 1: Prepare data
texts, labels = zip(*data)

# Step 2: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42)

# Step 3: Create a pipeline with CountVectorizer and Naive Bayes
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Step 4: Train the model
model.fit(X_train, y_train)

# Step 5: Predict the labels for the test set
predicted_labels = model.predict(X_test)

# Step 6: Evaluate the model
accuracy = metrics.accuracy_score(y_test, predicted_labels)
print("\nPredicted Labels:", predicted_labels)
print("Accuracy:", accuracy)



Predicted Labels: ['negative' 'positive']
Accuracy: 0.5


In [None]:
#7. Implement Simple Machine Translation from One Language to Another
!pip install transformers torch
from transformers import MarianMTModel, MarianTokenizer

# Step 1: Define the translation model and tokenizer
model_name = 'Helsinki-NLP/opus-mt-en-fr'  # English to French
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Step 2: Define the text to be translated
text_to_translate = "Machine translation is a fascinating field of artificial intelligence."

# Step 3: Tokenize the input text
tokenized_text = tokenizer(text_to_translate, return_tensors="pt")

# Step 4: Perform the translation
translated_tokens = model.generate(**tokenized_text)

# Step 5: Decode the translated tokens
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# Output the results
print("Original Text:", text_to_translate)
print("Translated Text:", translated_text)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Original Text: Machine translation is a fascinating field of artificial intelligence.
Translated Text: La traduction automatique est un domaine fascinant de l'intelligence artificielle.


In [None]:
#8. Implement Sentiment Analysis Technique for Classifying Data into Positive, Negative, or Neutral Classes
!pip install scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Sample dataset for sentiment analysis
data = [
    ("I love this product!", "positive"),
    ("This is the worst experience I ever had.", "negative"),
    ("It's okay, neither good nor bad.", "neutral"),
    ("Absolutely fantastic service!", "positive"),
    ("I wouldn't recommend this to anyone.", "negative"),
    ("Just average, nothing special.", "neutral"),
]

# Step 1: Prepare data
texts, labels = zip(*data)

# Step 2: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42)

# Step 3: Create a pipeline with CountVectorizer and Naive Bayes
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Step 4: Train the model
model.fit(X_train, y_train)

# Step 5: Predict the labels for the test set
predicted_labels = model.predict(X_test)

# Step 6: Evaluate the model
accuracy = metrics.accuracy_score(y_test, predicted_labels)
print("\nPredicted Labels:", predicted_labels)
print("Accuracy:", accuracy)

# Step 7: Display actual vs predicted labels
for text, actual, predicted in zip(X_test, y_test, predicted_labels):
    print(f"Text: '{text}' | Actual: {actual} | Predicted: {predicted}")



Predicted Labels: ['negative' 'negative']
Accuracy: 0.5
Text: 'I love this product!' | Actual: positive | Predicted: negative
Text: 'This is the worst experience I ever had.' | Actual: negative | Predicted: negative


In [None]:
#9. Tokenize a Text Using the transformers Package and Translate the Text Using Simple Transformers
!pip install transformers torch
from transformers import MarianMTModel, MarianTokenizer

# Step 1: Define the translation model and tokenizer
model_name = 'Helsinki-NLP/opus-mt-en-fr'  # Model for English to French translation
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Step 2: Define the text to be translated
text_to_translate = "The weather today is beautiful and sunny."

# Step 3: Tokenize the input text
tokenized_text = tokenizer(text_to_translate, return_tensors="pt")

# Output tokenized input
print("Tokenized Input IDs:", tokenized_text['input_ids'])
print("Tokenized Attention Mask:", tokenized_text['attention_mask'])

# Step 4: Perform the translation
translated_tokens = model.generate(**tokenized_text)

# Step 5: Decode the translated tokens
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# Output the results
print("\nOriginal Text:", text_to_translate)
print("Translated Text:", translated_text)






Tokenized Input IDs: tensor([[   35,  6384,  1394,    32,  3400,    10, 30651,     3,     0]])
Tokenized Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])

Original Text: The weather today is beautiful and sunny.
Translated Text: Le temps d'aujourd'hui est beau et ensoleillé.


In [3]:
!pip install gensim
!pip install transformers
from transformers import pipeline

# Initialize the Hugging Face summarization pipeline
summarizer = pipeline("summarization")

# Sample text for summarization
text = """
Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction
between computers and humans through natural language. The ultimate goal of NLP is to enable computers to
understand, interpret, and generate human language in a valuable way. NLP has a wide range of applications,
including text analysis, machine translation, sentiment analysis, and chatbots.
"""

# Perform summarization
summary = summarizer(text, max_length=50, min_length=25, do_sample=False)

# Output the results
print("Original Text:")
print(text)
print("\nSummarized Text:")
print(summary[0]['summary_text'])



No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



Original Text:

Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction 
between computers and humans through natural language. The ultimate goal of NLP is to enable computers to 
understand, interpret, and generate human language in a valuable way. NLP has a wide range of applications, 
including text analysis, machine translation, sentiment analysis, and chatbots.


Summarized Text:
 Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language . The ultimate goal of NLP is to enable computers to understand, interpret, and generate human


# New Section