# NLP BASICS

### Tokenization,Stopword Removal,Stemming

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

# Sample text for processing
text = "Natural Language Processing is a subfield of artificial intelligence that focuses on the interaction between computers and human language."

# Tokenization: Break the text into words or tokens
tokens = word_tokenize(text)

# Stopword Removal: Remove common words (e.g., 'the', 'is') that don't carry much meaning
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# Stemming: Reduce words to their root form using the Porter Stemmer
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

# Print the results
print("Original Text:")
print(text)
print("\nTokenization:")
print(tokens)
print("\nStopword Removal:")
print(filtered_tokens)
print("\nStemming:")
print(stemmed_tokens)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rgc11\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rgc11\AppData\Roaming\nltk_data...


Original Text:
Natural Language Processing is a subfield of artificial intelligence that focuses on the interaction between computers and human language.

Tokenization:
['Natural', 'Language', 'Processing', 'is', 'a', 'subfield', 'of', 'artificial', 'intelligence', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'human', 'language', '.']

Stopword Removal:
['Natural', 'Language', 'Processing', 'subfield', 'artificial', 'intelligence', 'focuses', 'interaction', 'computers', 'human', 'language', '.']

Stemming:
['natur', 'languag', 'process', 'subfield', 'artifici', 'intellig', 'focus', 'interact', 'comput', 'human', 'languag', '.']


[nltk_data]   Unzipping corpora\stopwords.zip.


In [3]:
# pip install textblob

Defaulting to user installation because normal site-packages is not writeable
Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
     -------------------------------------- 636.8/636.8 kB 1.9 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.17.1
Note: you may need to restart the kernel to use updated packages.


In [6]:
# !python -m textblob.download_corpora


Finished.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\rgc11\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rgc11\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rgc11\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rgc11\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\rgc11\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\conll2000.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\rgc11\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


# SENTIMENTAL ANALYSIS - SENTENCE IS POSITIVE OR NEGATIVE

In [9]:
from textblob import TextBlob

# Sample text for sentiment analysis
text = "I ordered four products on Flipkart on the 12th of September 2023. However, my orders were never delivered. When I checked the order status on the website it shows that my orders were delivered when I in fact never received anything from Flipkart. I tried contacting Flipkart on several communication platforms to no avail. What is even worse is that my money was deducted. The customer service is horrible, and if I could rate my experience below 1 star, I would"

# Create a TextBlob object
blob = TextBlob(text)

# Perform sentiment analysis
sentiment = blob.sentiment

# Interpret the sentiment
if sentiment.polarity > 0:
    sentiment_label = "Positive"
elif sentiment.polarity < 0:
    sentiment_label = "Negative"
else:
    sentiment_label = "Neutral"

# Print the results
print("Text:", text)
print("Sentiment:", sentiment_label)
print("Polarity (Positive/Negative):", sentiment.polarity)
print("Subjectivity (Objective/Subjective):", sentiment.subjectivity)


Text: I ordered four products on Flipkart on the 12th of September 2023. However, my orders were never delivered. When I checked the order status on the website it shows that my orders were delivered when I in fact never received anything from Flipkart. I tried contacting Flipkart on several communication platforms to no avail. What is even worse is that my money was deducted. The customer service is horrible, and if I could rate my experience below 1 star, I would
Sentiment: Negative
Polarity (Positive/Negative): -0.4666666666666666
Subjectivity (Objective/Subjective): 0.5333333333333333


# code for text classification -positive or negative

In [10]:
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Sample dataset (you can replace this with your own dataset)
reviews = [
    ("I loved this movie, it's fantastic!", "positive"),
    ("Terrible movie, waste of time.", "negative"),
    ("The acting was great, but the plot was weak.", "negative"),
    ("Highly recommended! Must watch!", "positive"),
    ("The story was engaging, and the characters were well-developed.", "positive"),
    ("I couldn't finish watching this. It was that bad.", "negative"),
    ("A masterpiece of filmmaking. I was moved to tears.", "positive"),
    ("The special effects were impressive, but the dialogue was cheesy.", "negative"),
    ("It's an average movie, nothing special.", "neutral"),
    ("This film exceeded my expectations. I'm impressed.", "positive"),
    ("Boring and predictable. I wouldn't recommend it.", "negative"),
    ("I have mixed feelings about this movie.", "neutral"),
    ("An instant classic! I'll watch it again.", "positive"),
    ("The plot twists were unexpected and thrilling.", "positive"),
    ("I expected better. It was a letdown.", "negative"),
    ("A fun and entertaining movie for the whole family.", "positive"),
    ("I fell asleep during this movie. It was that boring.", "negative"),
]


# Separate text and labels
texts, labels = zip(*reviews)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Vectorize the text using CountVectorizer
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train a classifier (in this case, a Naive Bayes classifier)
classifier = MultinomialNB()
classifier.fit(X_train_vec, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test_vec)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.25
Classification Report:
               precision    recall  f1-score   support

    negative       0.33      0.50      0.40         2
    positive       0.00      0.00      0.00         2

    accuracy                           0.25         4
   macro avg       0.17      0.25      0.20         4
weighted avg       0.17      0.25      0.20         4



# IDENTIFY NAME, PLACE, ORGANISATION NAME FROM TEXT ENTERED USING SPACY

In [12]:
# pip install spacy


Defaulting to user installation because normal site-packages is not writeable
Collecting spacy
  Downloading spacy-3.6.1-cp39-cp39-win_amd64.whl (12.1 MB)
     ---------------------------------------- 12.1/12.1 MB 5.4 MB/s eta 0:00:00
Collecting thinc<8.2.0,>=8.1.8
  Downloading thinc-8.1.12-cp39-cp39-win_amd64.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 4.7 MB/s eta 0:00:00
Collecting spacy-legacy<3.1.0,>=3.0.11
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting typer<0.10.0,>=0.3.0
  Using cached typer-0.9.0-py3-none-any.whl (45 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.9-cp39-cp39-win_amd64.whl (122 kB)
     -------------------------------------- 122.7/122.7 kB 7.0 MB/s eta 0:00:00
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.10-cp39-cp39-win_amd64.whl (25 kB)
Collecting pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4
  Downloading pydantic-2.4.2-py3-none-any.whl (395 kB)
     --------------------------



In [13]:
# !python -m spacy download en_core_web_sm


Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 5.8 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.6.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [14]:
import spacy

# Load the spaCy model for NER (you can choose other models depending on your language)
nlp = spacy.load("en_core_web_sm")

# Sample text for NER
text = "Apple Inc. was founded by Steve Jobs in Cupertino, California on April 1, 1976."

# Process the text with spaCy's NER model
doc = nlp(text)

# Extract named entities
entities = [(ent.text, ent.label_) for ent in doc.ents]

# Print the named entities and their labels
for entity, label in entities:
    print(f"Entity: {entity}, Label: {label}")


Entity: Apple Inc., Label: ORG
Entity: Steve Jobs, Label: PERSON
Entity: Cupertino, Label: GPE
Entity: California, Label: GPE
Entity: April 1, 1976, Label: DATE


# EMOTION DETECTION IN TEXT

In [15]:
pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
     ---------------------------------------- 7.6/7.6 MB 5.5 MB/s eta 0:00:00
Collecting safetensors>=0.3.1
  Downloading safetensors-0.3.3-cp39-cp39-win_amd64.whl (266 kB)
     -------------------------------------- 266.4/266.4 kB 8.3 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.15.1
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
     ------------------------------------- 295.0/295.0 kB 17.8 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-win_amd64.whl (3.5 MB)
     ---------------------------------------- 3.5/3.5 MB 6.9 MB/s eta 0:00:00
Installing collected packages: tokenizers, safetensors, huggingface-hub, transformers
Successfully installed huggingface-hub-0.17.3 safetensors-0.3.3 tokenizers-0.13.3 transformers-4.33.3
Note: you may need to rest



In [17]:
pip install torch


Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Downloading torch-2.0.1-cp39-cp39-win_amd64.whl (172.4 MB)
     -------------------------------------- 172.4/172.4 MB 2.8 MB/s eta 0:00:00
Installing collected packages: torch
Successfully installed torch-2.0.1
Note: you may need to restart the kernel to use updated packages.




In [18]:
pip install torch torchvision torchaudio


Defaulting to user installation because normal site-packages is not writeable
Collecting torchvision
  Downloading torchvision-0.15.2-cp39-cp39-win_amd64.whl (1.2 MB)
     ---------------------------------------- 1.2/1.2 MB 2.4 MB/s eta 0:00:00
Collecting torchaudio
  Downloading torchaudio-2.0.2-cp39-cp39-win_amd64.whl (2.1 MB)
     ---------------------------------------- 2.1/2.1 MB 3.3 MB/s eta 0:00:00
Installing collected packages: torchvision, torchaudio
Successfully installed torchaudio-2.0.2 torchvision-0.15.2
Note: you may need to restart the kernel to use updated packages.


In [19]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.nn.functional import softmax

# Load the pre-trained DistilBERT model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

# Define the emotion labels
emotions = ["joy", "anger", "sadness", "surprise", "fear", "disgust"]

# Input text
text = "I'm so happy to see you!"

# Tokenize and encode the text
input_ids = tokenizer.encode(text, truncation=True, padding=True, return_tensors="pt")

# Perform inference and get emotion probabilities
with torch.no_grad():
    outputs = model(input_ids)
    probs = softmax(outputs.logits, dim=1)[0]

# Map probabilities to emotions
emotion_predictions = {emotion: prob.item() for emotion, prob in zip(emotions, probs)}

# Print the predicted emotions and their probabilities
for emotion, prob in emotion_predictions.items():
    print(f"{emotion.capitalize()}: {prob:.2f}")


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Joy: 0.55
Anger: 0.45
