In [10]:
# Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
import pandas as pd
import string
from textblob import TextBlob
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Download spaCy model
!python -m spacy download en_core_web_sm

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Example paragraph
paragraph = """
Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language. 
The ultimate goal of NLP is to enable computers to understand, interpret, and generate human language in a way that is both meaningful and useful. 
Sentiment analysis is a common application of NLP that aims to determine the attitude or emotion expressed in a piece of text.
"""

# 1. Tokenization
tokens = word_tokenize(paragraph)
print("Tokens:", tokens)

# 2. Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
print("\nLemmatized Tokens:", lemmatized_tokens)

# 3. Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in tokens]
print("\nStemmed Tokens:", stemmed_tokens)

# 4. Stop Words Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
print("\nFiltered Tokens:", filtered_tokens)

# 5. Bag of Words (BoW)
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform([paragraph])
print("\nBag of Words Matrix:\n", bow_matrix.toarray())
print("BoW Feature Names:", vectorizer.get_feature_names_out())

# 6. TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([paragraph])
print("\nTF-IDF Matrix:\n", tfidf_matrix.toarray())
print("TF-IDF Feature Names:", tfidf_vectorizer.get_feature_names_out())

# 7. Sentiment Lexicon (Using a simple dictionary for illustration)
sentiment_lexicon = {
    "natural": 0,
    "language": 0,
    "processing": 0,
    "artificial": 0,
    "intelligence": 0,
    "focuses": 0,
    "interaction": 0,
    "between": 0,
    "computers": 0,
    "humans": 0,
    "through": 0,
    "goal": 0,
    "enable": 0,
    "understand": 0,
    "interpret": 0,
    "generate": 0,
    "human": 0,
    "way": 0,
    "meaningful": 1,
    "useful": 1,
    "sentiment": 1,
    "analysis": 0,
    "application": 0,
    "aims": 0,
    "determine": 0,
    "attitude": 0,
    "emotion": 1,
    "expressed": 0,
    "piece": 0,
    "text": 0
}
sentiment_score = sum([sentiment_lexicon.get(word.lower(), 0) for word in tokens])
print("\nSentiment Score:", sentiment_score)

# 8. Part of Speech (POS) Tagging
pos_tags = nltk.pos_tag(tokens)
print("\nPOS Tags:", pos_tags)

# 9. Named Entity Recognition (NER)
doc = nlp(paragraph)
entities = [(entity.text, entity.label_) for entity in doc.ents]
print("\nNamed Entities:", entities)

# 10. Word Embeddings (Using spaCy's embeddings)
embeddings = {token.text: token.vector for token in doc}
print("\nWord Embeddings for 'NLP':", embeddings['NLP'])

# 11. Sentiment Polarity (Using TextBlob for illustration)
blob = TextBlob(paragraph)
polarity = blob.sentiment.polarity
print("\nSentiment Polarity:", polarity)

# 12. Subjectivity/Objectivity
subjectivity = blob.sentiment.subjectivity
print("\nSubjectivity:", subjectivity)

# 13. n-grams (Bigrams for illustration)
bigrams = list(nltk.bigrams(tokens))
print("\nBigrams:", bigrams)

# 14. Feature Extraction (Using CountVectorizer for illustration)
vectorizer = CountVectorizer()
features = vectorizer.fit_transform([paragraph])
print("\nFeature Extraction Matrix:\n", features.toarray())
print("Feature Names:", vectorizer.get_feature_names_out())

# 15. Multinomial Logistic Regression for Sentiment Classification
# For illustration, we create a simple dataset
texts = [
    "I love this product, it is amazing and wonderful!",
    "I hate this, it is the worst thing ever.",
    "This is a book about NLP and its applications.",
    "The weather today is sunny and beautiful.",
    "The project is a huge failure and disappointment.",
    "The movie was fantastic and full of excitement.",
    "I do not like the food, it is terrible and bland.",
    "The seminar was very informative and well-organized."
]
labels = ["positive", "negative", "objective", "positive", "negative", "positive", "negative", "positive"]

# Convert texts to TF-IDF features
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(texts)

# Encode labels to numerical values
label_mapping = {"positive": 1, "negative": -1, "objective": 0}
y = [label_mapping[label] for label in labels]

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Multinomial Logistic Regression model
classifier = LogisticRegression(multi_class='multinomial', solver='lbfgs')
classifier.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = classifier.predict(X_test)

# Determine the unique classes in y_test to adjust target_names dynamically
unique_classes = sorted(set(y_test))
target_names = [list(label_mapping.keys())[list(label_mapping.values()).index(cls)] for cls in unique_classes]

print("\nClassification Report:\n", metrics.classification_report(y_test, y_pred, target_names=target_names))

# Predict the sentiment of the example paragraph
paragraph_tfidf = tfidf_vectorizer.transform([paragraph])
paragraph_prediction = classifier.predict(paragraph_tfidf)
reverse_label_mapping = {1: "positive", -1: "negative", 0: "objective"}
print("\nSentiment Prediction for the Paragraph:", reverse_label_mapping[paragraph_prediction[0]])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wacca\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wacca\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wacca\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\wacca\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 217.9 kB/s eta 0:00:59
     --------------------------------------- 0.0/12.8 MB 217.9 kB/s eta 0:00:59
     --------------------------------------- 0.1/12.8 MB 233.8 kB/s eta 0:00:55
     --------------------------------------- 0.1/12.8 MB 308.0 kB/s eta 0:00:42
      -------------------------------------- 0.2/12.8 MB 625.1 kB/s eta 0:00:21
     - -------------------------------------- 0.5/12.8 MB 1.2 MB/s eta 0:00:11
     -- ------------------------------------- 0.8/1

Tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'through', 'natural', 'language', '.', 'The', 'ultimate', 'goal', 'of', 'NLP', 'is', 'to', 'enable', 'computers', 'to', 'understand', ',', 'interpret', ',', 'and', 'generate', 'human', 'language', 'in', 'a', 'way', 'that', 'is', 'both', 'meaningful', 'and', 'useful', '.', 'Sentiment', 'analysis', 'is', 'a', 'common', 'application', 'of', 'NLP', 'that', 'aims', 'to', 'determine', 'the', 'attitude', 'or', 'emotion', 'expressed', 'in', 'a', 'piece', 'of', 'text', '.']

Lemmatized Tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', 'that', 'focus', 'on', 'the', 'interaction', 'between', 'computer', 'and', 'human', 'through', 'natural', 'language', '.', 'The', 'ultimate', 'goal', 'of', 'NLP', 'is', 'to', 'enable', 'comput

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
!python -m spacy download en_core_web_sm
!pip install nltk spacy scikit-learn textblob


Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.7/12.8 MB 23.8 MB/s eta 0:00:01
     ----- ---------------------------------- 1.7/12.8 MB 15.9 MB/s eta 0:00:01
     --------- ------------------------------ 2.9/12.8 MB 16.8 MB/s eta 0:00:01
     ------------ --------------------------- 3.9/12.8 MB 16.4 MB/s eta 0:00:01
     -------------- ------------------------- 4.6/12.8 MB 16.5 MB/s eta 0:00:01
     ----------------- ---------------------- 5.5/12.8 MB 16.7 MB/s eta 0:00:01
     -------------------- ------------------- 6.5/12.8 MB 16.6 MB/s eta 0:00:01
     ---------------------- ----------------- 7.3/12.8 MB 16.6 MB/s eta 0:00:01
     ------------------------- ---

Defaulting to user installation because normal site-packages is not writeable
Collecting textblob
  Obtaining dependency information for textblob from https://files.pythonhosted.org/packages/02/07/5fd2945356dd839974d3a25de8a142dc37293c21315729a41e775b5f3569/textblob-0.18.0.post0-py3-none-any.whl.metadata
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
   --------------------------------------  624.6/626.3 kB 13.4 MB/s eta 0:00:01
   ---------------------------------------- 626.3/626.3 kB 7.9 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.18.0.post0
