In [3]:
import pandas as pd

data= { 'Text':[
    'I love this product, it is amazing!',
    'This the worst purchase I have ever made.',
    'It was an okay experience, not too bad.',
    'Absoulately fantastic! I will buy again.',
    'Terrible, the qulality is awful, very disappointed.',
]}

df=pd.DataFrame(data)
df

Unnamed: 0,Text
0,"I love this product, it is amazing!"
1,This the worst purchase I have ever made.
2,"It was an okay experience, not too bad."
3,Absoulately fantastic! I will buy again.
4,"Terrible, the qulality is awful, very disappoi..."


    Text Preprocessing
    Text preprocessing is the first and crucial step. It involves cleaning the text and preparing it for further processing. Common preprocessing tasks include:

    Lowercasing: Convert all the text to lowercase to standardize.

    Remove punctuation, special characters, and numbers: Clean the text by removing unnecessary characters.

    Remove extra whitespaces: Strip leading/trailing spaces and reduce multiple spaces between words.

In [None]:
import string

# Text Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = ''.join([char for char in text if char not in string.punctuation])
    text = ' '.join(text.split())  # Remove extra spaces
    return text

df['Processed_Text'] = df['Text'].apply(preprocess_text)
print()
df[['Text', 'Processed_Text']]




Unnamed: 0,Text,Processed_Text
0,"I Love this product, it is amazing!",i love this product it is amazing
1,This is the worst purchase I have ever made.,this is the worst purchase i have ever made
2,"It was an okay experience, not too bad.",it was an okay experience not too bad
3,Absolutely fantastic! I will buy again.,absolutely fantastic i will buy again
4,"Terrible, the quality is awful, very disappoin...",terrible the quality is awful very disappointed


    Tokenization
    Tokenization involves splitting text into individual words or tokens. It is a crucial step as the model works with individual tokens rather than raw text.

In [None]:
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

#Tokenization
df['Tokens']= df['Processed_Text'].apply(word_tokenize)
print()
df[['Processed_Text','Tokens']]




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,Processed_Text,Tokens
0,i love this product it is amazing,"[i, love, this, product, it, is, amazing]"
1,this is the worst purchase i have ever made,"[this, is, the, worst, purchase, i, have, ever..."
2,it was an okay experience not too bad,"[it, was, an, okay, experience, not, too, bad]"
3,absolutely fantastic i will buy again,"[absolutely, fantastic, i, will, buy, again]"
4,terrible the quality is awful very disappointed,"[terrible, the, quality, is, awful, very, disa..."


In [None]:
Stopword Removal
Stopwords are common words that don't carry significant meaning and can be safely removed from the text. Common stopwords include 'the', 'is', 'in', 'a', etc

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stopword removal
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

df['Tokens_No_Stopwords'] = df['Tokens'].apply(remove_stopwords)
#print(df[['Tokens', 'Tokens_No_Stopwords']])
df[['Tokens', 'Tokens_No_Stopwords']]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Tokens,Tokens_No_Stopwords
0,"[i, love, this, product, it, is, amazing]","[love, product, amazing]"
1,"[this, is, the, worst, purchase, i, have, ever...","[worst, purchase, ever, made]"
2,"[it, was, an, okay, experience, not, too, bad]","[okay, experience, bad]"
3,"[absolutely, fantastic, i, will, buy, again]","[absolutely, fantastic, buy]"
4,"[terrible, the, quality, is, awful, very, disa...","[terrible, quality, awful, disappointed]"


In [None]:
Stemming/Lemmatization
Stemming and lemmatization reduce words to their root form. Stemming is a more aggressive process, while lemmatization considers the context and produces meaningful base forms of words.

In [None]:
from nltk.stem import PorterStemmer

# Stemming
stemmer = PorterStemmer()

def stem_tokens(tokens):
    return [stemmer.stem(word) for word in tokens]

df['Stemmed_Tokens'] = df['Tokens_No_Stopwords'].apply(stem_tokens)
#print(df[['Tokens_No_Stopwords', 'Stemmed_Tokens']])
df[['Tokens_No_Stopwords', 'Stemmed_Tokens']]

Unnamed: 0,Tokens_No_Stopwords,Stemmed_Tokens
0,"[love, product, amazing]","[love, product, amaz]"
1,"[worst, purchase, ever, made]","[worst, purchas, ever, made]"
2,"[okay, experience, bad]","[okay, experi, bad]"
3,"[absolutely, fantastic, buy]","[absolut, fantast, buy]"
4,"[terrible, quality, awful, disappointed]","[terribl, qualiti, aw, disappoint]"


In [None]:
Bag of Words (BOW)
The Bag of Words (BOW) model represents text data as a matrix of token counts. Each unique token from the corpus is a feature.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# BOW Model
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df['Processed_Text'])
#print(pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out()))
df_final = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())
df_final

Unnamed: 0,absolutely,again,amazing,an,awful,bad,buy,disappointed,ever,experience,...,purchase,quality,terrible,the,this,too,very,was,will,worst
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,1,0,0,1,1,0,0,0,0,1
2,0,0,0,1,0,1,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0
3,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,1,0,0,1,0,0,...,0,1,1,1,0,0,1,0,0,0


In [None]:
TF-IDF (Term Frequency - Inverse Document Frequency)
TF-IDF is a statistical measure used to evaluate how important a word is to a document in a collection or corpus. It assigns higher weights to words that are frequent in a specific document but rare across other documents.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Model
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['Processed_Text'])
#print(pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out()))
tf_data = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tf_data

Unnamed: 0,absolutely,again,amazing,an,awful,bad,buy,disappointed,ever,experience,...,purchase,quality,terrible,the,this,too,very,was,will,worst
0,0.0,0.0,0.458815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.370169,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38489,0.0,...,0.38489,0.0,0.0,0.310527,0.310527,0.0,0.0,0.0,0.0,0.38489
2,0.0,0.0,0.0,0.361529,0.0,0.361529,0.0,0.0,0.0,0.361529,...,0.0,0.0,0.0,0.0,0.0,0.361529,0.0,0.361529,0.0,0.0
3,0.447214,0.447214,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0
4,0.0,0.0,0.0,0.0,0.404907,0.0,0.0,0.404907,0.0,0.0,...,0.0,0.404907,0.404907,0.326676,0.0,0.0,0.404907,0.0,0.0,0.0


In [None]:
tf_data.reset_index()

Unnamed: 0,index,absolutely,again,amazing,an,awful,bad,buy,disappointed,ever,...,purchase,quality,terrible,the,this,too,very,was,will,worst
0,0,0.0,0.0,0.458815,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.370169,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38489,...,0.38489,0.0,0.0,0.310527,0.310527,0.0,0.0,0.0,0.0,0.38489
2,2,0.0,0.0,0.0,0.361529,0.0,0.361529,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.361529,0.0,0.361529,0.0,0.0
3,3,0.447214,0.447214,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0
4,4,0.0,0.0,0.0,0.0,0.404907,0.0,0.0,0.404907,0.0,...,0.0,0.404907,0.404907,0.326676,0.0,0.0,0.404907,0.0,0.0,0.0


In [None]:
tf_data

Unnamed: 0,absolutely,again,amazing,an,awful,bad,buy,disappointed,ever,experience,...,purchase,quality,terrible,the,this,too,very,was,will,worst
0,0.0,0.0,0.458815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.370169,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38489,0.0,...,0.38489,0.0,0.0,0.310527,0.310527,0.0,0.0,0.0,0.0,0.38489
2,0.0,0.0,0.0,0.361529,0.0,0.361529,0.0,0.0,0.0,0.361529,...,0.0,0.0,0.0,0.0,0.0,0.361529,0.0,0.361529,0.0,0.0
3,0.447214,0.447214,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0
4,0.0,0.0,0.0,0.0,0.404907,0.0,0.0,0.404907,0.0,0.0,...,0.0,0.404907,0.404907,0.326676,0.0,0.0,0.404907,0.0,0.0,0.0


In [None]:
X_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34 stored elements and shape (5, 29)>

In [None]:
Skipgram (Word2Vec)
The Skipgram model is used to train word embeddings, where a word is represented in vector form. If you're working on word embeddings, the Skipgram model from Word2Vec can help. This is more advanced and can be done using libraries like gensim.

In [None]:
!pip install gensim



In [None]:
# pip install gensim

In [None]:
from gensim.models import Word2Vec
#Training skipgram models(Word2Vec)
model=Word2Vec(df['Tokens_No_Stopwords'],vector_size=50,window=3,min_count=1,sg=1)
model.save("skipgram.model")

#view vector for the word "product"
print(len(model.wv['product']))



50


In [None]:
print(model.wv['product'])

[-0.0173568  -0.00289396  0.01895893 -0.01509898 -0.0107162   0.01863313
 -0.01794745  0.00765182  0.00133088  0.0133214   0.01662551 -0.00570157
 -0.00798463  0.01779583  0.00417929  0.01249788 -0.01889143  0.01918025
 -0.00269662 -0.01210423  0.00598507 -0.00091322  0.00941299 -0.00456604
 -0.00827568  0.0045558   0.01670877 -0.00999121  0.00533736 -0.01598111
 -0.01354669 -0.00093534 -0.01753546  0.00557888  0.00319719 -0.00463938
  0.01000758  0.01949757  0.01690854 -0.00376045  0.0041163  -0.00800738
 -0.01648281  0.01255591 -0.00389836 -0.00133241 -0.00354266 -0.00907133
  0.00812342 -0.00854036]


# This step is not mandatory for all sentiment classification tasks, but it helps capture deeper semantic meaning in the words.



In [None]:
Hugging Face Transformers (e.g., BERT, RoBERTa)
Hugging Face provides a variety of pre-trained models for sentiment analysis, including BERT, DistilBERT, RoBERTa, and more. These models are very powerful for NLP tasks and can be used to classify sentiment with high accuracy

In [None]:
#!pip install transformers

In [None]:
from transformers import pipeline

# Load a pre-trained sentiment analysis model
sentiment_model = pipeline('sentiment-analysis')

# Provide new text data for sentiment analysis
text = "I love this product! It's amazing."

# Get the sentiment label (positive/negative) and score
result = sentiment_model(text)

print(result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998866319656372}]


In [None]:
# Provide new text data for sentiment analysis
text = "I hate this product!."

# Get the sentiment label (positive/negative) and score
result = sentiment_model(text)

print(result)

[{'label': 'NEGATIVE', 'score': 0.999757707118988}]


In [None]:
# Provide new text data for sentiment analysis
text = "It is having average quality"

# Get the sentiment label (positive/negative) and score
result = sentiment_model(text)

print(result)

[{'label': 'POSITIVE', 'score': 0.9835952520370483}]


In [None]:
VADER (Valence Aware Dictionary and sEntiment Reasoner)
VADER is a simple and fast tool for sentiment analysis. It works well on social media texts, short sentences, and is pre-trained to handle sentiment classification.

In [None]:
!pip install vaderSentiment



In [None]:

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Provide some text for analysis
text = "I absolutely hate waiting in long lines!"

# Get the sentiment scores
scores = analyzer.polarity_scores(text)

# Classify sentiment as positive, negative, or neutral
if scores['compound'] >= 0.05:
    sentiment = 'positive'
elif scores['compound'] <= -0.05:
    sentiment = 'negative'
else:
    sentiment = 'neutral'

print(f"Sentiment: {sentiment}")

Sentiment: negative


In [None]:
scores

{'neg': 0.417, 'neu': 0.583, 'pos': 0.0, 'compound': -0.6468}

In [None]:
# Provide some text for analysis
text = "I absolutely love waiting in long lines!"

# Get the sentiment scores
scores = analyzer.polarity_scores(text)

# Classify sentiment as positive, negative, or neutral
if scores['compound'] >= 0.05:
    sentiment = 'positive'
elif scores['compound'] <= -0.05:
    sentiment = 'negative'
else:
    sentiment = 'neutral'

print(f"Sentiment: {sentiment}")


Sentiment: positive


In [None]:
scores

{'neg': 0.0, 'neu': 0.556, 'pos': 0.444, 'compound': 0.6989}

In [None]:
TextBlob
TextBlob is another popular NLP library with built-in sentiment analysis. It uses a simple rule-based approach to assign a sentiment polarity score and subjectivity score.

In [None]:
#!pip install textblob

In [None]:
from textblob import TextBlob

# Provide the text for analysis
text = "I like my new phone!"

# Create a TextBlob object
blob = TextBlob(text)

# Get the polarity (sentiment score)
polarity = blob.sentiment.polarity

# Classify sentiment
if polarity > 0:
    sentiment = 'positive'
elif polarity < 0:
    sentiment = 'negative'
else:
    sentiment = 'neutral'

print(f"Sentiment: {sentiment}")

Sentiment: positive


In [None]:
blob

TextBlob("I like my new phone!")

In [None]:
polarity

0.17045454545454544

In [None]:
Transformers + Hugging Face (for multi-class classification: Positive, Negative, Neutral)
For more detailed classification (like positive, negative, and neutral), you can also use models fine-tuned specifically for multi-class sentiment classification

In [None]:
from transformers import pipeline

# Load a pre-trained sentiment analysis model
sentiment_model = pipeline('text-classification', model='j-hartmann/emotion-english-distilroberta-base')

# Provide text for analysis
text = "I'm feeling so bad today!"

# Get the result
result = sentiment_model(text)

print(result)

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


[{'label': 'sadness', 'score': 0.978950560092926}]


In [None]:
# Provide text for analysis
text = "i will beat you up"

# Get the result
result = sentiment_model(text)

print(result)

[{'label': 'joy', 'score': 0.686745822429657}]


In [None]:
To achieve your goal of providing new data and getting sentiment classification (positive, negative, neutral), you can wrap any of the pre-trained models mentioned above into a function.

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

def classify_sentiment(text):
    # Get the sentiment scores
    scores = analyzer.polarity_scores(text)

    # Classify sentiment as positive, negative, or neutral
    if scores['compound'] >= 0.05:
        return 'positive'
    elif scores['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Example usage with new data
new_text = "I'm having a great day!"
sentiment = classify_sentiment(new_text)
print(f"Sentiment: {sentiment}")

Sentiment: positive


In [None]:
Sentiment Classification

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Sample DataFrame
data = {
    'Text': [
        'I love this product, it is amazing!',
        'This is the worst purchase I have ever made.',
        'It was an okay experience, not too bad.',
        'Absolutely fantastic! I will buy again.',
        'Terrible, the quality is awful, very disappointed.'
    ],
    'Sentiment': ['positive', 'negative', 'neutral', 'positive', 'negative']
}

df = pd.DataFrame(data)
df

Unnamed: 0,Text,Sentiment
0,"I love this product, it is amazing!",positive
1,This is the worst purchase I have ever made.,negative
2,"It was an okay experience, not too bad.",neutral
3,Absolutely fantastic! I will buy again.,positive
4,"Terrible, the quality is awful, very disappoin...",negative


In [None]:
# TF-IDF Transformation
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['Text'])
X_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34 stored elements and shape (5, 29)>

In [None]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['Sentiment'], test_size=0.2, random_state=42)

In [None]:
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 26 stored elements and shape (4, 29)>

In [None]:
X_test

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8 stored elements and shape (1, 29)>

In [None]:
y_train

Unnamed: 0,Sentiment
4,negative
2,neutral
0,positive
3,positive


In [None]:
y_test

Unnamed: 0,Sentiment
1,negative


In [None]:
# Train a logistic Regression model
model = LogisticRegression()
model.fit(X_train,y_train)

#Evaluate the model on the test data
y_pred=model.predict(X_test)
print(classification_report(y_test,y_pred))

#Classify new text
new_text=["I absoulately love this! It's wonderful"]
new_text_tfidf= tfidf_vectorizer.transform(new_text)
sentiment= model.predict(new_text_tfidf)

print(f"The sentiment of the new text is: {sentiment[0]}")

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       1.0
    positive       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0

The sentiment of the new text is: positive


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
