# Sentiment polarity
Techniques like TF-IDF and word embeddings help capture semantic relationships and represent text numerically.

In [112]:
# Import necessary libraries
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import os
os.chdir("C:/Sereda/Job/portfolio/Python/Sentiment")
import pandas as pd
import numpy as np
import string # string.punctuation

In [113]:
# Load customer review data
reviews = pd.read_csv('customer_reviews.csv')

  reviews = pd.read_csv('customer_reviews.csv')


In [115]:
reviews = reviews.loc[:,['reviews.text','reviews.doRecommend']] # Review text and sentiment
reviews.shape

(34660, 2)

In [116]:
reviews.head(2)

Unnamed: 0,reviews.text,reviews.doRecommend
0,This product so far has not disappointed. My c...,True
1,great for beginner or experienced person. Boug...,True


This is a list of over 34,000 consumer reviews for Amazon products like the Kindle, Fire TV Stick, and more provided by Datafiniti's Product Database. The dataset includes basic product information, rating, review text, and more for each product.

In [117]:
reviews.isnull().sum()

reviews.text             1
reviews.doRecommend    594
dtype: int64

In [118]:
reviews = reviews.loc[~pd.isnull(reviews['reviews.doRecommend'])] # remove NaN
reviews.shape

(34066, 2)

In [119]:
reviews.isnull().sum()

reviews.text           0
reviews.doRecommend    0
dtype: int64

In [120]:
reviews['reviews.doRecommend'] = reviews['reviews.doRecommend'].astype('int') # {True, False} -> {1, 0}

In [None]:
#nltk.download('punkt')
#nltk.download('stopwords')

In [121]:
# **Data Preprocessing**
def preprocess_text(text):
    if(type(text) == str):
        # Remove punctuation and special characters
        text = re.sub(r'[^\w\s]', '', text)
        # Convert to lowercase
        text = text.lower()
        # Tokenize the text
        tokens = nltk.word_tokenize(text)
        # Remove stop words
        sw = nltk.corpus.stopwords.words('english') # stop words
        for w in ['no','not']: sw.remove(w) # remove 'no' and 'not' from the list of stop words
        tokens = [token for token in tokens if token not in sw]
        # Stem the tokens
        stemmer = nltk.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
        # Join the tokens back into a string
        return ' '.join(tokens)
    else:
        return("absent")

In [122]:
# Apply preprocessing to review text
reviews['preprocessed_text'] = reviews['reviews.text'].apply(preprocess_text)
reviews['preprocessed_text'].loc[len(reviews)-1]

'found fire tv great buy love take everywher note prime tv not work countri oversea netflix still find lot thing watch road rent movi amazon paid no prime unfortun us no problem highli recommend'

In [123]:
# **Feature Engineering**
# Extract TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(reviews['preprocessed_text'])

## Model Training

In [124]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, reviews['reviews.doRecommend'], test_size=0.2)
y_train.unique() # must have both [True, False]

array([1, 0])

In [125]:
y_train = y_train.astype('int')
y_test = y_test.astype('int')
y_train.unique()

array([1, 0])

In [126]:
# Train a logistic regression classifier
model = LogisticRegression(solver="liblinear") #solver={'lbfgs', "liblinear", "sag", "saga"}
model.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

In [127]:
# **Model Evaluation**
# Predict sentiment on test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.9619900205459349


## Predictions

In [128]:
# Apply preprocessing to review text
txt = "I do not like this item and I do not recommend it!"
txt_preproc = preprocess_text(txt)
txt_preproc

'not like item not recommend'

In [129]:
txt_vect = vectorizer.transform([txt_preproc])
txt_vect

<1x13448 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [130]:
model.predict(txt_vect) # 1 - recommend

array([0])

In [131]:
# Check if there are any predictions of 0
(model.predict(X_test)==0).sum()

56

## Second Variant

In [97]:
#nltk.download('wordnet')
#nltk.download('omw-1.4')

In [132]:
df = reviews

In [134]:
# Preprocess text data
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = ''.join([char for char in text if not char.isdigit() and not char in string.punctuation])
    # Tokenize words
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    for w in ['no','not']: stopwords.remove(w) # remove 'no' and 'not' from the list of stop words
    tokens = [token for token in tokens if token not in stopwords]
    # Lemmatize words
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

# Apply text preprocessing to review column
df['preprocessed_text'] = df['reviews.text'].apply(preprocess_text)

# Extract relevant features using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['preprocessed_text'])

# Split data into train and test sets
y = df['reviews.doRecommend']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train a machine learning classifier
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate model performance
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.9614029938362195


In [135]:
# Apply preprocessing to review text
txt = "I do not like this item and I do not recommend it!"
txt_preproc = preprocess_text(txt)
txt_preproc

'not like item not recommend'

In [136]:
txt_vect = vectorizer.transform([txt_preproc])
txt_vect

<1x15537 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [137]:
model.predict(txt_vect) # 1 - recommend

array([0])