In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression, LinearRegression
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize



In [None]:
# Download the NLTK 'punkt' / 'punkt_tab' resources
# (presumably needed by the word_tokenize calls further down — TODO confirm).
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Colab-only: mount Google Drive at /content/drive, where the dataset is read from below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the reviews CSV from the mounted Drive folder and report its (rows, columns).
# Local alternative path: '../../datasets/amazon-fine-food-reviews/Reviews.csv'
reviews_csv_path = '/content/drive/MyDrive/datasets/amazon-fine-food-reviews/Reviews.csv'
df = pd.read_csv(reviews_csv_path)
print(df.shape)


In [None]:
# Preview the first rows of the loaded frame.
df.head()

Distribution of Scores

In [None]:
# Bar chart of the number of reviews per 'Score' value, ordered by score.
fig, ax = plt.subplots(figsize=(4, 3))
score_counts = df['Score'].value_counts().sort_index()
score_counts.plot(kind='bar', title='Count of Reviews by Stars', ax=ax)
ax.set_xlabel('Review Stars')
plt.show()

Look at an example text

In [None]:
# Pick the review stored under row label 50 as a running example and show it.
example = df.loc[50, 'Text']
print(example)

Distribution of text length

In [None]:
# Character length of every review text.
lengths = list(map(len, df['Text']))

In [None]:
# Histogram of all review lengths (100 bins); the trailing ';' suppresses the cell's return-value output.
plt.hist(lengths, bins=100);

In [None]:
# Drop the long tail: keep only lengths below 3000 characters for the next plot.
max_plot_length = 3000
lengths = [n_chars for n_chars in lengths if n_chars < max_plot_length]

In [None]:
# Same histogram, now on the lengths below 3000 only.
plt.hist(lengths, bins=100);

In [None]:
# Word-tokenize the example review and show its first ten tokens.
tokens = word_tokenize(example)
tokens[:10]

### Sentiment Analysis with NLTK Vader

NLTK Vader is a simple rule based sentiment analysis model. More information can be found at:

Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
Sentiment Analysis of Social Media Text. Eighth International Conference on
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.

In [None]:
# Download the 'vader_lexicon' resource
# (presumably required by SentimentIntensityAnalyzer below — TODO confirm).
nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Single analyzer instance reused by every polarity_scores call in the notebook.
sia = SentimentIntensityAnalyzer()

In [None]:
# Sanity check: score a clearly positive sentence.
sia.polarity_scores('I am so happy!')

In [None]:
# Sanity check: score a clearly negative sentence.
sia.polarity_scores('This is the worst thing ever.')

In [None]:
# Sanity check: score a sentence where a positive word is negated.
sia.polarity_scores('This is not a good movie.')

In [None]:
# Score the example review printed earlier.
sia.polarity_scores(example)

In [None]:
from collections import Counter
# Work on the first 10,000 rows only.
# (Alternative: a random subset via df.sample(n=10000).)
df_small = df.iloc[:10000].copy()
# Binarise the rating: scores above 3 become 1, everything else becomes 0
# (so a score of exactly 3 is counted as 0).
# The label is derived from the subset itself, so only these rows are
# processed and no index alignment against the full frame is involved.
df_small['Score'] = (df_small['Score'] > 3).astype(int)
# Class balance of the binary label.
Counter(df_small.Score)

In [None]:
# Run VADER on every review of the subset (df_small), keyed by review Id.
# Only the two needed columns are iterated; DataFrame.iterrows() would build
# a full Series object for each row.
res = {
    review_id: sia.polarity_scores(review_text)
    for review_id, review_text in tqdm(
        zip(df_small['Id'], df_small['Text']), total=len(df_small)
    )
}

In [None]:
# res maps Id -> score dict; transposing gives one row per review Id.
vaders = pd.DataFrame(res).T
vaders = vaders.reset_index().rename(columns={'index': 'Id'})
# Attach the review metadata. The join key is named explicitly so the merge
# cannot silently pick up other columns that happen to share a name.
vaders = vaders.merge(df_small, on='Id', how='left')

In [None]:
# Now we have sentiment score and metadata
# (VADER columns followed by the df_small columns joined above).
vaders.head()

In [None]:
# Observed range of the VADER compound score.
print(vaders.compound.max())
print(vaders.compound.min())

In [None]:
# Observed range of the binary label.
print(vaders.Score.max())
print(vaders.Score.min())

In [None]:
def scale_score(x):
    """Linearly rescale x to (x + 1) / 2, so that -1 maps to 0 and 1 maps to 1."""
    shifted = x + 1
    return shifted / 2

# Rescale every compound score so it is comparable with the 0/1 label.
scaled_scores = [scale_score(x) for x in vaders.compound]
# Show a short preview only; printing the full list floods the cell output.
print(scaled_scores[:10])


In [None]:
# Error of the rescaled VADER compound score measured against the binary label.
print("MAE:", mean_absolute_error(vaders.Score, scaled_scores))
print("MSE:", mean_squared_error(vaders.Score, scaled_scores))

For comparison, the following shows the performance of random guessing.

In [None]:
# Seeded generator so the baseline numbers are reproducible across runs.
rng = np.random.default_rng(42)

# Baseline 1: a uniformly random continuous score in [0, 1) for each review.
random_scores = rng.uniform(0, 1, len(vaders))
print("MAE random:", mean_absolute_error(vaders.Score,random_scores))
print("MSE random:", mean_squared_error(vaders.Score, random_scores))

# Baseline 2: a random 0/1 label for each review (upper bound 2 is exclusive).
random_labels = rng.integers(0, 2, len(vaders))
print(classification_report(vaders.Score,random_labels))
print(confusion_matrix(vaders.Score,random_labels))



In [None]:
# Class balance of the binary label in the merged frame.
Counter(vaders.Score)

### Sentiment Analysis with machine learning

In [None]:
# Rebuild the working subset from the first 10,000 rows.
# (Alternative: a random subset via df.sample(n=10000).)
df_small = df.iloc[:10000].copy()
# Binarise the rating: scores above 3 become 1, everything else becomes 0
# (so a score of exactly 3 is counted as 0).
# The label is derived from the subset itself, so only these rows are
# processed and no index alignment against the full frame is involved.
df_small['Score'] = (df_small['Score'] > 3).astype(int)
# Class balance of the binary label.
Counter(df_small.Score)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features: at most 100 terms, ignoring terms that occur in more
# than 10% of the documents.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split performed later, so held-out texts influence the vocabulary — confirm
# whether that is acceptable for this comparison.
tfidf_vectorizer = TfidfVectorizer(max_features=100, max_df=0.1)
review_texts = df_small['Text']
X = tfidf_vectorizer.fit_transform(review_texts)
y = df_small['Score'].to_numpy()

In [None]:
# Inspect the terms the vectorizer kept as features.
tfidf_vectorizer.get_feature_names_out()

In [None]:
# Dimensions of the TF-IDF feature matrix.
X.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Hold out 10% of the rows for testing. The split is seeded so the metrics
# printed below are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Fit the same features twice: once as a classifier of the 0/1 label and once
# as a regressor on that label.
model1 = LGBMClassifier()
model2 = LGBMRegressor()
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)

# Predictions on the held-out rows: class labels from the classifier,
# continuous values from the regressor.
predictions1 = model1.predict(X_test)
predictions2 = model2.predict(X_test)

print("MAE classifier:", mean_absolute_error(y_test, predictions1))
print("MSE classifier:", mean_squared_error(y_test, predictions1))

print(classification_report(y_test, predictions1))
print(confusion_matrix(y_test, predictions1))

print("MAE regressor:", mean_absolute_error(y_test, predictions2))
print("MSE regressor:", mean_squared_error(y_test, predictions2))

In [None]:
# Class balance of the held-out labels.
Counter(y_test)

In [None]:
# NOTE(review): draws one random integer with lower bound -1 and upper bound 2;
# the value is not used by any other cell visible here — looks like leftover scratch.
np.random.randint(-1,2)

Results are better than with the rule-based sentiment analysis. Note that we did not use the entire dataset; more data will improve the performance of the ML classifiers. Also note that this is a simple TF-IDF + ML implementation, so there is room for improvement. Another thing to note is that modeling the problem as a classification problem (as opposed to a regression problem) gave better results in terms of MAE but worse results in terms of MSE. This is reasonable: for the classifier, predicting a wrong class amounts to the same error regardless of the actual value, so the classifier optimizes for predicting the correct class rather than predicting a class which is numerically closer. The regressor, on the other hand, tries to minimize the distance between the actual value and the prediction. Hence we get lower MAE but higher MSE using a classifier.

### Word Embeddings

In [None]:
import sys
!{sys.executable} -m pip install gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader
from nltk.tokenize import RegexpTokenizer

In [None]:
# Load the pre-trained 'glove-twitter-25' vectors through gensim's downloader
# (presumably fetched over the network on first use — TODO confirm).
embeddings = gensim.downloader.load('glove-twitter-25')

In [None]:
import re

def simple_preprocess(text):
    """Normalise raw text to lower-case ASCII letters separated by single spaces.

    Steps, in order: lower-case the text, drop @mentions, drop URLs (anything
    starting with "http" up to the next whitespace), drop every character that
    is neither an ASCII letter nor whitespace, then collapse whitespace runs
    into one space and trim both ends.

    Args:
        text: The input string.

    Returns:
        The cleaned string; empty if nothing survives the filtering.
    """
    text = text.lower()  # convert text to lower-case
    text = re.sub(r"@\w+", "", text)  # remove mentions
    text = re.sub(r"http\S+", "", text)  # remove URLs
    # No extra argument here: the 4th positional parameter of re.sub is
    # `count`, so passing flags in that position capped the number of
    # removals instead of setting any flag. The character class already
    # lists both letter cases explicitly, so no flags are needed.
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # remove non-letters
    text = re.sub(r"\s+", " ", text).strip()  # remove excess whitespace
    return text

In [None]:
def text_to_vector(text, embeddings):
    """Represent a text as the average of the vectors of its known words.

    The text is cleaned with simple_preprocess, split into `\\w+` tokens, and
    every token present in `embeddings` contributes its vector.

    Args:
        text: The raw input string.
        embeddings: Word-vector lookup supporting `token in embeddings`,
            `embeddings[token]` and a `vector_size` attribute.

    Returns:
        The element-wise mean of the found vectors, or a zero vector of
        length `embeddings.vector_size` when no token is in the vocabulary.
    """
    cleaned = simple_preprocess(text)
    words = RegexpTokenizer(r'\w+').tokenize(cleaned)

    # Out-of-vocabulary words are skipped.
    known_vectors = [embeddings[word] for word in words if word in embeddings]

    if not known_vectors:
        return np.zeros(embeddings.vector_size)
    return np.mean(known_vectors, axis=0)



In [None]:
# One averaged word-embedding vector per review.
vectorized_texts = df_small['Text'].map(lambda review: text_to_vector(review, embeddings))

# Stack the per-review vectors into a 2-D feature matrix; labels as a NumPy array.
X = np.array(list(vectorized_texts))
y = df_small['Score'].to_numpy()


In [None]:
# Dimensions of the embedding feature matrix, followed by its contents.
print(X.shape)
X

In [None]:
# Hold out 10% of the rows for testing. The split is seeded so the metrics
# printed below are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Fit the same features twice: once as a classifier of the 0/1 label and once
# as a regressor on that label.
model1 = LGBMClassifier()
model2 = LGBMRegressor()
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)

# Predictions on the held-out rows: class labels from the classifier,
# continuous values from the regressor.
predictions1 = model1.predict(X_test)
predictions2 = model2.predict(X_test)

print("MAE classifier:", mean_absolute_error(y_test, predictions1))
print("MSE classifier:", mean_squared_error(y_test, predictions1))

print(classification_report(y_test, predictions1))
print(confusion_matrix(y_test, predictions1))

print("MAE regressor:", mean_absolute_error(y_test, predictions2))
print("MSE regressor:", mean_squared_error(y_test, predictions2))


### Doc2Vec


In [None]:
# Lower-case and word-tokenize every review of the subset.
texts = list(df_small['Text'])
tokenized_texts = list(map(lambda review: word_tokenize(review.lower()), texts))

In [None]:
# Tokens of the first review, as a quick check of the tokenization.
tokenized_texts[0]

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# Wrap each token list in a TaggedDocument tagged with its position in the list.
documents = [
    TaggedDocument(words=token_list, tags=[position])
    for position, token_list in enumerate(tokenized_texts)
]

In [None]:
# Show the first five tagged documents.
documents[:5]

In [None]:
# Train Doc2Vec on the tagged reviews for 30 epochs; every other setting is left at its default.
# NOTE(review): no seed is passed, so the learned vectors may differ between runs — confirm.
model = Doc2Vec(documents, epochs=30)

In [None]:
def _infer_review_vector(review):
    """Lower-case and tokenize one review, then infer its Doc2Vec vector."""
    return model.infer_vector(word_tokenize(review.lower()))

# One inferred document vector per review.
vectorized_texts = df_small['Text'].map(_infer_review_vector)

# Stack the per-review vectors into a 2-D feature matrix; labels as a NumPy array.
X = np.array(list(vectorized_texts))
y = df_small['Score'].to_numpy()

In [None]:
# Display the Doc2Vec feature matrix.
X

In [None]:
# Hold out 10% of the rows for testing. The split is seeded so the metrics
# printed below are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Fit the same features twice: once as a classifier of the 0/1 label and once
# as a regressor on that label.
model1 = LGBMClassifier()
model2 = LGBMRegressor()
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)

# Predictions on the held-out rows: class labels from the classifier,
# continuous values from the regressor.
predictions1 = model1.predict(X_test)
predictions2 = model2.predict(X_test)

print("MAE classifier:", mean_absolute_error(y_test, predictions1))
print("MSE classifier:", mean_squared_error(y_test, predictions1))

print(classification_report(y_test, predictions1))
print(confusion_matrix(y_test, predictions1))

print("MAE regressor:", mean_absolute_error(y_test, predictions2))
print("MSE regressor:", mean_squared_error(y_test, predictions2))