In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Fetch every NLTK resource this notebook relies on, in a fixed order
for _nltk_resource in (
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'vader_lexicon',
):
    nltk.download(_nltk_resource)

# Read the review dataset from disk
data = pd.read_csv('amazon.csv')

# Discard every row that has a missing value in any column
data = data.dropna()

# Text preprocessing function
# Lazily-filled holder for the stop-word set and lemmatizer so they are
# built once instead of once per review.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalize one review string for downstream analysis.

    Steps, in order: lowercase, delete every character that is not an
    ASCII letter or whitespace, tokenize, drop English stop words, and
    lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    stop_words, lemmatizer = _preprocess_resources()

    # Characters are deleted (not replaced by a space), so "good.bad"
    # collapses into the single token "goodbad".
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept_tokens = [word for word in word_tokenize(cleaned) if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in kept_tokens)

# Clean every review and store the result in a new 'preprocessed' column
data['preprocessed'] = data['review_content'].map(preprocess_text)

# POS tagging and extracting adjectives and adverbs
def extract_adj_adv(text, tags=('JJ', 'RB')):
    """Keep only the tokens of ``text`` whose POS tag is in ``tags``.

    Parameters
    ----------
    text : str
        Text to tokenize and tag.
    tags : container of str, optional
        POS tags to keep. The default ``('JJ', 'RB')`` matches those two
        tags exactly, so tokens tagged as comparative or superlative
        forms (e.g. ``'JJR'``, ``'JJS'``, ``'RBR'``, ``'RBS'``) are
        dropped unless they are passed explicitly.

    Returns
    -------
    str
        The kept tokens, in their original order, joined by single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    return ' '.join(word for word, tag in tagged_tokens if tag in tags)

# Reduce each cleaned review to its adjective/adverb tokens
data['adj_adv_only'] = data['preprocessed'].map(extract_adj_adv)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [2]:
# Module-level analyzer instance; compute_sentiment_score below reads this
# name at call time, so it must be defined before any scoring runs.
sia = SentimentIntensityAnalyzer()

def compute_sentiment_score(text):
    """Return the 'compound' entry of ``sia.polarity_scores(text)``.

    Uses the module-level ``sia`` analyzer rather than creating a new one.
    """
    return sia.polarity_scores(text)['compound']

# Score the adjective/adverb text of every review
data['sentiment_score'] = data['adj_adv_only'].map(compute_sentiment_score)


In [3]:
# Fit a TF-IDF model on the adjective/adverb text of each review
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['adj_adv_only'])

# Dense TF-IDF weights (one row per review) and the per-review sentiment scores
tfidf_scores = tfidf_matrix.toarray()
sentiment_scores = data['sentiment_score'].to_numpy()

# Weight every term's TF-IDF value by the sentiment score of its review;
# the single-column view of the scores broadcasts across all term columns
combined_scores = tfidf_scores * sentiment_scores.reshape(-1, 1)

# Average each term's weighted score over all reviews and tabulate it
feature_names = tfidf_vectorizer.get_feature_names_out()
combined_scores_mean = combined_scores.mean(axis=0)
combined_df = pd.DataFrame(
    {
        'Term': feature_names,
        'Combined Score': combined_scores_mean,
    }
)

# Rank terms from the highest combined score to the lowest
combined_df_sorted = combined_df.sort_values('Combined Score', ascending=False)

# Show the ten highest-ranked terms
print(combined_df_sorted.head(10))


        Term  Combined Score
2280    good        0.112611
185     also        0.036845
1566    easy        0.035717
6952    well        0.030486
2377   great        0.026684
5011  really        0.026653
3864    nice        0.025657
1900    fast        0.025623
1738    even        0.022593
1980    fine        0.021359
