In [1]:
import pandas as pd 
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import spacy
from pprint import pprint
from tqdm import tqdm

# Use tqdm to track progress (enables the .progress_apply calls used later on)
tqdm.pandas()

# Ensure necessary NLTK data packages are downloaded.
# 'punkt_tab' is the tokenizer data that word_tokenize / sent_tokenize load on
# NLTK >= 3.8.2; 'punkt' is still fetched so older NLTK releases keep working.
for nltk_resource in ('punkt', 'punkt_tab', 'vader_lexicon'):
    nltk.download(nltk_resource)

# Initialize sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Initialize spaCy model
nlp = spacy.load('en_core_web_sm')

# Load data
df = pd.read_csv('data/Cleaned_Sports_and_Outdoors.csv')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/wlodzimierrr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/wlodzimierrr/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Feature extraction functions
def word_count(text: str) -> int:
    """Return the number of NLTK word tokens in ``text``.

    Args:
        text: Review text. Non-string values (e.g. ``None`` or the float NaN
            that pandas uses for a missing cell) are treated as empty text.

    Returns:
        The token count produced by ``word_tokenize``, or 0 when ``text`` is
        not a string.
    """
    # A missing CSV cell arrives as float NaN, which the tokenizer cannot handle.
    if not isinstance(text, str):
        return 0
    return len(word_tokenize(text))

def sentence_length(text: str) -> float:
    """Return the average number of word tokens per sentence in ``text``.

    The text is split into sentences with ``sent_tokenize`` and each sentence
    is tokenized with ``word_tokenize``; the result is the total token count
    divided by the number of sentences.

    Args:
        text: Review text. Non-string values (e.g. ``None`` or the float NaN
            that pandas uses for a missing cell) are treated as empty text.

    Returns:
        The mean tokens-per-sentence, or 0 when ``text`` is not a string or
        contains no sentences.
    """
    # A missing CSV cell arrives as float NaN, which the tokenizer cannot handle.
    if not isinstance(text, str):
        return 0
    sentences = sent_tokenize(text)
    # Avoid dividing by zero when the tokenizer finds no sentences.
    if not sentences:
        return 0
    total_tokens = sum(len(word_tokenize(sentence)) for sentence in sentences)
    return total_tokens / len(sentences)

def sentiment_score(text: str) -> float:
    """Return the VADER ``compound`` sentiment score for ``text``.

    Args:
        text: Review text. Non-string values (e.g. ``None`` or the float NaN
            that pandas uses for a missing cell) are scored as neutral.

    Returns:
        The ``'compound'`` entry of ``analyzer.polarity_scores(text)``, or 0.0
        when ``text`` is not a string.
    """
    # A missing CSV cell arrives as float NaN; there is nothing to score.
    if not isinstance(text, str):
        return 0.0
    return analyzer.polarity_scores(text)['compound']

def extract_key_phrases(text: str) -> list:
    """Return the noun chunks spaCy finds in ``text`` as plain strings.

    Args:
        text: Review text. Non-string values (e.g. ``None`` or the float NaN
            that pandas uses for a missing cell) yield no phrases.

    Returns:
        A list with the ``.text`` of every chunk in ``doc.noun_chunks``, in the
        order spaCy yields them; an empty list when ``text`` is not a string.
    """
    # A missing CSV cell arrives as float NaN, which the spaCy pipeline rejects.
    if not isinstance(text, str):
        return []
    doc = nlp(text)
    return [chunk.text for chunk in doc.noun_chunks]

def review_length(text: str) -> int:
    """Return the length of ``text`` in characters.

    Args:
        text: Review text. Non-string values (e.g. ``None`` or the float NaN
            that pandas uses for a missing cell) are treated as empty text.

    Returns:
        ``len(text)``, or 0 when ``text`` is not a string.
    """
    # len() raises TypeError on float NaN, so missing cells are counted as empty.
    if not isinstance(text, str):
        return 0
    return len(text)

def normalize_helpful_votes(helpful_vote, max_votes=10):
    """Scale a helpful-vote count by ``max_votes``.

    Args:
        helpful_vote: Number of helpful votes for a review.
        max_votes: Divisor used for scaling (default 10).

    Returns:
        ``helpful_vote / max_votes`` when ``max_votes`` is positive, else 0.
        The result is not clipped, so counts above ``max_votes`` give values
        greater than 1.
    """
    if max_votes > 0:
        return helpful_vote / max_votes
    # A non-positive divisor cannot be used for scaling.
    return 0

In [3]:
# Apply feature extraction.
# Each text-derived feature maps the review text through one extractor; the
# dict's insertion order fixes the order in which the columns are added.
# NOTE(review): the recorded run shows the key_phrases step taking about 1h06m,
# far longer than any other step; batching with spaCy's nlp.pipe may be worth
# evaluating.
text_feature_extractors = {
    'word_count': word_count,
    'avg_sentence_length': sentence_length,
    'sentiment_score': sentiment_score,
    'key_phrases': extract_key_phrases,
    'review_length': review_length,
}
for feature_name, extractor in text_feature_extractors.items():
    df[feature_name] = df['text'].progress_apply(extractor)

# The vote feature is computed from its own column rather than from the text.
df['normalized_helpful_votes'] = df['helpful_vote'].progress_apply(normalize_helpful_votes)

100%|██████████| 91246/91246 [01:35<00:00, 953.37it/s] 
100%|██████████| 91246/91246 [02:02<00:00, 746.47it/s] 
100%|██████████| 91246/91246 [02:18<00:00, 660.20it/s] 
100%|██████████| 91246/91246 [1:06:00<00:00, 23.04it/s]
100%|██████████| 91246/91246 [00:00<00:00, 313421.33it/s]
100%|██████████| 91246/91246 [00:00<00:00, 368624.98it/s]


In [5]:
# Write the feature-augmented frame to disk, leaving the row index out of the file.
# NOTE(review): list-valued cells such as key_phrases are stored as text in a CSV;
# confirm that downstream readers parse them back into lists.
featured_file_path = 'data/Featured_Sports_and_Outdoors.csv'
df.to_csv(path_or_buf=featured_file_path, index=False)

In [4]:
# Preview the first rows of the feature-augmented dataframe
print(df.head())

# Show every field of the first review as a pretty-printed dict
print("\nFirst review:")
first_review = df.iloc[0].to_dict()
pprint(first_review)

                 timestamp  rating  helpful_vote  \
0  2023-02-25 10:30:53.520     5.0             8   
1  2018-04-07 09:36:11.676     5.0             0   
2  2022-05-26 01:01:59.105     5.0             0   
3  2021-04-07 21:43:31.976     5.0             0   
4  2021-02-10 14:02:26.894     5.0             0   

                               title  \
0                       Crazy comfy!   
1                         Excellent!   
2                   Best saddle pads   
3                        Works great   
4  Great stirrups with bar none grip   

                                                text        asin  \
0  Not gonna lie- they are not much to look at. L...  B07F3BDT8T   
1                                 I love it. Pretty!  B00NXQLFQQ   
2  Huge fan of B Vertigo and this dressage pad do...  B08SVPR266   
3  This was great for a slightly too-short girth!...  B00DV0MKUY   
4  I have to say, the grip on these are pretty gr...  B002HPNBMU   

   verified_purchase                 