In [None]:
import ast
import csv
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm


## Web scraping of the review URLs

In [None]:
# Download the reviews for every URL listed in the first column of
# games-regression-dataset.csv and write them to Reviews.csv, one row per URL
# that yielded at least one review: the numeric ID taken from the URL plus the
# list of review texts.
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever

records = []
with open('games-regression-dataset.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # Skip header row (tolerates an empty file)
    for row in reader:
        if not row:  # blank line in the CSV
            continue
        url = row[0]  # URL is in first column
        # The ID is whatever digits appear in the last path component of the URL.
        game_id = re.sub(r'[^\d]+', '', os.path.basename(url))
        try:
            response = requests.get(url + "?see-all=reviews", timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole crawl.
            print(f"Skipping {url}: {exc}")
            continue
        if response.status_code != 200:  # Only parse successful responses
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        review_list = []
        for blockquote in soup.find_all("blockquote"):
            paragraph = blockquote.find('p')
            if paragraph is not None:  # a blockquote without <p> has no review text
                review_list.append(paragraph.text)
        if review_list:
            records.append({'ID': game_id, "Reviews": review_list})

# Build the frame once instead of appending row by row.
df = pd.DataFrame(records, columns=["ID", "Reviews"])
df.to_csv('Reviews.csv', index=False)


In [None]:
# Download the NLTK data packages this notebook asks for.
for resource in ('stopwords', 'punkt', 'wordnet', 'vader_lexicon'):
    nltk.download(resource)

# English stop words, kept as a set for the membership test in the cleaning step.
stop_words = set(stopwords.words('english'))

### Splitting reviews

In [None]:
def parse_review_list(raw):
    """Turn one 'Reviews' cell of Reviews.csv back into a list of review strings.

    The cell is expected to hold the text of a Python list of strings (as
    written by the scraping cell), so it is parsed with ast.literal_eval, which
    evaluates literals only. Splitting that text on "'," is unreliable: a
    review containing an apostrophe is quoted with double quotes and would not
    be separated from its neighbour. If the text is not a valid literal, fall
    back to the plain split so the cell is still broken into pieces.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return raw.split("',")
    return list(parsed) if isinstance(parsed, (list, tuple)) else [str(parsed)]


data = pd.read_csv('Reviews.csv')
data['Reviews'] = data['Reviews'].apply(parse_review_list)

# One row per individual review; the ID is repeated on every row of a game.
# An empty list explodes to NaN, which is dropped so it is not scored later.
data = data.explode('Reviews').dropna(subset=['Reviews'])
data

### Preprocessing


In [None]:

# The escape patterns match the literal two-character sequences "\n" and
# "\u25a0" (backslash followed by text), not the real control/symbol characters.
ESCAPED_NEWLINE = re.compile(r'\\n')
ESCAPED_BLACK_SQUARE = re.compile(r'\\u25a0')
NON_WORD_CHARS = re.compile(r'[^\w\s]+')


def clean_review(text):
    """Normalise one review and return the cleaned, single-spaced text.

    Steps, in order:
      1. convert to str and lowercase;
      2. replace a literal "\\n" with a space and delete a literal "\\u25a0";
      3. delete every character that is neither a word character nor whitespace;
      4. drop tokens that contain a digit and tokens found in `stop_words`.

    Returns an empty string when nothing survives the filtering.
    """
    text = str(text).lower()
    text = ESCAPED_NEWLINE.sub(' ', text)
    text = ESCAPED_BLACK_SQUARE.sub('', text)
    text = NON_WORD_CHARS.sub('', text)
    kept = [
        word
        for word in text.split()  # split() also collapses runs of whitespace
        if not any(char.isdigit() for char in word) and word not in stop_words
    ]
    return " ".join(kept)


data['Reviews'] = data['Reviews'].apply(clean_review)

# Drop reviews that are empty after cleaning.
data = data[data['Reviews'].ne('')]

# Preview the first cleaned reviews. A label lookup such as data.at[0, ...]
# fails with KeyError when every review carrying label 0 was dropped above.
data['Reviews'].head()

### VADER model

In [None]:
sia_reviews = SentimentIntensityAnalyzer()


def compound_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for `text`."""
    return sia_reviews.polarity_scores(text)['compound']


# Replace each review's text with its compound score (the column keeps its name).
data['Reviews'] = data['Reviews'].apply(compound_score)


In [None]:
# Show the frame; 'Reviews' now holds the compound score assigned by the previous cell.
data

### Group by ID and compute the lowest, highest and average compound score

In [None]:
# Collect the scores of each ID into one list, then turn the grouped result
# back into a frame with 'ID' as a regular column.
scores_by_id = data.groupby('ID')['Reviews'].apply(list)
data = scores_by_id.reset_index()

In [None]:
# Summarise each ID's list of review scores as its lowest, highest and mean
# value (0 for an empty list), then drop the raw score lists.
summary_columns = (
    ('lowest_review', min),
    ('highest_review', max),
    ('average_review', np.mean),
)
for column_name, reducer in summary_columns:
    # Bind the reducer as a default argument so each lambda keeps its own function.
    data[column_name] = data['Reviews'].apply(
        lambda scores, reduce_fn=reducer: reduce_fn(scores) if len(scores) > 0 else 0
    )
data = data.drop(columns=['Reviews'])

In [None]:
# Show the per-ID summary: ID, lowest_review, highest_review, average_review.
data

In [None]:
# Write the per-ID summary to disk without the row index.
data.to_csv(path_or_buf='reviews_results.csv', index=False)