In [27]:
import requests
import json
import urllib.parse
import time

def getApiSteam(repeat, num_per_page=20, url="https://store.steampowered.com/appreviews/892970?json=1&language=english", cursor='*', data=[]):
    time.sleep(num_per_page/10)
    
    if repeat == 0:
        # Flatten the list of reviews
        data = [x for xs in data for x in xs]
        # Extract only the review text
        reviewData = [i["review"] for i in data]
        
        # Save the review data to a file
        with open('reviewdata.json', 'w', encoding='utf-8') as f:
            json.dump(reviewData, f, ensure_ascii=False, indent=4)
            
        return 1
    else:
        repeat -= 1

        # Fetch the next page of reviews
        response = requests.get(f'{url}&cursor={cursor}&num_per_page={num_per_page}')
        temp = json.loads(response.text)
        # Append the new reviews to the data list
        data.append(temp["reviews"])
        # Update the cursor for the next request
        new_cursor = urllib.parse.quote(temp["cursor"])
        print(f"Fetched reviews. New cursor: {new_cursor}")
        
        # Recursively call the function with the updated cursor and data
        return getApiSteam(repeat, num_per_page, cursor=new_cursor, data=data)

def main():
    getApiSteam(20, 50)
    # Load the review data from the file
    with open("reviewdata.json", "r", encoding='utf-8') as file:
        reviewData = json.load(file)
    print("Number of reviews: ", len(reviewData))

if __name__ == "__main__":
    main()


Fetched reviews. New cursor: AoIIPwYYanDj7egE
Fetched reviews. New cursor: AoIIPwFUIn744ekE
Fetched reviews. New cursor: AoIIPwAAAHb2gOcE
Fetched reviews. New cursor: AoIIPwwLVnSJ6egE
Fetched reviews. New cursor: AoIIPwVCon%2Bo3uwE
Fetched reviews. New cursor: AoIIPwAAAHb2gOcE
Fetched reviews. New cursor: AoIIPwwLVnSJ6egE
Fetched reviews. New cursor: AoIIPwVCon%2Bo3uwE
Fetched reviews. New cursor: AoIIPwAAAHb2gOcE
Fetched reviews. New cursor: AoIIPwwLVnSJ6egE
Fetched reviews. New cursor: AoIIPwVCon%2Bo3uwE
Fetched reviews. New cursor: AoIIPwAAAHb2gOcE
Fetched reviews. New cursor: AoIIPwwLVnSJ6egE
Fetched reviews. New cursor: AoIIPwVCon%2Bo3uwE
Fetched reviews. New cursor: AoIIPwAAAHb2gOcE


KeyboardInterrupt: 

In [57]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import json

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Read Game Review Data JSON
with open("reviewdata.json","r", encoding='utf-8') as file:
        reviewData = json.load(file)
    
def preprocess(text):
    # Convert text to lowercase
    text_lower = text.lower()
    
    # Tokenization
    tokens = word_tokenize(text_lower)
    
    # Removing punctuation and special characters
    tokens = [word for word in tokens if word.isalnum()]
    
    # Removing stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return tokens
    
# Joining tokens back into a string
preprocessed_text = ' '.join(preprocess(reviewData[22]))

print("\nProcessed:\n"+preprocessed_text)
print("\n\nNot Processed:\n"+reviewData[22])


Processed:
took three try get game felt unnatural first sat put two three hour game immediately hooked like basically game ever writing 383 hour later even dove playing game friend yet absolute best part game according take space drive cost le aaa game find absolutely delightful exploration amazing genuinely love world natural difficulty make actually care preparing cooking lose stats adjusted game definitely within top ten game time


Not Processed:
It took me three tries to get into this game - it felt unnatural to me at first, but after I sat down and put about two to three hours into the game it immediately hooked me like basically no other game ever has. 

I am writing this 383 hours later, and I haven't even dove into playing the game with my friends yet (The absolute best part of the game according to most). It takes no space on your drive, it costs less than all AAA games, and I find it absolutely delightful.

Exploration is amazing - I genuinely love this world.
Natural diffi

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\reapy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\reapy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\reapy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Get Steam Reviews JSON

In [5]:
import steamreviews

request_params = dict()
# Reference: https://partner.steamgames.com/doc/store/getreviews
request_params['filter'] = 'all'  # reviews are sorted by helpfulness instead of chronology
request_params['language'] = 'english'
request_params['day_range'] = '84'  # focus on reviews which were published during the past four weeks

steamreviews.download_reviews_for_app_id_batch(chosen_request_params=request_params)

Loading idlist.txt
Loading idprocessed_on_20240422.txt
Creating idprocessed_on_20240422.txt
Downloading reviews for appID = 1623730
[appID = 1623730] expected #reviews = 124655
[appID = 1623730] num_reviews = 5349 (expected: 124655)
Downloading reviews for appID = 892970
[appID = 892970] expected #reviews = 262477
[appID = 892970] num_reviews = 600 (expected: 262477)
Downloading reviews for appID = 105600
[appID = 105600] expected #reviews = 636028
[appID = 105600] num_reviews = 1077 (expected: 636028)
Downloading reviews for appID = 346110
[appID = 346110] expected #reviews = 304609
[appID = 346110] num_reviews = 398 (expected: 304609)
Downloading reviews for appID = 252490
[appID = 252490] expected #reviews = 532677
[appID = 252490] num_reviews = 1369 (expected: 532677)
Downloading reviews for appID = 275850
[appID = 275850] expected #reviews = 176517
[appID = 275850] num_reviews = 694 (expected: 176517)
Downloading reviews for appID = 322330
[appID = 322330] expected #reviews = 1165

True

### Merge Multiple JSON to one CSV

In [7]:
import os
import json
import csv

# Directory containing the JSON files
json_directory = os.getcwd()+"/data/"

# Output CSV file path
csv_file_path = 'game_reviews.csv'

# Initialize an empty list to hold the extracted data
data = []

# Iterate over each file in the directory
for filename in os.listdir(json_directory):
    if filename.endswith('.json'):
        file_path = os.path.join(json_directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            # Load the JSON file
            json_data = json.load(file)
            # Extract reviews
            for review_id, review_data in json_data['reviews'].items():
                review_id = review_id
                # Determine sentiment
                sentiment = 'positive' if review_data['voted_up'] else 'negative'
                # Extract the review text
                review_text = review_data['review']
                # Append the data to the list
                timestamp_updated = review_data["timestamp_updated"]
                data.append([review_id, sentiment, review_text, timestamp_updated])
# Write the data to a CSV file
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    # Write the header
    writer.writerow(['review_id','sentiment', 'review', 'timestamp_updated'])
    # Write the data
    writer.writerows(data)

print(f"Data has been written to {csv_file_path}")

Data has been written to game_reviews.csv


In [18]:
import torch
torch.cuda.is_available()
torch.set_default_device('cuda')

In [22]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [40]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, load_dataset, load_metric
from huggingface_hub import notebook_login

torch.set_default_device('cpu')


df = pd.read_csv('output.csv')
df = pd.DataFrame(df)
train=df.sample(frac=0.8,random_state=333)
test=df.drop(train.index)

# # Print the number of empty values
# print("Number of empty values in train 'review' column:", train['review'].isna().sum())
# print("Number of empty values in test 'review' column:", test['review'].isna().sum())

train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
   return tokenizer(examples['text'], truncation=True)

tokenized_train = train_dataset.map(preprocess_function)
tokenized_test = test_dataset.map(preprocess_function)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")
  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   return {"accuracy": accuracy, "f1": f1}
    
repo_name = "finetuning-distilbert-model-steam-game-reviews"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
)
 
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()
trainer.push_to_hub()

Map:   0%|          | 0/22329 [00:00<?, ? examples/s]

Map:   0%|          | 0/5582 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


KeyboardInterrupt: 

In [2]:
from transformers import pipeline
sentiment_model = pipeline(model="zitroeth/finetuning-distilbert-model-steam-game-reviews")
sentiment_model(["It’s astonishing how much has been packed into Palworld given its Early Access status. Sure, the developers will probably expand the world, add in more Pals, and grow the story, too. But what we have here is already great, and features so many small details you’ll be amazed at what you discover. It’s a little rough around the edges in places, but the blueprint and starting product are more than enough to keep you going. With a dedicated community, Palworld can only get better. Is it a Pokemon beater? Probably not, but it deserves its place in the conversation."])

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[{'label': 'LABEL_1', 'score': 0.9984083771705627}]

## Importing the Required Libraries

In [96]:
import nltk
nltk.download("stopwords")
import numpy as np
import json
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#Spacy
import spacy
from nltk.corpus import stopwords
# !python -m spacy download en
#vis
import pyLDAvis
import pyLDAvis.gensim_models

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\reapy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [97]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Preparing the Data

In [98]:
def load_data(file):
    with open (file, "r", encoding="utf-8") as f:
        data = json.load(f, strict=False) 
    return (data)

def write_data(file, data):
    with open (file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

In [99]:
stopwords = stopwords.words("english")
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [100]:
data = load_data("game_reviews.json")

print (data[1][2])

There was a review a long time ago saying you should buy extra copies and give it to people who look sad.

I was one of those sad people.

Was.


In [101]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []
    for text in texts:
        doc = nlp(text[2])
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags and not token.is_stop:
                new_text.append(token.lemma_)
        final = " ".join(new_text)
        texts_out.append(final)
    return (texts_out)


lemmatized_texts = lemmatization(data)
print (lemmatized_texts[1])

review long time ago say buy extra copy people look sad sad people


In [102]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

data_words = gen_words(lemmatized_texts)

print (data_words[3114])

['goosebump', 'arm', 'hair', 'raise', 'survival', 'zombie', 'slayer', 'decent', 'story', 'great', 'scenery']


In [103]:
# BIGRAM and TRIGRAM
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=50) # higher threshold fewer phrases.
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=100)  

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    return ([bigram[doc] for doc in texts])

def make_trigrams(texts):
    return ([trigram[bigram[doc]] for doc in texts])

data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_bigrams)

print(data_bigrams_trigrams[0])

2024-04-29 05:00:21,735 : INFO : collecting all words and their counts
2024-04-29 05:00:21,736 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2024-04-29 05:00:22,282 : INFO : PROGRESS: at sentence #10000, processed 314499 words and 205428 word types
2024-04-29 05:00:23,349 : INFO : PROGRESS: at sentence #20000, processed 843415 words and 466977 word types
2024-04-29 05:00:24,201 : INFO : collected 689506 token types (unigram + bigrams) from a corpus of 1362942 words and 27935 sentences
2024-04-29 05:00:24,201 : INFO : merged Phrases<689506 vocab, min_count=5, threshold=50, max_vocab_size=40000000>
2024-04-29 05:00:24,202 : INFO : Phrases lifecycle event {'msg': 'built Phrases<689506 vocab, min_count=5, threshold=50, max_vocab_size=40000000> in 2.47s', 'datetime': '2024-04-29T05:00:24.202656', 'gensim': '4.3.2', 'python': '3.12.3 | packaged by conda-forge | (main, Apr 15 2024, 18:20:11) [MSC v.1938 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event'

['play', 'game', 'hour', 'get', 'bored']


In [104]:
#TF-IDF Removal
from gensim.models import TfidfModel

id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[0])

tfidf = TfidfModel(corpus, id2word=id2word)

low_value=0.020
words = []
words_missing_in_tfidf = []
for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = [] #reinitialize to be safe. You can skip this.
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    drops = low_value_words + words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids] # The words with tf-idf socre 0 will be missing

    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]  
    corpus[i] = new_bow

2024-04-29 05:00:39,724 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-04-29 05:00:40,144 : INFO : adding document #10000 to Dictionary<15232 unique tokens: ['bored', 'game', 'get', 'hour', 'play']...>
2024-04-29 05:00:40,834 : INFO : adding document #20000 to Dictionary<24598 unique tokens: ['bored', 'game', 'get', 'hour', 'play']...>
2024-04-29 05:00:41,505 : INFO : built Dictionary<30730 unique tokens: ['bored', 'game', 'get', 'hour', 'play']...> from 27935 documents (total 1321816 corpus positions)
2024-04-29 05:00:41,505 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<30730 unique tokens: ['bored', 'game', 'get', 'hour', 'play']...> from 27935 documents (total 1321816 corpus positions)", 'datetime': '2024-04-29T05:00:41.505484', 'gensim': '4.3.2', 'python': '3.12.3 | packaged by conda-forge | (main, Apr 15 2024, 18:20:11) [MSC v.1938 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
2024-04-29 05:00:42,698 : INFO : co

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


2024-04-29 05:00:43,045 : INFO : TfidfModel lifecycle event {'msg': 'calculated IDF weights for 27935 documents and 30730 features (985739 matrix non-zeros)', 'datetime': '2024-04-29T05:00:43.045980', 'gensim': '4.3.2', 'python': '3.12.3 | packaged by conda-forge | (main, Apr 15 2024, 18:20:11) [MSC v.1938 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'initialize'}


In [105]:
# # id2word = corpora.Dictionary(data_words)
# data_words = data_bigrams_trigrams
# id2word = corpora.Dictionary(data_words)


# corpus = []
# for text in data_words:
#     new = id2word.doc2bow(text)
#     corpus.append(new)

# print (corpus[1])

# word = id2word[[0][:1][0]]
# print (word)

In [106]:
print (len(corpus))

27935


In [107]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=28000,
                                           passes=30,
                                           alpha="auto",
                                           eval_every = None,
                                           )

2024-04-29 05:00:48,535 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
2024-04-29 05:00:48,536 : INFO : using symmetric eta at 0.1
2024-04-29 05:00:48,543 : INFO : using serial LDA version on this node
2024-04-29 05:00:48,585 : INFO : running online (multi-pass) LDA training, 10 topics, 30 passes over the supplied corpus of 27935 documents, updating model once every 27935 documents, evaluating perplexity every 0 documents, iterating 50x with a convergence threshold of 0.001000
2024-04-29 05:00:48,587 : INFO : PROGRESS: pass 0, at document #27935/27935
2024-04-29 05:01:13,369 : INFO : optimized alpha [0.10735683, 0.08379872, 0.09760615, 0.095402345, 0.10471499, 0.100270875, 0.10102429, 0.098011605, 0.095030904, 0.09782151]
2024-04-29 05:01:13,417 : INFO : topic #1 (0.084): 0.036*"game" + 0.008*"survival" + 0.007*"feel" + 0.007*"play" + 0.005*"come" + 0.005*"great" + 0.005*"enjoy" + 0.004*"fun" + 0.004*"thing" + 0.004*"build"
2024-04-29 0

In [108]:
## Visualize Data

In [109]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=10)
vis