# API to get reviews

In [1]:
import steamreviews

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/viviancai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/viviancai/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/viviancai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [172]:
import datetime
import json
import pathlib
import time

import requests


def parse_app_id(app_id):
    # Objective: return an app_id as an integer, no matter whether the input app_id is a string or an integer.
    try:
        return int(str(app_id).strip())
    except ValueError:
        return None


def get_input_app_ids_filename():
    # Objective: return the filename where input app_ids are stored.
    return 'idlist.txt'


def app_id_reader(filename=None):
    # Objective: return a generator of the app_ids to process.

    if filename is None:
        filename = get_input_app_ids_filename()

    with open(filename, 'r') as f:
        for row in f.readlines():
            yield parse_app_id(row)


def get_processed_app_ids_filename(filename_root='idprocessed'):
    # Objective: return the filename where processed app_ids are saved.

    # Get current day as yyyymmdd format
    current_date = time.strftime('%Y%m%d')

    processed_app_ids_filename = filename_root + '_on_' + current_date + '.txt'

    return processed_app_ids_filename


def get_processed_app_ids():
    # Objective: return a set of all previously processed app_ids.

    processed_app_ids_filename = get_processed_app_ids_filename()

    all_app_ids = set()
    try:
        for app_id in app_id_reader(processed_app_ids_filename):
            all_app_ids.add(app_id)
    except FileNotFoundError:
        print('Creating ' + processed_app_ids_filename)
        pathlib.Path(processed_app_ids_filename).touch()
    return all_app_ids


def get_default_request_parameters(chosen_request_params=None):
    # Objective: return a dict of default paramters for a request to Steam API.
    #
    # References:
    #   https://partner.steamgames.com/doc/store/getreviews
    #   https://partner.steamgames.com/doc/store/localization#supported_languages
    #   https://gist.github.com/adambuczek/95906b0c899c5311daeac515f740bf33

    default_request_parameters = {
        'json': '1',
        'language': 'all',  # API language code e.g. english or schinese
        'filter': 'recent',  # To work with 'start_offset', 'filter' has to be set to either recent or updated, not all.
        'review_type': 'all',  # e.g. positive or negative
        'purchase_type': 'all',  # e.g. steam or non_steam_purchase
        'num_per_page': '100',  # default is 20, maximum is 100
    }

    if chosen_request_params is not None:
        for element in chosen_request_params:
            default_request_parameters[element] = chosen_request_params[element]

    return default_request_parameters


def get_data_path():
    # Objective: return the path to the directory where reviews are stored.

    data_path = 'data/'

    # Reference of the following line: https://stackoverflow.com/a/14364249
    pathlib.Path(data_path).mkdir(parents=True, exist_ok=True)

    return data_path


def get_steam_api_url():
    # Objective: return the url of Steam API for reviews.

    return 'https://store.steampowered.com/appreviews/'


def get_steam_api_rate_limits():
    # Objective: return the rate limits of Steam API for reviews.

    rate_limits = {
        'max_num_queries': 150,
        'cooldown': 30,  # 5 minutes plus a cushion
        'cooldown_bad_gateway': 10,  # arbitrary value to tackle 502 Bad Gateway due to saturated servers (during sales)
    }

    return rate_limits


def get_output_filename(app_id):
    return get_data_path() + 'review_' + str(app_id) + '.json'


def get_dummy_query_summary():
    query_summary = dict()
    query_summary['total_reviews'] = -1

    return query_summary


def load_review_dict(app_id):
    review_data_filename = get_output_filename(app_id)

    try:
        with open(review_data_filename, 'r', encoding='utf8') as in_json_file:
            review_dict = json.load(in_json_file)

        # Compatibility with data downloaded with previous versions of steamreviews:
        if 'cursors' not in review_dict.keys():
            review_dict['cursors'] = dict()
    except FileNotFoundError:
        review_dict = dict()
        review_dict['reviews'] = dict()
        review_dict['query_summary'] = get_dummy_query_summary()
        review_dict['cursors'] = dict()

    return review_dict


def get_request(app_id, chosen_request_params=None):
    request = dict(get_default_request_parameters(chosen_request_params))
    request['appids'] = str(app_id)

    return request


def download_reviews_for_app_id_with_offset(app_id,
                                            query_count,
                                            cursor='*',
                                            chosen_request_params=None):
    rate_limits = get_steam_api_rate_limits()

    req_data = get_request(app_id, chosen_request_params)
    req_data['cursor'] = str(cursor)

    resp_data = requests.get(get_steam_api_url() + req_data['appids'], params=req_data)
    status_code = resp_data.status_code
    query_count += 1

    while (status_code == 502) and (query_count < rate_limits['max_num_queries']):
        cooldown_duration_for_bad_gateway = rate_limits['cooldown_bad_gateway']
        print('{} Bad Gateway for appID = {} and cursor = {}. Cooldown: {} seconds'.format(status_code,
                                                                                           app_id,
                                                                                           cursor,
                                                                                           cooldown_duration_for_bad_gateway))
        time.sleep(cooldown_duration_for_bad_gateway)

        resp_data = requests.get(get_steam_api_url() + req_data['appids'], params=req_data)
        status_code = resp_data.status_code
        query_count += 1

    if status_code == 200:
        result = resp_data.json()
    else:
        result = {'success': 0}
        print('Faulty response status code = {} for appID = {} and cursor = {}'.format(status_code, app_id, cursor))

    success_flag = bool(result['success'] == 1)

    try:
        downloaded_reviews = result['reviews']
        query_summary = result['query_summary']
        next_cursor = result['cursor']
    except KeyError:
        success_flag = False
        downloaded_reviews = []
        query_summary = get_dummy_query_summary()
        next_cursor = cursor

    return success_flag, downloaded_reviews, query_summary, query_count, next_cursor


def download_reviews_for_app_id(app_id,
                                query_count=0,
                                chosen_request_params=None,
                                start_cursor='*',  # this could be useful to resume a failed download of older reviews
                                verbose=False):
    rate_limits = get_steam_api_rate_limits()

    request = dict(get_default_request_parameters(chosen_request_params))
    check_review_timestamp = bool('day_range' in request.keys() and request['filter'] != 'all')
    if check_review_timestamp:
        current_date = datetime.datetime.now()
        num_days = int(request['day_range'])
        date_threshold = current_date - datetime.timedelta(days=num_days)
        timestamp_threshold = datetime.datetime.timestamp(date_threshold)
        if verbose:
            if request['filter'] == 'updated':
                collection_keyword = 'edited'
            else:
                collection_keyword = 'first posted'
            print('Collecting reviews {} after {}'.format(collection_keyword,
                                                          date_threshold))

    review_dict = load_review_dict(app_id)

    previous_review_ids = set(review_dict['reviews'])

    num_reviews = None

    offset = 0
    cursor = start_cursor
    new_reviews = []
    new_review_ids = set()

    while (num_reviews is None) or len(new_review_ids)<5000:
#     (offset < num_reviews):

        if verbose:
            print('Cursor: {}'.format(cursor))

        success_flag, downloaded_reviews, query_summary, query_count, cursor = download_reviews_for_app_id_with_offset(
            app_id,
            query_count,
            cursor,
            chosen_request_params)

        delta_reviews = len(downloaded_reviews)

        offset += delta_reviews

        if success_flag and delta_reviews > 0:

            if check_review_timestamp:

                if request['filter'] == 'updated':
                    timestamp_str_field = 'timestamp_updated'
                else:
                    timestamp_str_field = 'timestamp_created'

                checked_reviews = list(filter(lambda x: x[timestamp_str_field] > timestamp_threshold,
                                              downloaded_reviews))

                delta_checked_reviews = len(checked_reviews)
                print(len(new_review_ids))
                if delta_checked_reviews == 0:
                    if verbose:
                        print('Exiting the loop to query Steam API, because the timestamp threshold was reached.')
                    break
                else:
                    downloaded_reviews = checked_reviews

            new_reviews.extend(downloaded_reviews)

            downloaded_review_ids = [review['recommendationid'] for review in downloaded_reviews]

            # Detect full redundancy in the latest downloaded reviews
            if new_review_ids.issuperset(downloaded_review_ids):
                if verbose:
                    print('Exiting the loop to query Steam API, because this request only returned redundant reviews.')
                break
            else:
                new_review_ids = new_review_ids.union(downloaded_review_ids)

        else:
            if verbose:
                print('Exiting the loop to query Steam API, because this request failed.')
            break

        if num_reviews is None:
            review_dict['query_summary'] = query_summary
            # Initialize num_reviews with the correct value (this is crucial for the loop, do not change variable name):
            num_reviews = query_summary['total_reviews']
            # Also rely on num_reviews for display:
            print('[appID = {}] expected #reviews = {}'.format(app_id, num_reviews))

        if query_count >= rate_limits['max_num_queries']:
            cooldown_duration = rate_limits['cooldown']
            print('Number of queries {} reached. Cooldown: {} seconds'.format(query_count, cooldown_duration))
            time.sleep(cooldown_duration)
            query_count = 0

        if not previous_review_ids.isdisjoint(downloaded_review_ids):
            if verbose:
                print('Exiting the loop to query Steam API, because this request partially returned redundant reviews.')
            break

    # Keep track of the date (in string format) associated with the cursor at save time.
    review_dict['cursors'][str(cursor)] = time.asctime()

    for review in new_reviews:
        review_id = review['recommendationid']
        if review_id not in previous_review_ids:
            review_dict['reviews'][review_id] = review

    with open(get_output_filename(app_id), 'w') as f:
        f.write(json.dumps(review_dict) + '\n')

    return review_dict, query_count


def download_reviews_for_app_id_batch(input_app_ids=None,
                                      previously_processed_app_ids=None,
                                      chosen_request_params=None,
                                      verbose=False):
    if input_app_ids is None:
        print('Loading {}'.format(get_input_app_ids_filename()))
        input_app_ids = [app_id for app_id in app_id_reader()]

    if previously_processed_app_ids is None:
        print('Loading {}'.format(get_processed_app_ids_filename()))
        previously_processed_app_ids = get_processed_app_ids()

    query_count = 0
    game_count = 0

    for app_id in input_app_ids:

        if app_id in previously_processed_app_ids:
            print('Skipping previously found appID = {}'.format(app_id))
            continue
        else:
            print('Downloading reviews for appID = {}'.format(app_id))

        review_dict, query_count = download_reviews_for_app_id(app_id,
                                                               query_count,
                                                               chosen_request_params,
                                                               verbose=verbose)

        game_count += 1

        with open(get_processed_app_ids_filename(), 'a') as f:
            f.write(str(app_id) + '\n')

        num_downloaded_reviews = len(review_dict['reviews'])
        num_expected_reviews = review_dict['query_summary']['total_reviews']
        print('[appID = {}] num_reviews = {} (expected: {})'.format(app_id,
                                                                    num_downloaded_reviews,
                                                                    num_expected_reviews))

    print('Game records written: {}'.format(game_count))

    return True

In [175]:
request_params = dict()
# Reference: https://partner.steamgames.com/doc/store/getreviews
request_params['filter'] = 'recent'  # reviews are sorted by helpfulness instead of chronology
# request_params['day_range'] = '9223372036854775807'  # focus on reviews which were published during the past four weeks
request_params['language'] = 'english'
request_params['day_range'] = '1000'
request_params['num_per_page'] = '100'

# app_id = 1091500
app_id = 292030
review_dict, query_count = download_reviews_for_app_id(app_id,
                                                                    chosen_request_params=request_params,
                                                                    verbose=True)


Collecting reviews first posted after 2018-07-17 18:01:33.019624
Cursor: *
0
[appID = 292030] expected #reviews = 156987
Cursor: AoJwo6Wf4fgCerDz1wI=
100
Cursor: AoJ47NbO3vgCc/3S1wI=
200
Cursor: AoJ41df12/gCdbSy1wI=
300
Cursor: AoJ4v6Kg2fgCeqmU1wI=
400
Cursor: AoJ4mYP91vgCd9v71gI=
500
Cursor: AoJwzeWs1PgCf7/e1gI=
600
Cursor: AoJ4k9Dz0fgCdfzD1gI=
700
Cursor: AoJ4gYnpz/gCcoqq1gI=
800
Cursor: AoJ4tP7szfgCduSP1gI=
900
Cursor: AoJ4roKfy/gCerHt1QI=
1000
Cursor: AoJw9vr8yPgCdMnO1QI=
1100
Cursor: AoJ4/enyxvgCcaS01QI=
1200
Cursor: AoJ46tm6xPgCdauX1QI=
1300
Cursor: AoJ4oer9wfgCfPD61AI=
1400
Cursor: AoJ4trTpv/gCdovk1AI=
1500
Cursor: AoJ41ZSVvfgCcPXF1AI=
1600
Cursor: AoJwl6XCuvgCcvKj1AI=
1700
Cursor: AoJwiuOZuPgCcJ+G1AI=
1800
Cursor: AoJwgaq2tfgCe7Xm0wI=
1900
Cursor: AoJwyPuqsvgCefPD0wI=
2000
Cursor: AoJw/YPjr/gCdLSn0wI=
2100
Cursor: AoJ4joLLq/gCd9P80gI=
2200
Cursor: AoJwyonYpvgCcuLB0gI=
2300
Cursor: AoJwyLHPoPgCdJz90QI=
2400
Cursor: AoJw28/BnPgCdv3T0QI=
2500
Cursor: AoJw+LuLmPgCdL+j0QI=
2600
Curs

In [168]:
import requests

def get(appid, printProgress=False):
    '''Request reviews from the Steam Web API and return them as a list. This is a blocking call that may take some time, depending on how many reviews there are.\n
    **appid** -- The Steam App ID as a string obtained from the game's store page URL\n
    **progress** -- Set to true to print the progress of each request.
    '''
    assert type(appid) is str

    def _makeRequest(appid, params):
        '''Helper function that sends a request to the Steam Web API and returns the response object.\n
        **appid** -- The Steam App ID obtained from the game's Store page URL\n
        **params** -- An object used to build the Steam API query. (https://partner.steamgames.com/doc/store/getreviews)
        '''
        response = requests.get(url=ENDPOINT+appid, params=params) # get the data from the endpoint
        return response.json() # return data extracted from the json response

    ENDPOINT = 'https://store.steampowered.com/appreviews/' # https://partner.steamgames.com/doc/store/getreviews
    results = []
    params = {
        'json': 1,
        'filter': 'recent', # sort by: recent, update
        'language': 'english', # languages at https://partner.steamgames.com/doc/store/localization
        'start_offset': 0, # for pagination
        'review_type': 'all', # all, positive, negative
        'purchase_type': 'steam', # all, non_steam_purchase, steam
        'day_range': '1000',
        'num_per_page': '100'
    }

    data = _makeRequest(appid, params)
    done = False

    while not done:
        if 'success' in data and data['success'] == 1: # if the query was successful
            if 'reviews' in data and len(data['reviews']) > 0: # if we received reviews
                results += data['reviews'] # add the reviews in this query to our results
                if len(results)>5000:
                    done=True
                params['start_offset'] += data['query_summary']['num_reviews'] # increase the start offset by the number of reviews received in this response
                if printProgress:
                    print('{amount} reviews found...'.format(amount=params['start_offset']))
                data = _makeRequest(appid, params) # get the next page of reviews
            else: # there are no more reviews
                done = True
        elif data is None: # Steam Web API returns null if rate limit is reached.
            done = True
            raise ConnectionRefusedError('Steam Web API returned null. Rate limit may be exceeded.')
        else: # unsuccessful API call raises an error
            done = True
            raise ConnectionError('Steam Web API appreviews request was unsuccessful.')
    
    if printProgress:
        print("Found all reviews.")
    return results

In [169]:
results = get('292030', printProgress=True)

100 reviews found...
200 reviews found...
300 reviews found...
400 reviews found...
500 reviews found...
600 reviews found...
700 reviews found...


KeyboardInterrupt: 

In [181]:
print(review_dict['reviews']['87613858'])

{'recommendationid': '87613858', 'author': {'steamid': '76561198835473808', 'num_games_owned': 19, 'num_reviews': 1, 'playtime_forever': 7779, 'playtime_last_two_weeks': 0, 'playtime_at_review': 7779, 'last_played': 1614440944}, 'language': 'english', 'review': 'Story is good.', 'timestamp_created': 1614524913, 'timestamp_updated': 1614524913, 'voted_up': True, 'votes_up': 0, 'votes_funny': 0, 'weighted_vote_score': 0, 'comment_count': 0, 'steam_purchase': True, 'received_for_free': False, 'written_during_early_access': False}


# Filter Results

In [216]:
import os
import json

import pandas as pd

## Filter
1. English (done during API)
2. steam_purchased (done during API)
3. not written_during_early_access
4. play_time_forever > 1 hour
5. $>$ 20 words (do in cleaning)

In [192]:
results = list(filter(lambda x: x['written_during_early_access']==False and x['author']['playtime_forever']>60 and x['language']=='english', review_dict['reviews'].values()))
print(len(results))
results = {x['recommendationid']: x for x in results}


4964


## Cleaning

In [310]:
import re

In [311]:
# Remove punctuations, lowercase
def clean_review(review):
    review = re.sub('[^A-Za-z]+', ' ', review).strip()
    split = review.split()
    if len(split)<=10:
        return ''
    review = ' '.join([w.lower() for w in split if len(w)>2])
    return review

In [449]:
reviews = [r['review'] for r in results.values()]
print(len(reviews))
reviews = list(map(clean_review, reviews))
reviews = list(filter(lambda x: x!='', reviews))
print(len(reviews))
print(STOPWORDS)

4964
1905
frozenset({'system', 'yet', 'former', 'thereby', 'anyone', 'in', 'moreover', 'bottom', 'during', 'between', 'of', 'among', 'nobody', 'thin', 'she', 'their', 'become', 'several', 'while', 'however', 'anywhere', 'could', 'some', 'see', 'sixty', 'somehow', 'neither', 'already', 'whatever', 'never', 'via', 'show', 'your', 'otherwise', 'seem', 'rather', 'bill', 'and', 'seeming', 'couldnt', 'back', 'further', 'none', 'con', 'someone', 'first', 'most', 'don', 'such', 'now', 'thick', 'from', 'is', 'became', 'enough', 'us', 'would', 'fifty', 'down', 'fill', 'cry', 'has', 'there', 'three', 'these', 'least', 'how', 'so', 'ie', 'therefore', 'ever', 'still', 'besides', 'found', 'for', 'just', 'very', 'though', 'thereupon', 'the', 'whenever', 'go', 'serious', 'below', 'well', 'any', 'one', 'de', 'formerly', 'without', 'anyway', 'else', 'was', 'across', 'yours', 'themselves', 'latterly', 'upon', 'mostly', 'forty', 'each', 'whose', 'using', 'others', 'whether', 'whom', 'that', 'various', 'no

In [313]:
reviews

['the open world really fun alot this game',
 'the story line game play and beautiful scenery you could tell lot planning and development went into this game just downloaded and started playing the game recently and truly master piece',
 'have been playing this game now quite few hours and lot fun the map vast and have feeling that will take quite some time finish can recomend this game',
 'cant express enough how felt seeing the game moving towards the end having played the entire trilogy recently got super engrossed this world the witcher story line something that has always been strong point each the parts this series and this game betters the gameplay well the sheer breadth the game wide that you encounter countless characters and their individual stories and get attached them your way completing this epic journey you want play this game highly recommended that you atleast play witcher not for the first part one kind game that haven seen years and years gaming',
 'story catches you

## Lemmatization and Stopwords

In [314]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.parsing.preprocessing import STOPWORDS
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/viviancai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/viviancai/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/viviancai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [432]:
from nltk.stem import WordNetLemmatizer, SnowballStemmer
stemmer = SnowballStemmer("english")


# Keep noun
is_noun_or_adj = lambda pos: pos[:2]=='NN' or pos[:2]=='JJ'
lemmatizer = WordNetLemmatizer()

def noun_adj_processor(word, pos):
    if pos[:2] == 'NN':
#         return stemmer.stem(lemmatizer.lemmatize(word, pos='n'))
        return lemmatizer.lemmatize(word, pos='n')
    else:
        return lemmatizer.lemmatize(word, pos='a')

def keep_noun_and_adj(rev_split):
    return [noun_adj_processor(word, pos) for (word,pos) in nltk.pos_tag(rev_split) if is_noun_or_adj(pos)]

keep_noun_and_adj(['witch', 'witcher'])

['witch', 'witcher']

In [316]:
stop_words = stopwords.words('english')
stop_words.extend(STOPWORDS)
stop_words.extend(['player', 'play', 'people', 'game', 'steam','review',
                  'day', 'year', 'hour', 'minute','time','moment',
                  'world', 'yes', 'lol', 'lmao', 'cool', 'love', 'get',
                  'good', 'great', 'nice', 'best', 'fun', 'awesome',
                   'ever', 'kinda', 'shit', 'yeah', 'new', 'old',
                  'big','small','high','low','many','much','lot','alot',
                  'others','thank','http','thing','everyone','anyone','anything',
                  'everything','cant','dont','guy','hello',
                  'youtube','something','someone','pro','con',
                  'haha', 'hehe','end','nothing','no','one',
                  'fine','first','last','epic','english','bit',
                  'terrible','overall','original','life','bad',
                  'today','fps','gameplay','favorite','favourite','com',
                  'man','word','version','pure','experience','www',
                  'please','thanks','little','least','way','different',
                  'style','man','men','super','problem','item','work','computer',
                  'stuff','wait','early','access','sure','able','developer','wow','genre',
                  'potential','perfect','popular','reason','person'])
stop_words.extend(['server','update','performance','system','program','software'])
stop_words.extend(['worth','full','point','real','part','amount','reason',
                  'option','open','previous','huge','enjoyable','kind','ton',
                  'person','launch','opinion','month','ability','current','use',
                  'stupid','mess','slash','wololo','wrong'])

stop_words = list(set(stop_words))

In [381]:
def transform_and_remove_words(review):
    review_split = review.split()
    review_split = keep_noun_and_adj(review_split)
    review_split = list(filter(lambda x: x not in stop_words, review_split))
    review_split = keep_noun_and_adj(review_split)
    review_split = list(filter(lambda x: x not in stop_words, review_split))
    return " ".join(review_split)

## Key phrase extraction

In [382]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfvec = TfidfVectorizer(smooth_idf=True, ngram_range=(2,2), stop_words=STOPWORDS,max_df=0.8,use_idf=True)
fv_tfidf = tfvec.fit_transform(reviews)
fv_tfidf = fv_tfidf[0]

pdf = pd.DataFrame(fv_tfidf.T.todense(),index=tfvec.get_feature_names(),columns=['tfidf'])
pdf = pdf[pdf['tfidf']!=0]
pdf = pdf.sort_values(by=["tfidf"],ascending=False)[:20]
print(pdf.sort_values(by=["tfidf"],ascending=False)[:20])

               tfidf
alot game   0.564459
fun alot    0.564459
world fun   0.535340
open world  0.276012


In [383]:
transformed_reviews = list(map(transform_and_remove_words, reviews))
transformed_reviews = list(filter(lambda x: x != '', transformed_reviews))
transformed_reviews

['story line beautiful scenery planning development master piece',
 'map vast finish',
 'entire trilogy story line strong series sheer wide countless character individual story journey witcher',
 'story catch exploration rpg',
 'action adventure story scene voice superb protagonist character quest story range choice story main character moral choice realistic choice single rpg combat variety fight progression combination loot gear interesting range stats talent talent talent feature talent general friend impressive map hard novigrad feel city joy',
 'character corny absolute bore fest character story forgettable fuck character bios book quest quest quick par gore mechanic graphic decent need ibm rig optimization bug frame hicups decade skyrim mile wish buck live gwent',
 'music vibe goils',
 'strong support vast content aspect gwent regular mini card incomprehensible gem rich story action massive impact story select secondary quest yup quest rating',
 'content immersive horse control f

## Keyword Extraction

### RAKE

In [384]:
from rake_nltk import Rake

In [403]:
# from gensim.summarization import keywords
# from gensim.summarization import keywords

# for review in reviews:
#     keywords = gensim.summarization.keywords(review)
#     print(keywords)

# whole_review = " ".join(transformed_reviews)
# r = Rake()
# r.extract_keywords_from_text(whole_review)
# print(r.get_ranked_phrases()[:20])

# import yake


# r = Rake()
# review_keywords = list()
# for review in transformed_reviews:
# #     print(review)
#     r.extract_keywords_from_text(review)
#     key_words_dict_scores = r.get_word_degrees()
#     print(key_words_dict_scores)
# #     lst = list(key_words_dict_scores.items())
# #     lst = sorted(lst, key=lambda e:e[1], reverse=True)

#     keywords = " ".join(key_words_dict_scores.keys())
# #     print(keywords)
#     if set(review.split()) != set(keywords.split()):
#         print(review)
#         print(keywords)
#     review_keywords.append(keywords)
    
# import spacy
# nlp = spacy.load("en_core_sci_lg")
# for review in transformed_reviews:
#     doc = nlp(review)
#     print(doc.ents)

import yake

language = "en"
max_ngram_size = 1
deduplication_threshold = 0.9
numOfKeywords = 20
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
# keywords = custom_kw_extractor.extract_keywords(whole_review)
# keywords = sorted(keywords, key=lambda item:(item[1]), reverse=True)[:20]
# print(keywords)

review_keywords = list()
for review in transformed_reviews:
    keywords = custom_kw_extractor.extract_keywords(review)
    keywords = sorted(keywords, key=lambda item:(item[1]), reverse=True)[:10]
    keywords = [x[0] for x in keywords]
    keywords = " ".join(keywords)
    print(keywords)
#     if set(review.split()) != set([x[0] for x in keywords]):
#         print(" ".join([x[0] for x in keywords]))
#         print(review)
    if len(keywords)>0:
        review_keywords.append(keywords)

line beautiful scenery planning development master story piece
vast map finish
trilogy line strong series sheer wide countless character individual journey
catch exploration story rpg
adventure scene voice superb protagonist quest main moral realistic single
corny absolute bore fest story forgettable fuck bios book quick
vibe music goils
support vast content aspect gwent regular mini card incomprehensible gem
immersive horse control content fix
combat graphic story content
perfection
story character
story cash fallout combat generic feel special exploration map mermable
deep story decision matter bos blood amazing wine
throw baby
spell specific loser jeeez half parent basement banng enemy weakness
serise
release date
potato
quest meaningful deep rpg story
mechanic combat story
multiple similar skyrim dragon age platform
insane graphic mod
wouldnt story load nerd dialogue
mission secondary story
witcher random middle wood character slow finish mission story area
combat terrific quest wi

reality beautiful exe wall eargasm earrape audience decent paint audio
sale bar hype rpg
fact console
story feel quick contract building easy rpg aspect decision consequence
rpg number tall
raider meet story graphic witcher series tomb favor
free sale playing
dlc sidequests week graphic story group monster creativity bos combat
decade mechanic clunky offline control rpg graphic music story
character sidequests video soundtrack combat story dlc
crash hair dialogue
wild hunt generation story character adventure geralt stoic exterior sarcastic
story graphic brilliant
story job withers excellent path
future
fnv
erotic scene hentai sad fantasy excellent idea
beautiful music easy hard master mechcanics graphic story
choice
feel
slav rpg northern
test
storyline console user interface keyboard mouse rpgs comparison
desperate attention

zelda legend princess
combat movement mechanic awsome wont story major scene graphic conversation
doubt series story amazing graphic sound notch true price
actu

dialogue music graphic texture mod mimic movement alive believable movie
control
song hearing witcher line
rich protagonist shoe story rpg
sink hole absolute rpgs
hangover week graphic audio character soo rpg gwent
sound visuals story rpg element design excellent
shelf
wonderful story
masterpiece
rpg action level
quest
recent failure genuine immersive devs company close fantasize story eternal
sid meier alpha videogame centauri
multiple variety story everyday
stop nsfw content fantasy witcher rpg
beautiful story solid graphic
xbox switch expansion pack dlcs scrolling
plethera cup story tea
decent outstanding main reccomend weak soundtrack couple violin bass music
design combat decent step unbalanced enemy middle portion exploration quest
gorgeous road simple entire dlc enormous enviroment breath step quest
content covid dlc party buck
sorrow hapiness anger compassion list masterpiece cheer
company cyberpunk imagine
gawkgawk witcher choose honest
cyberpunk
content main massive story
adv

hell story exceptional music character order mystery background industry excellent
dlc blood wine main brilliant honest
quest
rpg playing history
witcher control
switch gog goty cheap content
beautiful plot character ummmmmm enemy
wild hunt
buy cheap
spad dachu potr umy aby dotrze geralta warte otka polecenia
separate playthroughs wonderment leniency decision brilliant consumables shame release rpgs
addictive expansion
rpg depth wealth action story massive modern absolute masterclass interactive
projekt series creation
decade fence
hand ball fact witcher mechanic
action complaint rpg enjoyment
gfood buy dollats aaaaaaaaaaaaaaaaaaaaaaaaaa
gruffly witch slice fucker
depth recap doubt money
story graphic beautiful fan combat polar opposite single morrowind
onwards final dlc incredible rest skelligar hype
lead rpg lover choice main dlcs gwent openworld masterpiece single
story line fellow witcher vesimir gripe geralt
tiny masterpiece
dlc main character masterpiece witcher
visit novigrad pa

scenery building quest character memorable spent recommend dlcs heart blood
underage albino lady geralds information chore kid member monster power
character emotional level angry standard bar sad geralt expectation
fantasy story european polished rockstar
graphic story beautiful line
immersive beautiful
story line massive clear map
family
horrible bland unplayable imo story combat video
party wither
graphic mod scene cinematic combat believable manageable witcher order story
truth period lady goddammit gentleman
dark fantasy rpg recent
scenario adult manichean quest memorable fluid hell project balance narrative
sword feature
content tip depth gwent
line minor bug specific area gwent story card
story video
england account friend rock
rpg cent
aspect rich story peaceful type easy mod improvement skyrim
marry gentleman omg
true adventure dlcs heart stone blood wine penny unforgettable heck
witcher minus bug lag story short heart stone wine blood
box treat rough
expansion minor bug refer

isle period damn rope armor diamond search cdprojekt red impossible
story graphic excellent
incredible collect package knowledge base deep learn master relic weapon
graphic story nemesis twig level doorjams
obsession story questlines rich beauty aspect dlcs fantastic merigold skyrim
sexy clothes hot optional
taste library choice cent story graphic combat witcher dlc
component upgrade blood wine dlc content price multiple decade
edition sale standard
predictible incentive map quicky boring sale
line feature story horse
decision
girl gonna replay pinnacle
mouse keyboard bos wireless space week midterm character imagine complete
dead mate
soo story soundtrack
rich immersive
sad witcher story deep amazing journey favourite complain
hope future
price steal content story general plot twist decision effect ridiculous
feel skyrim
era excellent story beautiful combat exploration golden character
combat hard attention story hype
shear number level narrative main graphic rpg age quest
true master

obscure dialog goofy beat easter egg series xbox free understand
order witcher fantastic series
impact couple gorgeous city weather effect animation major combat story
story
story unique graphic
story competition visual
possibility endless singleplayer
action plenty quest single
geralt mom
graphic
adjust hate
story design triss sound immersive screen
combat dodge map isnt positive dialogue sand paper plot hole
live save choice
buying regret book
relive playable cyberpunk
main story beautiful imo
potion oil easy npc brawl unique combat varies monster fresh
combat setup gen graphic interesting ingredient character story edition monster
pseudo realistic vacuous spectrum network suburban complex cross disciplinary resistive
money
rpg
mon situation main banker arm awful pick box tree shame
hand texture mod brand modern rpg release
hang control mechanic combat understatement story
nudity watch start
break skyrim clone art group wide attention gamers fact tutorial
monotonous griffin
question 

reality beautiful exe wall eargasm earrape decent paint audio spare
family reunion jealous cyberjunk
xbox graphic
forbidden
achievement sale quiet graphic cheap
book series replay chance choice romantic build feel positive impression
flaw unbearable
tricky begin rpg hang
videogame story gem
story graphic mechanic
combat project story company
havent attention span hmmm shite
solid voice quest gwent mini presentation guess
virtue rich tight narration pitch myriad choice bevy layer emotion
positive state dlc content trough act uninstall release adventure half
half book favourite series
masterpiece quest cutscenes skip treasure absolutel skellige
dlc
visuals cyberpunk flop story project
content
covids wild wizrds witch half lovin bhop poopoo peepee prob
boring hell holy movie
rpg witcher
action emotion masterpiece witcher humongous cdpr
roam story mode familiar tutorial graphic free difficulty
texture encounter slay random mob action playtime gwent card standard
trouble ciri gwent
beste ic

In [429]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
countvec = CountVectorizer(binary=False)
countv = countvec.fit_transform(review_keywords)
print(countv.shape)
print(np.sum(countv.T, axis=1).T.shape)
countmat = np.sum(countv.T, axis=1)
# .toarray()
# print(countv)
l = list()
for i in range(len(countvec.get_feature_names())):
#     print(countvec.get_feature_names()[i])
#     print(countmat[i,0])
    if countmat[i,0]>1:
        l.append((countvec.get_feature_names()[i], countmat[i,0]))
        
# print(l)
count = np.array(np.sum(countv.T, axis=1))

count = np.squeeze(count)
print(count.shape)

df = pd.DataFrame({
    'word': countvec.get_feature_names(),
    'count': count
})
df = df[df['count']>1]
df = df.sort_values(by=["count"],ascending=False)
print(list(df[:100]['word']))


# pdf = pd.DataFrame(np.sum(countv.T, axis=1),index=countvec.get_feature_names(),columns=['count'])
# pdf = pdf[pdf['count']>1]
# pdf = pdf.sort_values(by=["count"],ascending=False)
# print(pdf[:50])
# print(pdf[pdf.index.str.startswith('witch')])

(1796, 2555)
(1, 2555)
(2555,)
['story', 'graphic', 'rpg', 'character', 'combat', 'quest', 'witcher', 'main', 'beautiful', 'dlc', 'masterpiece', 'line', 'content', 'mechanic', 'choice', 'sale', 'immersive', 'dlcs', 'easy', 'geralt', 'gwent', 'series', 'fantastic', 'cyberpunk', 'monster', 'music', 'book', 'amazing', 'rpgs', 'excellent', 'action', 'level', 'bug', 'expansion', 'incredible', 'mod', 'skyrim', 'rich', 'fantasy', 'money', 'single', 'price', 'card', 'feel', 'wine', 'hard', 'hunt', 'video', 'mission', 'heart', 'dialogue', 'blood', 'horse', 'wild', 'storyline', 'edition', 'control', 'sword', 'adventure', 'decent', 'depth', 'plot', 'decision', 'voice', 'multiple', 'soundtrack', 'matter', 'favourite', 'difficulty', 'ive', 'entire', 'design', 'fan', 'base', 'place', 'audio', 'map', 'release', 'interesting', 'buy', 'contract', 'attention', 'stone', 'free', 'start', 'dark', 'enemy', 'true', 'vast', 'solid', 'reality', 'standard', 'decade', 'deep', 'wonderful', 'development', 'fact', 

## TF-IDF

In [456]:
from sklearn.feature_extraction.text import TfidfVectorizer
# tfvec = TfidfVectorizer(smooth_idf=True,max_df=1.0,use_idf=True, sublinear_tf=True)
print(reviews[0])
print(transformed_reviews[0])
tfvec = TfidfVectorizer(smooth_idf=True,max_df=1.0,use_idf=True, sublinear_tf=True)
fv_tfidf = tfvec.fit_transform(review_keywords)
fv_tfidf = fv_tfidf[0]

print(np.squeeze(np.array(fv_tfidf.T.todense())).shape)
print(fv_tfidf.T.todense())
print(np.squeeze(np.array(fv_tfidf.T.todense())))

count_df = pd.DataFrame({
    'word': tfvec.get_feature_names(),
    'tfidf': np.squeeze(np.array(fv_tfidf.T.todense()))
})
count_df = count_df[count_df['tfidf']!=0]
count_df = count_df.sort_values(by=["tfidf"],ascending=False)[:20]
print(count_df)

pdf = pd.DataFrame(fv_tfidf.T.todense(),index=tfvec.get_feature_names(),columns=['tfidf'])
pdf = pdf[pdf['tfidf']!=0]
print(pdf.sort_values(by=["tfidf"],ascending=False)[:20])

the open world really fun alot this game
story line beautiful scenery planning development master piece
(2555,)
[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]
[0. 0. 0. ... 0. 0. 0.]
             word     tfidf
1686     planning  0.494289
1960      scenery  0.406447
1365       master  0.398984
1674        piece  0.392307
573   development  0.355063
1299         line  0.269006
182     beautiful  0.242713
2158        story  0.141575
                tfidf
planning     0.494289
scenery      0.406447
master       0.398984
piece        0.392307
development  0.355063
line         0.269006
beautiful    0.242713
story        0.141575


In [301]:
tfidf_result

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [486]:
from collections import Counter

def is_noun(word):
    word_pos = nltk.pos_tag([word])
    if word_pos[0][1][:2] == 'NN':
        return True
    else:
        return False

info_path = '/Users/viviancai/Desktop/cs4300/cs4300sp2021-cw887-qh75-rz92-yc687-yl698/data/reviews/info/'
keywords = list()
keyphrases = list()
for filename in os.listdir(info_path):
    if filename.endswith('.json'):
        with open(info_path+filename, 'r', encoding='utf8') as in_json_file:
            info = json.load(in_json_file)
            keywords.append(" ".join(info['keywords']))
#             keywords.extend(info['keywords'])
            keyphrases.extend(info['keyphrases'])
counter = Counter(keyphrases)
counter = {x: count for x,count in counter.items()}

# keywords = " ".join(keywords)

tfidf_vec = TfidfVectorizer(smooth_idf=True,max_df=0.9,use_idf=True, sublinear_tf=True)
tfidf = tfidf_vec.fit_transform(keywords)
tfidf = tfidf[0]

tfidf_df = pd.DataFrame({
    'word': tfidf_vec.get_feature_names(),
    'tfidf': np.squeeze(np.array(tfidf.T.todense()))
})
tfidf_df = tfidf_df[tfidf_df['tfidf']!=0]
tfidf_df = tfidf_df.sort_values(by=['tfidf'], ascending=False)
print(list(tfidf_df['word']))

count_vec = CountVectorizer(binary=True)
count = count_vec.fit_transform(keywords)
count = np.squeeze(np.array(np.sum(count.T, axis=1)))

def filter_fn(row):
    if is_noun(row['word']) or row['count'] > 5:
        return True
    else:
        return False
    
count_df = pd.DataFrame({
    'word': count_vec.get_feature_names(),
    'count': count
})
count_df = count_df[count_df['count']>1]
m = count_df.apply(filter_fn, axis=1)
count_df = count_df[m]
count_df = count_df.sort_values(by=['count'], ascending=False)
print(list(count_df[m]['word']))

# print(count_df[m][:20])
    
# keywords.update(set(count_df[:100]['word']))

# print(len(counter))
# counter = {x: count for x,count in counter.items() if is_noun(x)}
# print(len(counter))
print(counter)



['zombie', 'russian', 'custom', 'dead', 'deathmatch', 'lan', 'nostalgic', 'dust', 'jailbreak', 'escape', 'realistic', 'counter', 'internet', 'practice', 'headshot', 'head', 'global', 'pack', 'gold', 'offensive', 'csgo', 'skin', 'office', 'spray', 'active', 'young', 'aim', 'alive', 'awp', 'match', 'bhop', 'bot', 'laptop', 'buy', 'casual', 'team', 'cheater', 'childhood', 'surf', 'strike', 'competitive', 'hacker', 'nostalgia', 'skill', 'online', 'playing', 'potato', 'quality', 'school', 'shoot', 'modern', 'texture', 'tho', 'toxic', 'movement', 'account', 'kid', 'gmod', 'download', 'box', 'gamemodes', 'garry', 'memory', 'age', 'model', 'community', 'cheap', 'friend', 'funny', 'weapon', 'shooter', 'beautiful', 'action', 'multiplayer', 'recommend', 'valve', 'source', 'simple', 'base', 'classic', 'decent', 'video', 'sale', 'engine', 'mode', 'physic', 'gun', 'half', 'series', 'control', 'content', 'fan', 'free', 'price', 'hard', 'map', 'mechanic', 'mod']
['money', 'easy', 'graphic', 'map', 'fa

  print(list(count_df[m]['word']))
