In [5]:
import os
import pandas as pd
import json
import re
import zipfile

In [46]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# from gensim.parsing.preprocessing import STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import yake

# Fetch every NLTK resource this notebook touches, not just 'punkt':
#   - 'stopwords' backs stopwords.words('english')
#   - 'wordnet' backs WordNetLemmatizer
#   - 'averaged_perceptron_tagger' backs nltk.pos_tag
# Without them a fresh environment raises LookupError on first use.
# NOTE(review): newer NLTK releases may expect differently named tagger/WordNet
# resources -- confirm against the installed NLTK version.
for resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\grace\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Process game descriptions
- Process list of available games only
- Extract keyphrases in the form of adj-noun
- Extract keyphrases and keywords using Yake

In [24]:
# Locate the zipped game metadata under the current working directory.
json_dir = os.path.join(os.getcwd(), '80k_data')
zip_file = os.path.join(json_dir, 'id_to_info.zip')
unzipped_file = os.path.join(json_dir, 'id_to_info.json')

# Unzip only when the extracted JSON is not already on disk.
# (presumably the archive contains id_to_info.json at its root -- verify)
if not os.path.exists(unzipped_file):
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(json_dir)

# Load the metadata; the temporary extracted file is removed even when
# parsing fails, so a broken run does not leave it behind.
# An explicit UTF-8 encoding keeps the load independent of the platform's
# default text encoding.
try:
    with open(unzipped_file, encoding='utf-8') as f:
        id_to_info = json.load(f)
finally:
    if os.path.exists(unzipped_file):
        os.remove(unzipped_file)

# Load the list of game ids that should be processed.
with open(os.path.join(json_dir, 'available_games.json'), encoding='utf-8') as f:
    available_games = json.load(f)

In [25]:
# Peek at a single entry rather than rendering the whole metadata mapping,
# which would flood the cell output.
{game_id: id_to_info[game_id] for game_id in list(id_to_info)[:1]}

{'945360': {'name': 'Among Us',
  'tag': ['top-down',
   'multiplayer',
   'aliens',
   'minigames',
   'local co-op',
   'cartoony',
   'psychological',
   'pvp',
   'online co-op',
   'space',
   'survival',
   'local multiplayer',
   'colorful',
   'cartoon',
   '2d',
   'indie',
   'co-op',
   'casual',
   'sci-fi',
   'logic',
   'funny'],
  'genre': ['action', 'mmo', 'indie'],
  'developer': ['innersloth'],
  'publisher': ['innersloth'],
  'number_of_players': ['multiplayer'],
  'rating': 95,
  'mature_content': True,
  'short_bio': 'An online and local party game of teamwork and betrayal for 4-10 players...in space!',
  'full_bio': "Play with 4-10 player online or via local WiFi as you attempt to prepare your spaceship for departure, but beware as one or more random players among the Crew are Impostors bent on killing everyone!Originally created as a party game, we recommend playing with friends at a LAN party or online using voice chat. Enjoy cross-platform play between Android

In [51]:
# Shared lemmatizer used by the description-processing helpers below.
lemmatizer = WordNetLemmatizer()

# NLTK's English stop words plus extra words this notebook also filters out.
stop_words = stopwords.words('english') + [
    'play', 'player', 'game', 'large', 'big', 'small', 'good', 'bad',
    'pc', 'android', 'ios', 'lot', 'wifi',
]

def clean_description(description):
    """Normalise a raw description string for keyword extraction.

    Every run of characters other than ASCII letters, digits, hyphen,
    underscore or apostrophe is replaced by a single space, and the result
    is lower-cased.

    Args:
        description: the raw description text (str).

    Returns:
        The cleaned, lower-cased string. Leading/trailing spaces produced by
        the substitution are kept.
    """
    # Raw string literal: '\-' inside a normal literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r"[^A-Za-z0-9\-_']+", ' ', description).lower()

"""

"""
def lemmatize_and_filter(description):
    """Split a cleaned description into lemmas and drop stop words.

    The description is split on whitespace, each token is lemmatized with the
    module-level ``lemmatizer``, and a token is discarded when either the
    token itself or its lemma appears in the module-level ``stop_words``.

    Args:
        description: cleaned description text (str).

    Returns:
        List of lemmas, in their original order, with stop words removed.
    """
    blocked = set(stop_words)  # constant-time membership checks
    kept = []
    for token in description.split():
        lemma = lemmatizer.lemmatize(token)
        # Check the surface form as well as the lemma: lemmatizing can turn a
        # stop word into a different string (e.g. 'was' -> 'wa') that would
        # otherwise get past a lemma-only check.
        if token in blocked or lemma in blocked:
            continue
        kept.append(lemma)
    return kept

"""
Extracts all 2-word keyphrases from a description in the form of adj-noun
Returns: the set of adj-noun keyphrases
"""
def create_noun_adj_list(word_lst):
    """Collect two-word phrases where an adjective is directly followed by a noun.

    Each word is lemmatized, the lemmas are POS-tagged with ``nltk.pos_tag``,
    and every adjacent pair tagged exactly ('JJ', 'NN') is joined with a space.

    Args:
        word_lst: sequence of word strings.

    Returns:
        Set of "adjective noun" phrase strings (empty when no pair matches).
    """
    tagged = nltk.pos_tag([lemmatizer.lemmatize(word) for word in word_lst])
    # Walk consecutive (word, tag) pairs; zip stops one short of the end, so
    # inputs with fewer than two words yield an empty set.
    return {
        ' '.join([first_word, second_word])
        for (first_word, first_tag), (second_word, second_tag) in zip(tagged, tagged[1:])
        if first_tag == 'JJ' and second_tag == 'NN'
    }

"""
Extracts 5 keyphrases and 20 keywords using the YAKE algorithm given a description input
Returns: set of keywords and keyphrases
"""
def yake_extraction(description, num_keyphrases=5, num_keywords=20,
                    language="en", deduplication_threshold=0.9):
    """Extract two-word keyphrases and single keywords from text with YAKE.

    Two ``yake.KeywordExtractor`` instances are run over the description, one
    with ``n=2`` and one with ``n=1``, and their results are merged.

    Args:
        description: text to extract from (str).
        num_keyphrases: ``top`` passed to the 2-gram extractor (default 5).
        num_keywords: ``top`` passed to the 1-gram extractor (default 20).
        language: ``lan`` passed to both extractors (default "en").
        deduplication_threshold: ``dedupLim`` passed to both extractors
            (default 0.9).

    Returns:
        Set of the extracted keyword/keyphrase strings; scores are discarded.
    """
    keyphrase_extractor = yake.KeywordExtractor(lan=language, n=2,
                                                dedupLim=deduplication_threshold,
                                                top=num_keyphrases, features=None)
    keyword_extractor = yake.KeywordExtractor(lan=language, n=1,
                                              dedupLim=deduplication_threshold,
                                              top=num_keywords, features=None)

    scored_terms = (keyphrase_extractor.extract_keywords(description)
                    + keyword_extractor.extract_keywords(description))

    # The result is an unordered set, so sorting by score beforehand would
    # have no effect; only the term (first element of each pair) is kept.
    return {item[0] for item in scored_terms}

In [63]:
"""
Creates the tf and idf dictionaries given an input dictionary of game_id->description 
Returns: (tf, idf) tuple where:
            tf = index of game_id->list of keywords
            idf = inverse index of keyword -> list of game_ids
"""
def create_keyword_tf_idf(id_to_desc_dict):
    """Build a forward and an inverted keyword index from game descriptions.

    For each description the text is cleaned, lemmatized and stop-word
    filtered; adjective-noun phrases and YAKE keywords are then extracted
    and merged into one keyword set per game.

    Args:
        id_to_desc_dict: mapping of game id -> raw description string.

    Returns:
        (tf, idf) tuple where
            tf  maps game id -> alphabetically sorted list of its keywords
            idf maps keyword -> list of game ids whose description produced it
    """
    tf = dict()
    idf = dict()
    for game, raw_desc in id_to_desc_dict.items():
        if not raw_desc:
            # Missing/empty description: record the game with no keywords
            # instead of failing the whole run on a None value.
            tf[game] = []
            continue

        filtered_lst = lemmatize_and_filter(clean_description(raw_desc))
        noun_adj_keyphrase = create_noun_adj_list(filtered_lst)
        yake_keywords = yake_extraction(' '.join(filtered_lst))

        # Sort so the stored lists (and the key order of idf) are the same on
        # every run; iterating a set of strings directly is not.
        all_keywords = sorted(noun_adj_keyphrase.union(yake_keywords))

        tf[game] = all_keywords
        for word in all_keywords:
            idf.setdefault(word, []).append(game)

    return tf, idf

In [72]:
# Full description lookup: every available game id -> its 'full_bio' text.
# id_to_info is keyed by string ids, hence the str() conversion.
available_id_to_desc = {
    game: id_to_info[str(game)]['full_bio'] for game in available_games
}

# Same lookup restricted to the first 100 available games.
available_id_to_desc_subsest = {
    game: id_to_info[str(game)]['full_bio'] for game in available_games[:100]
}

In [73]:
# Build both keyword indexes over the descriptions of all available games:
# keyword_tf maps game id -> keywords, keyword_idf maps keyword -> game ids.
keyword_tf, keyword_idf = create_keyword_tf_idf(available_id_to_desc)

In [76]:
# Persist both keyword indexes as JSON files in the data directory.
out_dir = json_dir
for file_name, payload in (('game_description_tf.json', keyword_tf),
                           ('game_description_idf.json', keyword_idf)):
    with open(os.path.join(out_dir, file_name), 'w') as json_file:
        json.dump(payload, json_file)

In [80]:
# Report how many distinct keywords were indexed and show a small sample,
# instead of printing every key of the inverted index.
print(len(keyword_idf))
print(list(keyword_idf)[:20])

