In [132]:
import json
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer

nltk.download('wordnet_ic')
brown_ic = wordnet_ic.ic('ic-brown.dat')
from itertools import product

import os
import itertools
import re

from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

import numpy as np

[nltk_data] Downloading package wordnet_ic to
[nltk_data]     /Users/viviancai/nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!


# Match game with game keywords

In [37]:

with open('keyphrases_and_keywords.json', 'r', encoding='utf8') as in_json_file:
    common = json.load(in_json_file)


with open('keywords_count_dict.json', 'r', encoding='utf8') as in_json_file:
    word_count = json.load(in_json_file)

with open('../movieData/inv_movie_genre.json', 'r', encoding='utf8') as aa:
    aa = json

In [200]:
def clean_syns(syn):
    syn = syn.lower()
    cleaned = re.sub('[^A-Za-z]+', ' ', syn).strip()
    if cleaned!=syn:
        return []
    split = cleaned.split()
    split = list(filter(lambda x: len(x)>2, split))
    return split

def get_syns(word):
    res = set()
    syns = wn.synsets(word, pos=wn.NOUN)
    syns.extend(wn.synsets(word, pos=wn.ADJ))
    for sense in syns:
        if sense.pos()=='s':
            sense = sense.similar_tos()[0]
#         print(sense.lemma_names())
        cleaned_syns = list(map(clean_syns, sense.lemma_names()))
        cleaned_syns = list(itertools.chain(*cleaned_syns))
        res.update(cleaned_syns)
        
#         # Derived forms
#         derived = [n.name() for l in sense.lemmas() for n in l.derivationally_related_forms()]
#         derived = list(map(clean_syns, derived))
#         derived = list(itertools.chain(*derived))
#         res.update()
    if word in res:
        res.remove(word)
    return list(res)

    
def should_keep(word):
    if word not in word_count.keys():
        return False
    
    (word,pos) = nltk.pos_tag([word])[0]
    if pos[:2]=='NN' or pos[:2]=='JJ':
        return True
    return False

def add_weight(d):
    for word in list(d):
        count = d[word]
        (word,pos) = nltk.pos_tag([word])[0]
        if pos[:2]=='NN':
            d[word] = d[word]*2
            pos = 'n'
        elif pos[:2]=='JJ':
            d[word] = d[word]*1.5
            pos = 'a'
        
        derived_set = set()
        
        syns = wn.synsets(word, pos=pos)
        if pos=='n':
            for sense in syns:
                if sense.pos() == 's':
                    sense = sense.similar_tos()[0]
                derived = [n.name() for l in sense.lemmas() for n in l.derivationally_related_forms()]
                derived = list(map(clean_syns, derived))
                derived = list(itertools.chain(*derived))
                derived = list(filter(lambda x: nltk.pos_tag([x])[0][1][:2]=='JJ', derived))                
                derived_set.update(derived)

            for derived in derived_set:
                if derived not in d.keys():
                    d[derived] = d[word]*0.9
    return d

def process_info(info):
    lst = list(info['keywords'])
    lst = list(filter(should_keep, lst))
    
    ctr = add_weight(dict(Counter(lst)))
#     print(ctr)
    return ctr
    
    
# keyword -> list of keywords and their synonyms
# keyphrase -> break down to words and for each word, check if its synonym in word_to_synphrase

In [196]:
info_path = 'info/'

appid_to_vec = dict()
count = 0
for filename in os.listdir(info_path):
    if filename.endswith('.json'):
        with open(info_path+filename, 'r', encoding='utf8') as in_json_file:
            info = json.load(in_json_file)
            app_id = info['app_id']
            appid_to_vec[app_id] = {'app_id':app_id, 'vector':process_info(info)}
    
    count+=1
    if count%100==0:
        print(count)
    if count == 1:
        print(appid_to_vec)
#         break


{'597220': {'app_id': '597220', 'vector': {'actual': 1.5, 'solid': 1.5, 'beautiful': 2, 'masterpiece': 2, 'funny': 2, 'sale': 2, 'witty': 2, 'dlc': 2, 'class': 2, 'west': 2, 'text': 2, 'achievement': 2, 'puzzle': 2, 'list': 2, 'pun': 2, 'skill': 2, 'simplistic': 1.5, 'mechanic': 2, 'sort': 2, 'hilarious': 1.5, 'joke': 2, 'skeleton': 2, 'horse': 2, 'video': 2, 'free': 1.5, 'goofy': 2, 'choice': 2, 'comedy': 2, 'laugh': 2, 'replayability': 2, 'kingdom': 2, 'location': 2, 'element': 2, 'adventure': 2, 'dialogue': 2, 'fantastic': 1.5, 'humor': 2, 'price': 2, 'multiple': 2, 'combat': 2, 'level': 2, 'casual': 1.5, 'short': 1.5, 'number': 2, 'devs': 2, 'average': 1.5, 'type': 2, 'single': 1.5, 'general': 1.5, 'story': 2, 'white': 1.5, 'stop': 2, 'western': 1.5, 'meat': 2, 'wild': 2, 'talk': 2, 'cow': 2, 'fight': 2, 'graphic': 1.5, 'quest': 2, 'black': 1.5, 'money': 2, 'rpg': 2, 'weird': 2, 'light': 2, 'web': 2, 'bean': 2, 'wacky': 2, 'content': 2, 'hard': 1.5, 'art': 2, 'clever': 2, 'figure':

In [228]:
with open('phrase_word_to_synphrase.json', 'r', encoding='utf8') as in_json_file:
    word_to_synphrase = json.load(in_json_file)

with open('inverse_keyword_phrases.json', 'r', encoding='utf8') as in_json_file:
    inverse = json.load(in_json_file)



In [235]:
# input_tags = ['adventure','great story','graphic','open world','soundtrack']
# input_tags = ['romance','love story','graphic','open world','soundtrack']
input_tags = ['game', 'world-war', 'geralt canon', 'open world', 'friends']

def clean_input(tags):
    lemmatizer = WordNetLemmatizer()
    def helper(tag):
        tag = tag.lower()
        tag = re.sub('[^A-Za-z]+', ' ', tag).strip()
        if len(tag.split())==1:
            (tag,pos) = nltk.pos_tag([tag])[0]
            if pos == 'n' or pos == 'a':
                tag = lemmatizer.lemmatize(tag, pos)
            else:
                tag = lemmatizer.lemmatize(tag, 'n')
        return tag
    tags = list(map(helper, tags))
    
    stopwords = ['game','play','video']
    tags = list(filter(lambda x:x not in stopwords, tags))
    return tags
    

def process_input_match_phrase(tags):
    res = dict()
    
    phrases = list(filter(lambda x: len(x.split())>1, tags))
    
    words = list(map(lambda x: x.split(), input_tags))
    words = list(itertools.chain(*tags))
    
    word_to_tag = dict()
    for word in words:
        for tag in tags:
            if word in tag:
                t = word_to_tag.get(word, list())
                t.append(tag)
                word_to_tag[word] = t
    
    for phrase in phrases:
        if phrase in common['keyphrases']:
            res[phrase] = 3
    
    phrases.remove(phrase)
    
    for word in words:
        (word,pos) = nltk.pos_tag([word])[0]
        if pos=='n':
            synphrases = word_to_synphrase.get(word, list())
            for synphrase in synphrases:
                res[word_to_tag[word]] = 1.5
    
    game_to_score = dict()
    game_to_phrases = dict()
    for k in res.keys():
        games = inverse.get(k, list())
        for game in games:
            score = game_to_score.get(game, 0)
            score += res[k]
            phrases = game_to_phrases.get(game, set())
            phrases.add(k)
            game_to_score[game] = score
            game_to_phrases[game] = phrases
    
    final = {game: (game_to_score[game], game_to_phrases[game]) for game in game_to_score.keys()}
    final = [(game, score_phrases) for (game, score_phrases) in final.items()]
    final = sorted(final, key=lambda x:(-int(x[1][0]), -len(x[1][1]), x[0]))
    return final

input_tags = clean_input(input_tags)
print(process_input_match_phrase(input_tags))
    
    
    
def process_input(tags):
    tags = list(map(lambda x: x.split(), input_tags))
    tags = list(itertools.chain(*tags))
    
    # Get syns for each tag
    tag_to_syns = dict()
    for tag in tags:
        syns = tag_to_syns.get(tag, list())
        syns.extend(get_syns(tag))
        tag_to_syns[tag] = syns
    
    # Get all the words to put in the final vector
    allwords = list(tags)
    for syns in tag_to_syns.values():
        allwords.extend(syns)
    
    ctr = Counter(allwords)
    ctr = dict(ctr)
    
    # Adjust weight
    for word in set(tags):
        count = ctr[word]
        (word,pos) = nltk.pos_tag([word])[0]
        if pos[:2]=='NN':
            if word in tags:
                ctr[word] = ctr[word]*2
            else:
                ctr[word] = ctr[word]*1.75
            pos = 'n'
        elif pos[:2]=='JJ':
            if word in tags:
                ctr[word] = ctr[word]*1.75
            else:
                ctr[word] = ctr[word]*1.5
            pos = 'a'
        else:
            continue
        
        derived_set = set()
        
        syns = wn.synsets(word, pos=pos)
        if pos=='n':
            for sense in syns:
                derived = [n.name() for l in sense.lemmas() for n in l.derivationally_related_forms()]
                derived = list(map(clean_syns, derived))
                derived = list(itertools.chain(*derived))
                derived = list(filter(lambda x: nltk.pos_tag([x])[0][1][:2]=='JJ', derived))                
                derived_set.update(derived)

            for derived in derived_set:
                if derived not in ctr.keys():
                    ctr[derived] = ctr[word]*0.9
    

    return ctr
    
input_vector = process_input(input_tags)
input_vector

# lemmatizer = WordNetLemmatizer()
# lemmatizer.lemmatize("romantic")

# stemmer  = SnowballStemmer('english')
# stemmer.stem('romantic')

# stemmer = PorterStemmer()
# stemmer.stem('romance')

[('226700', (6, {'open world', 'world war'})), ('312660', (6, {'open world', 'world war'})), ('378610', (6, {'open world', 'world war'})), ('50130', (6, {'open world', 'world war'})), ('10090', (3, {'world war'})), ('10150', (3, {'open world'})), ('102500', (3, {'open world'})), ('1030830', (3, {'open world'})), ('1030840', (3, {'open world'})), ('1055540', (3, {'open world'})), ('105600', (3, {'open world'})), ('1056640', (3, {'open world'})), ('1056960', (3, {'open world'})), ('107410', (3, {'open world'})), ('1085660', (3, {'open world'})), ('108710', (3, {'open world'})), ('108800', (3, {'open world'})), ('1097840', (3, {'open world'})), ('110800', (3, {'open world'})), ('1116960', (3, {'open world'})), ('1121560', (3, {'open world'})), ('1128000', (3, {'open world'})), ('113400', (3, {'open world'})), ('11450', (3, {'open world'})), ('1151340', (3, {'open world'})), ('1151640', (3, {'open world'})), ('115320', (3, {'open world'})), ('1172380', (3, {'open world'})), ('1172620', (3,

{'world': 4,
 'war': 2,
 'geralt': 2,
 'canon': 2,
 'open': 1.75,
 'friend': 2,
 'reality': 2,
 'cosmos': 2,
 'macrocosm': 2,
 'international': 2,
 'earth': 2,
 'humans': 2,
 'man': 2,
 'domain': 2,
 'globe': 2,
 'humankind': 2,
 'public': 3,
 'creation': 2,
 'populace': 2,
 'universe': 2,
 'mankind': 2,
 'humanity': 2,
 'existence': 2,
 'warfare': 1,
 'canyon': 1,
 'outdoors': 1,
 'overt': 1,
 'unrestricted': 1,
 'harsh': 1,
 'unfastened': 1,
 'unsettled': 1,
 'active': 1,
 'unobstructed': 1,
 'artless': 1,
 'surface': 1,
 'clear': 1,
 'unprotected': 1,
 'vulnerable': 1,
 'available': 1,
 'unsealed': 1,
 'unconstricted': 1,
 'unenclosed': 1,
 'nonunion': 1,
 'opened': 1,
 'ingenuous': 1,
 'coarse': 1,
 'receptive': 1,
 'susceptible': 1,
 'quaker': 1,
 'supporter': 1,
 'acquaintance': 1,
 'admirer': 1,
 'champion': 1,
 'protagonist': 1,
 'booster': 1,
 'ally': 1,
 'global': 3.6,
 'canonical': 1.8}

In [205]:
def cos_sim(d1, d2):
    intersect = set(d1.keys()).intersection(set(d2.keys()))
    score = sum(d1[k]*d2[k] for k in intersect)
    return (score, intersect)

cos_sim(input_vector, appid_to_vec['47810']['vector'])
# print(input_vector)
# print(appid_to_vec['47810']['vector'])

res = list(map(lambda x: (x['app_id'], cos_sim(input_vector, x['vector'])), appid_to_vec.values()))
res = sorted(res, key=lambda x:(-x[1][0], x[0]))
res

[('241930', (13.24, {'earth', 'global', 'reality', 'universe', 'war'})),
 ('246620', (11.24, {'creation', 'earth', 'global', 'humanity', 'reality'})),
 ('34900', (11.24, {'global', 'humanity', 'reality', 'war'})),
 ('356190', (11.24, {'earth', 'global', 'universe', 'war'})),
 ('394510', (11.24, {'earth', 'global', 'universe', 'war'})),
 ('65980', (11.24, {'earth', 'global', 'humanity', 'war'})),
 ('792220', (10.7, {'earth', 'global', 'universe', 'war'})),
 ('589290', (10, {'existence', 'public', 'war', 'warfare'})),
 ('1007040', (9.24, {'earth', 'global', 'war'})),
 ('102600', (9.24, {'earth', 'global', 'war'})),
 ('200510', (9.24, {'earth', 'global', 'war'})),
 ('214510', (9.24, {'earth', 'global', 'war'})),
 ('238010', (9.24, {'earth', 'global', 'mankind', 'universe'})),
 ('260230', (9.24, {'earth', 'global', 'war'})),
 ('287450', (9.24, {'earth', 'global', 'war'})),
 ('323470', (9.24, {'creation', 'earth', 'global', 'universe'})),
 ('50620', (9.24, {'earth', 'global', 'war'})),
 ('8

# Match keyphrase directly

In [207]:
with open('phrase_word_to_synphrase.json', 'r', encoding='utf8') as in_json_file:
    word_to_synphrase = json.load(in_json_file)


In [211]:
# input_tags = ['adventure','great story','graphic','open world','soundtrack']
# input_tags = ['romance','love story','graphic','open world','soundtrack']
input_tags = ['game', 'world war', 'great story', 'open world']

input_phrase = list(filter(lambda x: len(x.split())>1, input_tags))

input_phrase

['world war', 'great story']