In [1]:
import json
import re
import numpy as np
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from collections import defaultdict

In [2]:
pattern = re.compile(r'[\w,.:\']+')

In [3]:
with open("../data/movie_data/movie_id_to_info.json") as f:
    movie_info = json.load(f) # dict of movie_id to all its information
dict_moviename_to_id = {movie_info[x]['name']:x for x in movie_info} 
# dict of movie name to movie id (original id like "0814255", "1017776-rocky")

In [4]:
movie_names = [movie_info[x]['name'] for x in movie_info] # all original movie names
movie_names_lower = [x.lower() for x in movie_names] # all movie names to the lowercase for further process
print(len(movie_names))
dict_moviename_to_idx = {x:i for i,x in enumerate(movie_names_lower)} # dict of movie name to index (integer)
dict_movieidx_to_id = {i:dict_moviename_to_id[movie_names[i]] for i in range(len(movie_names))}

17712


In [5]:
# Testing the dicts and conversions
print(1, movie_names[1], dict_movieidx_to_id[1], sep=', ')

1, Please Give, 0878835


In [6]:
movie_names_tokens = [re.findall(pattern, s) for s in movie_names_lower] # all movie names tokenized
movie_names_tokens = [list(set(s)) for s in movie_names_tokens] # deduplication

In [7]:
mvt_inv_index = defaultdict(list) # movie_names_tokens_inverted_index

for i in range(len(movie_names_tokens)):
    for term in movie_names_tokens[i]:
        if term not in ENGLISH_STOP_WORDS:
            mvt_inv_index[term].append(i)
            
mvt_token_list = list(mvt_inv_index.keys()) # list of all tokens occurring in the movie name
print(len(mvt_token_list))

13889


## Selecting the most similar movie names in the database based on what the user types in as the movie name:

In [8]:
def Extract_Backup_Movie(query, mvt_inv_index):
    """
    extract the indices of movie with at least one same word as the query
    query: name of a movie
    """
    ret = []
    qtokens = list(set(re.findall(pattern, query.lower())))
    for term in qtokens:
        if term in mvt_token_list:
            ret.extend(mvt_inv_index[term])
    return list(set(ret))

def Edit_Distance(query, mv_name):
    """
    return the edit distance between query and mv_name
    both are sorted alphabetically in terms of the tokens before computation
    """
    qtokens = list(set(sorted(re.findall(pattern, query.lower()))))
    mvtokens = list(set(sorted(re.findall(pattern, mv_name.lower()))))
    re_query = ''.join(qtokens)
    re_mv = ''.join(mvtokens)
    x = len(re_query)
    y = len(re_mv)
    
    mat = np.zeros((x+1, y+1))
    mat[0] = list(range(y+1))
    mat[:,0] = list(range(x+1))
    
    for i in range(1, x+1):
        for j in range(1, y+1):
            left = mat[i, j-1] + 1
            up = mat[i-1, j] + 1
            sub = mat[i-1, j-1] + (2 if re_query[i-1] != re_mv[j-1] else 0)
            arr = np.array([left, up, sub])
            mat[i, j] = np.min(arr)
    return mat[x, y]
            
def Most_Similar_Movies(query, mvt_inv_index=mvt_inv_index, movie_names=movie_names, k=5):
    """
    select the backup movies with at least one identical token
    return the kth movie (idx, score, mv_name) with smallest edit distance
    """
    backup_movie_idx = Extract_Backup_Movie(query, mvt_inv_index)
    scores = []
    for i in backup_movie_idx:
        score = Edit_Distance(query, movie_names[i])
        scores.append((i, score, movie_names[i]))
    scores.sort(key=lambda x:x[1])
    return scores[:k]

In [9]:
Edit_Distance('try it', 'one trial')

7.0

In [10]:
Most_Similar_Movies("rescue hostage")

[(1464, 6.0, 'Hostage'),
 (12438, 11.0, 'Rescue Dawn'),
 (6734, 17.0, 'Free Willy 3: The Rescue'),
 (11880, 19.0, 'Planes: Fire And Rescue'),
 (3970, 20.0, 'Brave Little Toaster to the Rescue')]

## Movie-Game Matching

In [13]:
with open("../data/steamData/80k_data/inv_game_genre.json") as f:
    game_genre_inv_idx = json.load(f)
with open("../data/steamData/80k_data/inv_game_tags.json") as f:
    game_tags_inv_idx = json.load(f)
with open("../data/movie_data/inv_movie_genre.json") as f:
    movie_genre_inv_idx = json.load(f)
with open("../data/steamData/80k_data/inv_game_num_players.json") as f:
    num_players_inv_idx = json.load(f)
with open("../data/movie_data/movie_desc_keywords.json") as f:
    movie_keywords = json.load(f)
with open("../data/reviews/inverse_keyword_phrases.json") as f:
    game_review_inv_idx = json.load(f)
for x in game_review_inv_idx:
    game_review_inv_idx[x] = [int(e) for e in game_review_inv_idx[x]]
    
with open("../data/steamData/80k_data/inv_game_desc_keywords.json") as f:
    game_desc_inv_idx = json.load(f)
with open("../data/steamData/80k_data/id_to_info.json") as f:
    game_info = json.load(f)
dict_game_id_to_name = {int(x):game_info[x]['name'] for x in game_info}
dict_game_id_to_rating = {int(x):game_info[x]['rating'] for x in game_info}

FileNotFoundError: [Errno 2] No such file or directory: '../data/steamData/80k_data/id_to_info.json'

In [14]:
# combine the keywords from the description and review with deduplication
def Combine_Dict(dict1, dict2):
    newdict = dict1.copy()
    for key in dict2.keys():
        if key in newdict:
            newdict[key] = list(set(newdict[key]+dict2[key]))
        else:
            newdict[key] = [x for x in dict2[key]]
    return newdict

game_kw_inv_idx = Combine_Dict(game_review_inv_idx, game_desc_inv_idx) 
game_kw = list(game_kw_inv_idx.keys())

In [15]:
Game_genres = list(game_genre_inv_idx.keys())
Game_genres_extension = ['survival', 'fps', 'puzzle', 'party game', 'casual', 'wargame', 'tower defense']
all_game_genres = Game_genres + Game_genres_extension

In [16]:
Game_genres

['mmo',
 'strategy',
 'action',
 'indie',
 'sports',
 'party-game',
 'role-playing',
 'simulation',
 'open-world',
 'adventure',
 'boardgame']

In [17]:
# mapping game id to the number of genres the game has
dict_game_genre_count = defaultdict(int)
for genre in all_game_genres:
    if genre in Game_genres:
        for entry in game_genre_inv_idx[genre]:
            dict_game_genre_count[entry] += 1
    else:
        for entry in game_tags_inv_idx[genre]:
            dict_game_genre_count[entry] += 1

In [17]:
# mapping game id to the number of keywords the game has (decription + review)
dict_game_kw_count = defaultdict(int)
for kw in game_kw_inv_idx:
    for entry in game_kw_inv_idx[kw]:
        dict_game_kw_count[entry] += 1

In [18]:
def Generic_Game_Filtering(players=None, genres=None, mv_idx=None, free_query=None, k=15):
    
    scores = defaultdict(int)
    # filtering based on the 'single','multi' button, or unspecified/both, filtering out the range
    if players == 'single':
        num_pool = num_players_inv_idx['single-player']
    # players == 'multi'
    elif players == 'multi':
        num_pool = num_players_inv_idx['multi-player']
    for entry in num_pool:
        num_pool = None
    # not selected
    
    # filtering based on selected genres, scored for each genre matching
    if genres:
        for genre in genres:
            if genre in Game_genres:
                genre_pool = game_genre_inv_idx[genre]
            # gere in extensions
            else:
                genre_pool = game_tags_inv_idx[genre]
            for entry in genre_pool:
                if (num_pool and entry in num_pool) or (not num_pool):
                    scores[entry] += 10 / dict_game_genre_count[entry]
    
    # filtering based on the exact movie name in the database, selected above
    # using the movie name + genre + keywords from the description as keywords
    # each keyword is scored 0.1
    if mv_idx:
        mv_id = dict_movieidx_to_id[mv_idx]
        mv_keywords = movie_keywords[mv_id]
        mv_name = movie_names_lower[mv_idx]
        mv_genre = movie_info[mv_id]['genre']

        mv_keywords += re.findall(pattern, mv_name)
        mv_keywords += [y for x in mv_genre for y in re.findall(pattern, x.lower()) ]
        mv_keywords = list(set(mv_keywords))

        for term in mv_keywords:
            if term in game_kw:
                for entry in game_kw_inv_idx[term]:
                    if (num_pool and entry in num_pool) or (not num_pool):
                        scores[entry] += 80 / dict_game_kw_count[entry]
    
    # filtering on free query, tokenized as keywords, each scored 0.5
    if free_query:
        qtokens = free_query.lower().split()
        for term in qtokens:
            if term in game_kw:
                for entry in game_kw_inv_idx[term]:
                    if (num_pool and entry in num_pool) or (not num_pool):
                        scores[entry] += 150 / dict_game_kw_count[entry]
                        
    # factoring in game rating(0-100)
    for x in scores:
         scores[x] += dict_game_id_to_rating[x] / 50
        
    scores_tuple_lst = [(num, dict_game_id_to_name[num], scores[num]) for num in scores]
    scores_tuple_lst.sort(key=lambda x:x[2])
    scores_tuple_lst.reverse()
    return scores_tuple_lst[:k]
    
# movie = "titanic"
Generic_Game_Filtering(players='single', genres=['action', 'role-playing'], mv_idx=12940,
                       free_query='forest monster')

[(627270, 'Injustice 2', 13.493121693121697),
 (550080, 'VALKYRIE DRIVE -BHIKKHUNI-', 13.288627450980393),
 (925750,
  'Dead by Daylight - Shattered Bloodline Chapter',
  12.916470588235294),
 (353370, 'Steam Controller', 12.75),
 (976310, 'Mortal Kombat11', 12.719540229885059),
 (222940, 'THE KING OF FIGHTERS XIII STEAM EDITION', 12.650593607305934),
 (311730, 'DEAD OR ALIVE 5 Last Round: Core Fighters', 12.483977900552487),
 (228180, 'Action - Gameplay Recording and Streaming', 12.41868995633188),
 (1167450, 'DAEMON X MACHINA', 11.987692307692308),
 (1242753,
  "Monster Hunter World: Iceborne - Pendant: Strollin' Paolumu",
  11.98),
 (222440, 'THE KING OF FIGHTERS 2002 UNLIMITED MATCH', 11.98),
 (323130, 'Half-Life Soundtrack', 11.96),
 (920566, "Resident Evil 2 - Claire Costume: 98'", 11.96),
 (1223860, 'Smell of the Game (NEW GUILTY GEAR Promotion Music)', 11.96),
 (1242759,
  'Monster Hunter World: Iceborne - Pendant: Rainbow Balloons',
  11.96)]

In [19]:
dict_moviename_to_idx

{'percy jackson & the olympians: the lightning thief': 0,
 'please give': 1,
 '10': 2,
 '12 angry men (twelve angry men)': 3,
 '20,000 leagues under the sea': 4,
 '10,000 b.c.': 5,
 'the 39 steps': 6,
 '3:10 to yuma': 1966,
 'charly (a heartbeat away)': 8,
 'abraham lincoln': 9,
 'dark water': 5178,
 'the accused': 11,
 'the lost city': 12,
 'the breaking point': 13,
 "adam's rib": 14,
 'the bridge of san luis rey': 15,
 'the prowler (cost of living )': 16,
 'criminal': 4978,
 'the adventures of mark twain': 18,
 'deep blue': 19,
 'the adventures of robin hood': 20,
 'man hunt': 21,
 'dead end': 5254,
 'bandwagon': 23,
 'whore': 24,
 'tristan & isolde': 25,
 'wild side': 26,
 'a tale of two pizzas': 27,
 'running scared': 706,
 'going the distance': 271,
 'ultraviolet': 30,
 'home of the brave': 7832,
 'arthur and the invisibles (arthur and the minimoys)': 32,
 'malevolence': 33,
 'eternal': 34,
 'a farewell to arms': 6303,
 'the narrow margin': 36,
 'the matador': 181,
 'smile': 38,
 