In [1]:
import nltk
import os
import pandas as pd
import numpy as np
import json
import re

In [2]:
TMT_MOVIE_PATH = '/Users/susan/Development/cs4300sp2021-cw887-qh75-rz92-yc687-yl698/data/movie_data/rotten_tomatoes_movies.csv'

with open(TMT_MOVIE_PATH, 'r') as f:
    movie_df = pd.read_csv(f)

In [4]:
movie_df.columns

Index(['rotten_tomatoes_link', 'movie_title', 'movie_info',
       'critics_consensus', 'content_rating', 'genres', 'directors', 'authors',
       'actors', 'original_release_date', 'streaming_release_date', 'runtime',
       'production_company', 'tomatometer_status', 'tomatometer_rating',
       'tomatometer_count', 'audience_status', 'audience_rating',
       'audience_count', 'tomatometer_top_critics_count',
       'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count'],
      dtype='object')

In [94]:
movie_df[movie_df['movie_title'] == 'The LEGO Movie']

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
15365,m/the_lego_movie,The LEGO Movie,"Emmet (Chris Pratt), an ordinary LEGO figurine...","Boasting beautiful animation, a charming voice...",PG,"Action & Adventure, Animation, Comedy, Kids & ...","Phil Lord, Christopher Miller, Chris McKay","Dan Hageman, Roy Lee, Christopher Miller, Phil...","Chris Pratt, Elizabeth Banks, Will Arnett, Mor...",2014-02-07,...,Warner Bros. Pictures,Certified-Fresh,96.0,250.0,Upright,87.0,222764.0,51,240,11


## Get StopWords

In [172]:
all_titles = list(set(movie_df['movie_title']))

In [173]:
# stem titles
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

def lemmatize_titles(titles_lst):
    lemmatizer = WordNetLemmatizer()
    lemma_res = []
#     stemmer = PorterStemmer()
#     stem_res = []
    for w in titles_lst:
        token_lst1 = list(map(lambda x: lemmatizer.lemmatize(x), word_tokenize(w.lower())))
#         token_lst2 = list(map(lambda x: stemmer.stem(x), word_tokenize(w.lower())))
        lemma_res.append(" ".join(token_lst1))
#         stem_res.append(" ".join(token_lst2))
    return lemma_res
    
lemmatize_titles(['Super Heros', "She is prettier", 'Worlds', 'Star Wars'])

['super hero', 'she is prettier', 'world', 'star war']

In [174]:
lemmatize_title('Super Heros')

'super hero'

In [175]:
all_lemma_titles = lemmatize_titles(all_titles)

In [176]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.
    
    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) -> 
    [('python', 2),
     ('world', 2),
     ('love', 2),
     ('hello', 1),
     ('is', 1),
     ('programming', 1),
     ('the', 1),
     ('language', 1)]
    """
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    
    top_words = nltk.pos_tag([val[0] for val in words_freq])
    
    if not n:
        return top_words
    return top_words[:n]

In [177]:
top_words = get_top_n_words(all_lemma_titles)
print(top_words[:100])

[('the', 'DT'), ('of', 'IN'), ('and', 'CC'), ('in', 'IN'), ('to', 'TO'), ('man', 'NN'), ('love', 'VB'), ('on', 'IN'), ('for', 'IN'), ('my', 'PRP$'), ('life', 'NN'), ('you', 'PRP'), ('day', 'NN'), ('night', 'NN'), ('story', 'NN'), ('girl', 'NN'), ('last', 'JJ'), ('me', 'PRP'), ('with', 'IN'), ('la', 'NN'), ('from', 'IN'), ('is', 'VBZ'), ('it', 'PRP'), ('movie', 'NN'), ('dead', 'JJ'), ('world', 'NN'), ('boy', 'NN'), ('war', 'NN'), ('time', 'NN'), ('de', 'IN'), ('all', 'DT'), ('big', 'JJ'), ('black', 'JJ'), ('one', 'CD'), ('king', 'NN'), ('no', 'DT'), ('ii', 'JJ'), ('american', 'JJ'), ('house', 'NN'), ('le', 'VBD'), ('little', 'JJ'), ('at', 'IN'), ('do', 'VBP'), ('dark', 'JJ'), ('blue', 'VB'), ('white', 'JJ'), ('men', 'NNS'), ('red', 'VBD'), ('an', 'DT'), ('blood', 'NN'), ('woman', 'NN'), ('we', 'PRP'), ('city', 'NN'), ('death', 'NN'), ('up', 'IN'), ('new', 'JJ'), ('good', 'JJ'), ('out', 'RP'), ('who', 'WP'), ('heart', 'NN'), ('lost', 'VBD'), ('go', 'VB'), ('god', 'JJ'), ('dog', 'NN'), ('

In [178]:
def get_stop_words(tagged_top_words, add_all_till=0):
    stop_types = ['DT', 'IN', 'CC', 'TO', 'PRP$', 'PRP', 'WP']
    stop_words = []
    idx = 0
    while idx < len(tagged_top_words):
        word = tagged_top_words[idx][0]
        word_type = tagged_top_words[idx][1]
        if idx < add_all_till:
            stop_words.append(word)
        else:
            if word_type in stop_types:
                stop_words.append(word)      
        idx += 1
#     stop_words = ['story', 'movie','time', 'ii', 'iii', 'iv', 'v', 'le', 'de', 'days', 'man', 'world', 'girl', 'one', 'two']
#     stop_words = [w[0] for w in tagged_top_words if w[1] not in stop_types and w[0] not in stop_words]
    return stop_words

stop_words = get_stop_words(top_words[:700], 30)
print(stop_words)

['the', 'of', 'and', 'in', 'to', 'man', 'love', 'on', 'for', 'my', 'life', 'you', 'day', 'night', 'story', 'girl', 'last', 'me', 'with', 'la', 'from', 'is', 'it', 'movie', 'dead', 'world', 'boy', 'war', 'time', 'de', 'all', 'no', 'at', 'an', 'we', 'up', 'who', 'your', 'by', 'what', 'that', 'this', 'about', 'like', 'angel', 'after', 'beyond', 'under', 'our', 'before', 'or', 'she', 'into', 'they', 'over', 'bride', 'another', 'her', 'so', 'his', 'without', 'if', 'he', 'mrs', 'than', 'blind', 'through', 'between', 'some', 'upon', 'behind', 'them', 'within', 'but', 'every']


In [179]:
for w in top_words:
    if len(w[0]) == 2:
        print(w)

('of', 'IN')
('in', 'IN')
('to', 'TO')
('on', 'IN')
('my', 'PRP$')
('me', 'PRP')
('la', 'NN')
('is', 'VBZ')
('it', 'PRP')
('de', 'IN')
('no', 'DT')
('ii', 'JJ')
('le', 'VBD')
('at', 'IN')
('do', 'VBP')
('an', 'DT')
('we', 'PRP')
('up', 'IN')
('go', 'VB')
('by', 'IN')
('mr', 'FW')
('be', 'VB')
('am', 'VBP')
('el', 'VBP')
('vs', 'NN')
('or', 'CC')
('dr', 'JJ')
('wa', 'JJ')
('re', 'NN')
('so', 'IN')
('du', 'NN')
('di', 'RB')
('da', 'NN')
('un', 'JJ')
('il', 'RB')
('if', 'IN')
('13', 'CD')
('he', 'PRP')
('3d', 'CD')
('ca', 'MD')
('10', 'CD')
('ha', 'NN')
('st', 'NN')
('et', 'VBP')
('ll', 'VBP')
('11', 'CD')
('en', 'FW')
('wo', 'MD')
('au', 'JJ')
('24', 'CD')
('si', 'JJ')
('20', 'CD')
('iv', 'NN')
('30', 'CD')
('12', 'CD')
('ma', 'VBN')
('ve', 'NN')
('oh', 'JJ')
('ho', 'JJ')
('mi', 'NN')
('ne', 'JJ')
('na', 'NNS')
('fu', 'NN')
('pa', 'NN')
('50', 'CD')
('17', 'CD')
('li', 'NN')
('tu', 'NN')
('ta', 'NN')
('te', 'NN')
('ip', 'JJ')
('22', 'CD')
('ai', 'NN')
('21', 'CD')
('ex', 'NN')
('16', 'CD

In [180]:
stop_words =  list(filter(lambda x: x not in ['blind', 'angel', 'bride'], stop_words))
stop_words.append("s")
stop_words.append("ll")
print(stop_words)

['the', 'of', 'and', 'in', 'to', 'man', 'love', 'on', 'for', 'my', 'life', 'you', 'day', 'night', 'story', 'girl', 'last', 'me', 'with', 'la', 'from', 'is', 'it', 'movie', 'dead', 'world', 'boy', 'war', 'time', 'de', 'all', 'no', 'at', 'an', 'we', 'up', 'who', 'your', 'by', 'what', 'that', 'this', 'about', 'like', 'after', 'beyond', 'under', 'our', 'before', 'or', 'she', 'into', 'they', 'over', 'another', 'her', 'so', 'his', 'without', 'if', 'he', 'mrs', 'than', 'through', 'between', 'some', 'upon', 'behind', 'them', 'within', 'but', 'every', 's', 'll']


# Filter Games and Movies with Stop Words

In [309]:
def filter_title(title, stop_words=stop_words):
    filtered_title = re.sub('\W+',' ', title.lower()).strip().split(" ")
    
    lemmatizer = WordNetLemmatizer()
    tokens = list(filter(lambda x: x not in stop_words, filtered_title))
    token_lst1 = list(map(lambda x: lemmatizer.lemmatize(x), tokens))
    print(f'filtered_title = {filtered_title}')
    print(f'tokens = {tokens}')
    print(f'lemmatizer.lemmatize(x) = {token_lst1}')
    return " ".join(token_lst1)

In [317]:
# stemmer = PorterStemmer()
# lemmatizer = WordNetLemmatizer()
# stemmer.stem('flowers'), lemmatizer.lemmatize('flowers')

In [197]:
filtered_title_dict = dict() # <rotten_path, filtered_name>  
inv_filtered_title_dict = dict() # <filtered_name, rotten_path>

count = 0
for idx, row in movie_df.iterrows():
    if type(row['rotten_tomatoes_link']) is str:
        rotten_tmt = row['rotten_tomatoes_link'].split('/')[1]
        filtered_title = filter_title(row['movie_title'], stop_words)
        filtered_title_dict[rotten_tmt] = filtered_title
        if filtered_title not in inv_filtered_title_dict:
            inv_filtered_title_dict[filtered_title] = []
        inv_filtered_title_dict[filtered_title].append(rotten_tmt)

In [198]:
import json

inv_filtered_title = os.path.join('movie_data', 'inv_filtered_title_lyt.json')
filtered_title = os.path.join('movie_data', 'filtered_title_lyt.json')
json.dump(filtered_title_dict, open(filtered_title, 'w+'), indent=4)
json.dump(inv_filtered_title_dict, open(inv_filtered_title, 'w+'), indent=4)

In [199]:
available_games = os.path.join('steamData', 'available_games', 'available_game_filtered_title.csv')
with open(available_games, 'r') as f:
    games_df = pd.read_csv(available_games)

In [200]:
games_df.head(2)

Unnamed: 0.1,Unnamed: 0,name,app_id,filtered_name
0,0,Among Us,945360,among
1,1,Counter-Strike: Global Offensive,730,counter strike global offensive


In [205]:
games_df = games_df[{'name', 'app_id'}]

In [208]:
# movie_data_df['filtered_name'] = movie_data_df.apply(lambda x: get_filtered_mov_name(x), axis=1)

games_df['filtered_name'] = games_df.apply(lambda x: filter_title(x['name'], stop_words), axis=1)

filtered_title = ['assassin', 's', 'creed', 'odyssey']
tokens = ['assassin', 'creed', 'odyssey']


In [209]:
# for idx, row in games_df.iterrows():
#     row['filtered_name'] = filter_title(row['name'], stop_words)

In [210]:
games_df[games_df['app_id'] == 812140]

Unnamed: 0,app_id,name,filtered_name
192,812140,Assassin's Creed® Odyssey,assassin creed odyssey


In [211]:
games_df.to_csv(available_games)

In [304]:
game_filtered_title_dict = dict() # <app_id, filtered_name>  
game_inv_filtered_title_dict = dict() # <filtered_name, app_id>

count = 0
for idx, row in games_df.iterrows():
    app_id = str(row['app_id'])
    filtered_title = row['filtered_name']
    game_filtered_title_dict[app_id] = filtered_title
    if filtered_title not in game_inv_filtered_title_dict:
        game_inv_filtered_title_dict[filtered_title] = dict()
    game_inv_filtered_title_dict[filtered_title] = app_id

In [318]:
game_filtered_title = os.path.join('steamData', 'available_games', 'inv_filtered_title_lyt.json')
game_inv_filtered_title = os.path.join('steamData', 'available_games', 'filtered_title_lyt.json')
json.dump(game_filtered_title, open(game_filtered_title, 'w+'), indent=4)
json.dump(game_inv_filtered_title, open(game_inv_filtered_title, 'w+'), indent=4)

In [319]:
print(os.path.abspath(game_filtered_title))

/Users/susan/Development/cs4300sp2021-cw887-qh75-rz92-yc687-yl698/data/steamData/available_games/inv_filtered_title_lyt.json


## Get Similarity Score

In [218]:
from collections import Counter

def get_similarity_score(val1, val2):
    val_lst1 = val1.strip().split(' ')
    val_lst2 = val2.strip().split(' ')
    
    val_count1 = dict(Counter(val_lst1))
    val_count2 = dict(Counter(val_lst2))
    
    common_keys = set(val_count1.keys()).intersection(set(val_count2.keys()))
    denom = 0
    for k in common_keys:
        denom += min(val_count1[k], val_count2[k]) 
    return denom / len(val_lst2)


print(get_similarity_score('lego', 'lego world   '))

0.5


In [222]:
games_name_list = list(sorted(list(set(games_df['filtered_name']))))
movies_name_list = list(sorted(list(set(inv_filtered_title_dict.keys()))))

In [225]:
def print_top_k(names, scores, k=10):
    top_k_scores = np.argsort(scores)[::-1][:k]
    for idx in top_k_scores:
        print(names[idx], scores[idx])

In [255]:
games_name_sim_score_list = []
for name in games_name_list:
    games_name_sim_score_list.append(get_similarity_score('assassin creed odyssey', name))

In [256]:
print_top_k(games_name_list, games_name_sim_score_list)

assassin creed odyssey 1.0
assassin creed origin 0.6666666666666666
assassin creed brotherhood 0.6666666666666666
assassin creed revelation 0.6666666666666666
assassin creed rogue 0.6666666666666666
assassin creed syndicate 0.6666666666666666
assassin creed unity 0.6666666666666666
assassin creed odyssey fate atlantis 0.6
assassin creed iii remastered 0.5
abyss odyssey 0.5


In [259]:
def get_top_movie_info(score_lst, games_name_list, k=100, filtered_zeros=True):
    res = []
    sorted_idx = np.argsort(score_lst)[::-1]
    if not k:
        k = len(sorted_idx)
    for i in range(k):
        idx = sorted_idx[i]
        if score_lst[idx] > 0:
            res.append([games_name_list[idx], score_lst[idx]])
    return res

In [260]:
get_top_movie_info(games_name_sim_score_list, games_name_list)

[['assassin creed odyssey', 1.0],
 ['assassin creed origin', 0.6666666666666666],
 ['assassin creed brotherhood', 0.6666666666666666],
 ['assassin creed revelation', 0.6666666666666666],
 ['assassin creed rogue', 0.6666666666666666],
 ['assassin creed syndicate', 0.6666666666666666],
 ['assassin creed unity', 0.6666666666666666],
 ['assassin creed odyssey fate atlantis', 0.6],
 ['assassin creed iii remastered', 0.5],
 ['abyss odyssey', 0.5],
 ['assassin creed director cut edition', 0.4],
 ['assassin creed iv black flag', 0.4],
 ['assassin creed origin curse pharaoh', 0.4],
 ['assassin creed 2 deluxe edition', 0.4],
 ['assassin creed origin hidden one', 0.4],
 ['enslaved odyssey west premium edition', 0.2],
 ['witcher 2 assassin king enhanced edition', 0.16666666666666666],
 ['3030 deathwar redux a space odyssey', 0.16666666666666666]]

In [278]:
import time

def get_sim_scores(games_name_list=games_name_list, movies_name_list=movies_name_list):
    sim_json = dict()

    count = 0
    start = time.time()
    for mov_name in movies_name_list:
        count += 1
        if count % 1000 == 0:
            print(f'{count}: {time.time() - start}')
        score_lst = []
        for game_name in games_name_list:
            score_lst.append(get_similarity_score(mov_name, game_name))
        sim_json[mov_name] = get_top_movie_info(score_lst, games_name_list)
    return sim_json

In [279]:
name_sim_dict = get_sim_scores()

1000: 17.889000177383423
2000: 35.41371417045593
3000: 53.06220602989197
4000: 70.48846507072449
5000: 87.98923301696777
6000: 105.4511981010437
7000: 123.35684299468994
8000: 140.9596860408783
9000: 158.57317399978638
10000: 176.10992288589478
11000: 195.1186079978943
12000: 217.07870197296143
13000: 235.6496319770813
14000: 254.09561586380005
15000: 271.8267590999603
16000: 291.6538951396942


In [293]:
name_sim_dict['assassin creed']

[['assassin creed origin', 0.6666666666666666],
 ['assassin creed brotherhood', 0.6666666666666666],
 ['assassin creed odyssey', 0.6666666666666666],
 ['assassin creed revelation', 0.6666666666666666],
 ['assassin creed rogue', 0.6666666666666666],
 ['assassin creed syndicate', 0.6666666666666666],
 ['assassin creed unity', 0.6666666666666666],
 ['assassin creed iii remastered', 0.5],
 ['assassin creed iv black flag', 0.4],
 ['assassin creed odyssey fate atlantis', 0.4],
 ['assassin creed 2 deluxe edition', 0.4],
 ['assassin creed origin curse pharaoh', 0.4],
 ['assassin creed origin hidden one', 0.4],
 ['assassin creed director cut edition', 0.4],
 ['witcher 2 assassin king enhanced edition', 0.16666666666666666]]

In [294]:
json.dump(name_sim_dict, open(os.path.join('steamData', '80k_data', 'movie_game_title_similarity_5.json'),'w+'), indent=4)

In [None]:
inv_filtered_title = os.path.join('movie_data', 'inv_filtered_title_lyt.json')
filtered_title = os.path.join('movie_data', 'filtered_title_lyt.json')
movie_filtered_title = json.load(filtered_title_dict, open(filtered_title, 'r'), indent=4)
movie_inv_filtered_title = json.load(inv_filtered_title_dict, open(inv_filtered_title, 'r'), indent=4)

In [None]:
def convert_names_to_id(cur_json, movie_inv_filtered_title_dict=inv_filtered_title_dict):
    pass