In [5]:
import psycopg2 as pg2
import pandas as pd
import pprint 
import string
import unicodedata
import numpy as np
import pickle

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import decomposition

import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
from nltk import pos_tag
from nltk import RegexpParser
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

## Import pickled cleaned data from previous project

In [2]:
with open('../data/boardgamedata.pickle', 'rb') as f:
    worddata, numdata = pickle.load(f)
    
worddata.reset_index(drop=True, inplace=True)

In [3]:
worddata[10:15]

Unnamed: 0,game.id,name,description,category,mechanic
10,10,Elfenland,Elfenland is a redesign of the original White ...,"[Fantasy, Travel]","[Card Drafting, Hand Management, Point to Poin..."
11,12,Ra,Ra is an auction and set-collection game with ...,"[Ancient, Mythology]","[Auction/Bidding, Press Your Luck, Set Collect..."
12,13,Catan,"In Catan (formerly The Settlers of Catan), pla...",[Negotiation],"[Dice Rolling, Hand Management, Modular Board,..."
13,14,Basari,Basari is a game of gem merchants competing in...,[Negotiation],"[Roll, Spin and Move, Set Collection, Simultan..."
14,15,Cosmic Encounter,"By request of Fantasy Flight Games, Board Game...","[Bluffing, Negotiation, Science Fiction]","[Hand Management, Variable Player Powers]"


## Build LDA Model

In [7]:
#define common words in data to remove
#built from pervious project
sw = set(stopwords.words('english'))

COMMON_WORDS = ["game","point","board","player","rule","turn",
                "tile","card","deck","hand","points","victory",
                "win","lose","defeat","quot","players","play"
               ]
sw.update(COMMON_WORDS)

#### Remove numbers from vectorizer

Implement preprocess function

In [9]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    return text

In [None]:
# use counter to count all words to get idea of common words



In [10]:
count_vectorizer = CountVectorizer(stop_words=sw, preprocessor=preprocess_text, max_df=0.85,
                                  min_df=2, max_features=1000)

In [11]:
word_vec = count_vectorizer.fit_transform(worddata['description'])

In [12]:
lda = LatentDirichletAllocation(learning_method='online', n_jobs=-2, 
                                random_state=1659)
lda.fit(word_vec)

LatentDirichletAllocation(learning_method='online', n_jobs=-2,
                          random_state=1659)

In [13]:
def top_topic_features(model, feature_names, num_features=10):
    sorted_topics = feature_names[model.components_.argsort(axis=1)[:, ::-1][:, :num_features]]
    return sorted_topics

In [16]:
feature_names = np.array(count_vectorizer.get_feature_names())
top_topic_features(lda, feature_names=feature_names)


array([['team', 'city', 'tokens', 'money', 'resources', 'build', 'phase',
        'power', 'building', 'token'],
       ['dice', 'move', 'one', 'roll', 'die', 'first', 'pieces', 'space',
        'two', 'spaces'],
       ['new', 'rules', 'expansion', 'cards', 'set', 'games', 'includes',
        'ship', 'also', 'edition'],
       ['questions', 'answer', 'eacute', 'party', 'question', 'la',
        'guess', 'correct', 'correctly', 'variant'],
       ['de', 'fun', 'treasure', 'monster', 'friends', 'find', 'dark',
        'gold', 'items', 'dungeon'],
       ['time', 'get', 'one', 'make', 'like', 'word', 'many', 'way',
        'also', 'best'],
       ['cards', 'one', 'round', 'two', 'tiles', 'wins', 'number',
        'first', 'played', 'three'],
       ['cards', 'character', 'characters', 'use', 'heroes', 'mdash',
        'new', 'unique', 'action', 'abilities'],
       ['rsquo', 'rdquo', 'red', 'ldquo', 'ndash', 'bull', 'black',
        'blue', 'green', 'white'],
       ['war', 'battle', 'at

### Create Probability Matrix

In [17]:
def predict_proba(model, vectorizer, text):
    if type(text) == str:
        text = [text]
    vec_text = vectorizer.transform(text)
    doc_probs = model.transform(vec_text)
    return doc_probs

In [18]:
game_probs = predict_proba(lda, count_vectorizer, worddata.description)

In [19]:
#pickle the fitted transformed model

with open('../models/lda.pickle', 'wb') as f:
    pickle.dump([lda,count_vectorizer,game_probs], f)

## Building the Recommender

In [22]:
# function to find index of FIRST  game given the game title

def find_game_idx(df, game_title):
    try:
        return df.name[df.name.str.contains(game_title)].index[0]
    except:
        return None

def find_game_id(df, idx):
    return df['game.id'].iloc[idx]


## TODO: if search returns multiple results, what then?? Need to solve this

In [23]:
search = worddata.name.str.contains('Betrayal')
search.sum()


11

## /TODO

In [25]:
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

In [26]:
def sort_by_distance(idx, probs, distance_funct=cosine_distances, num_games=10):
    #returns array of indices of games closest to the provided game index
    return distance_funct(probs[idx].reshape(1,-1), probs).ravel().argsort()[:10]

* Debug sort_by_distance function
* check distances inside the function


In [27]:
def find_closest_games_names(sorted_distances, names):
    #finds names of an array of game idx's
    name_array = names.iloc[sorted_distances]
    return name_array

In [34]:
game_index = find_game_idx(worddata,'Catan')
game_sim = sort_by_distance(game_index, game_probs)
find_closest_games_names(game_sim,worddata.name)


12                                          Catan
54254                              Pocket Paddles
71292                       Dice City: Shipwright
62950                          Small Star Empires
68682          Dice Bazaar: Monkey Mini Expansion
26389                         El Juego de la Liga
132                             Buy Low Sell High
5131                      The Ark of the Covenant
63732    All About Town: Columbia, South Carolina
2013                                      Wembley
Name: name, dtype: object

In [65]:
def recommend(df, game_title, prob_matrix, 
             distance_funct=cosine_distances, num_games=10):
    game_index = find_game_idx(df, game_title)
    game_sim = sort_by_distance(game_index, prob_matrix, distance_funct, num_games)
    game_recs = find_closest_games_names(game_sim, df.name)
    game_ids = find_game_id(df, game_rec.index)
    
    rec_df = pd.DataFrame({'id':game_ids,'name':game_recs})
    
    return rec_df

In [71]:
game_rec = recommend(worddata, 'Catan', game_probs)
game_rec

Unnamed: 0,id,name
12,13,Catan
54254,150834,Pocket Paddles
71292,206255,Dice City: Shipwright
62950,178044,Small Star Empires
68682,199939,Dice Bazaar: Monkey Mini Expansion
26389,35371,El Juego de la Liga
132,146,Buy Low Sell High
5131,6779,The Ark of the Covenant
63732,180713,"All About Town: Columbia, South Carolina"
2013,2721,Wembley


In [69]:
cover_url = pd.read_csv('../data/cover_url.csv')
cover_url.drop('Unnamed: 0', inplace=True, axis=1)
cover_url.head()

Unnamed: 0,id,cover_url
0,1,http://cf.geekdo-images.com/images/pic159509.jpg
1,2,http://cf.geekdo-images.com/images/pic184174.jpg
2,3,http://cf.geekdo-images.com/images/pic3211873.jpg
3,4,http://cf.geekdo-images.com/images/pic285299.jpg
4,11,http://cf.geekdo-images.com/images/pic69366.jpg


In [99]:
cover_url.id[0]

1

In [87]:
game_rec.id.values

array([    13, 150834, 206255, 178044, 199939,  35371,    146,   6779,
       180713,   2721])

In [92]:
cover_url.id.reindex(game_rec.id.values)

13           13.0
150834        NaN
206255        NaN
178044        NaN
199939        NaN
35371     38076.0
146         146.0
6779       7230.0
180713        NaN
2721       2931.0
Name: id, dtype: float64

In [None]:
def _recommend_game(game_title, num_games)
    """Generate recommended games as a list of lists. Each follows the pattern [game_title, cover_url]
    Args:
        game_title (str) : search string of user game
        num_games (int) : number of similiar games to recommend
    Returns:
        a list of lists. Each follows the pattern [game_title, cover_url]
    """
    
    game_list = _recommend_game(game_title, num_games)

In [61]:
#save prob matrix for use later
with open('../models/prob_matrix.pickle', 'wb') as f:
    pickle.dump(game_probs,f)

## TODO: use suprise Library to test