In [1]:
import psycopg2 as pg2
import pandas as pd
import pprint 
import string
import unicodedata
import numpy as np
import pickle

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import decomposition

import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
from nltk import pos_tag
from nltk import RegexpParser
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

## Import pickled cleaned data from previous project

In [118]:
with open('../data/boardgamedata.pickle', 'rb') as f:
    worddata, numdata = pickle.load(f)

In [119]:
worddata.head()

Unnamed: 0,game.id,name,description,category,mechanic
0,1,Die Macher,Die Macher is a game about seven sequential po...,"[Economic, Negotiation, Political]","[Area Control, Area Influence, Auction/Bidding..."
1,2,Dragonmaster,Dragonmaster is a trick-taking card game based...,"[Card Game, Fantasy]",[Trick-taking]
2,3,Samurai,"Part of the Knizia tile-laying trilogy, Samura...","[Abstract Strategy, Medieval]","[Area Control, Area Influence, Hand Management..."
3,4,Tal der Könige,When you see the triangular box and the luxuri...,[Ancient],"[Action Point Allowance System, Area Control, ..."
4,11,Bohnanza,Bohnanza is the first in the Bohnanza family o...,"[Card Game, Farming, Negotiation]","[Hand Management, Set Collection, Trading]"


In [120]:
# numdata.set_index('game.id', drop=True, inplace=True)

In [121]:
numdata.describe()

Unnamed: 0,game.id,details.maxplayers,details.maxplaytime,details.minage,details.minplayers,details.minplaytime,details.playingtime,details.yearpublished,stats.average,stats.averageweight,stats.bayesaverage,stats.numcomments,stats.owned,stats.stddev,stats.trading,stats.usersrated,stats.wanting,stats.wishing
count,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0
mean,88668.795414,5.732004,51.773731,7.121143,1.982533,46.104506,51.773731,1814.056584,4.309622,0.866108,1.146317,36.045477,234.112493,0.945541,7.861234,123.137881,9.597155,34.334226
std,70144.573869,53.449543,331.657193,5.056647,0.860299,316.313227,331.657193,580.656445,2.927135,1.155742,2.300729,235.498417,1483.921999,0.989753,36.532542,1048.491753,50.703471,224.207039
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-3500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,24562.0,2.0,5.0,0.0,2.0,10.0,5.0,1986.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
50%,68216.0,4.0,30.0,8.0,2.0,30.0,30.0,2005.0,5.4,0.0,0.0,2.0,8.0,0.986568,0.0,3.0,0.0,1.0
75%,152682.0,6.0,60.0,12.0,2.0,60.0,60.0,2012.0,6.66667,1.75,0.0,9.0,58.0,1.53597,2.0,18.0,3.0,8.0
max,226264.0,11299.0,60120.0,120.0,50.0,60120.0,60120.0,2019.0,10.0,5.0,8.48966,13841.0,95401.0,4.5,1858.0,67655.0,1838.0,9082.0


## Build LDA Model

In [12]:
from sklearn.decomposition import LatentDirichletAllocation

In [13]:
lda = LatentDirichletAllocation(learning_method='online', n_jobs=-2, random_state=9)

In [34]:
#define common words in data to remove
#built from pervious project
sw = set(stopwords.words('english'))

COMMON_WORDS = ["game","point","board","player","rule","turn",
                "tile","card","deck","hand","points","victory",
                "win","lose","defeat","quot","players","play"
               ]
sw.update(COMMON_WORDS)

In [17]:
#Try CountVectorizer for now

from sklearn.feature_extraction.text import CountVectorizer

tf_vectorizer = CountVectorizer(stop_words=sw, max_df=0.85, min_df=2, max_features=1000)
word_vec = tf_vectorizer.fit_transform(worddata['description'])

In [18]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(learning_method='online', n_jobs=-2, random_state=1659)

In [19]:
lda.fit(word_vec)

LatentDirichletAllocation(learning_method='online', n_jobs=-2,
                          random_state=1659)

In [20]:
import joblib

joblib.dump(lda, '../data/lda_model.joblib')
joblib.dump(tf_vectorizer, '../data/tf_vec.joblib')

['../data/tf_vec.joblib']

In [21]:
def top_topic_features(model, feature_names, num_features=10):
    sorted_topics = feature_names[model.components_.argsort(axis=1)[:, ::-1][:, :num_features]]
    return sorted_topics

In [22]:
feature_names = np.array(tf_vectorizer.get_feature_names())
top_topic_features(lda, feature_names=feature_names)

array([['war', 'battle', 'rules', 'units', 'map', 'combat', 'army',
        'attack', 'counters', 'one'],
       ['tokens', 'fun', 'players', 'play', 'get', 'around', 'fast',
        'family', 'token', 'find'],
       ['players', 'rsquo', 'one', 'city', 'time', 'action', 'take',
        'may', 'end', 'must'],
       ['move', 'pieces', 'space', 'one', 'players', 'first', 'spaces',
        'two', 'piece', 'race'],
       ['dice', 'roll', 'die', '140', '231', 'bull', '184', 'rolls',
        'rolling', 'monopoly'],
       ['cards', '229', 'play', 'players', 'draw', 'played', 'pack',
        'pile', 'one', '136'],
       ['one', 'players', 'two', 'tiles', '208', 'number', 'first',
        'play', 'three', 'score'],
       ['team', 'ship', 'word', 'questions', 'answer', 'words',
        'question', 'ships', 'teams', 'players'],
       ['227', '129', '130', '131', '128', '230', '188', 'de', '139',
        '174'],
       ['new', 'expansion', 'set', 'rules', 'play', 'character', 'games',
      

#### Remove numbers from vectorizer

Implement preprocess function

In [23]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    return text

In [35]:
count_vectorizer = CountVectorizer(stop_words=sw, preprocessor=preprocess_text, max_df=0.85,
                                  min_df=2, max_features=1000)

In [36]:
word_vec_2 = count_vectorizer.fit_transform(worddata['description'])

In [37]:
lda2 = LatentDirichletAllocation(learning_method='online', n_jobs=-2, 
                                random_state=1659)
lda2.fit(word_vec_2)

LatentDirichletAllocation(learning_method='online', n_jobs=-2,
                          random_state=1659)

In [38]:
feature_names2 = np.array(count_vectorizer.get_feature_names())
top_topic_features(lda2, feature_names=feature_names2)


array([['team', 'city', 'tokens', 'money', 'resources', 'build', 'phase',
        'power', 'building', 'token'],
       ['dice', 'move', 'one', 'roll', 'die', 'first', 'pieces', 'space',
        'two', 'spaces'],
       ['new', 'rules', 'expansion', 'cards', 'set', 'games', 'includes',
        'ship', 'also', 'edition'],
       ['questions', 'answer', 'eacute', 'party', 'question', 'la',
        'guess', 'correct', 'correctly', 'variant'],
       ['de', 'fun', 'treasure', 'monster', 'friends', 'find', 'dark',
        'gold', 'items', 'dungeon'],
       ['time', 'get', 'one', 'make', 'like', 'word', 'many', 'way',
        'also', 'best'],
       ['cards', 'one', 'round', 'two', 'tiles', 'wins', 'number',
        'first', 'played', 'three'],
       ['cards', 'character', 'characters', 'use', 'heroes', 'mdash',
        'new', 'unique', 'action', 'abilities'],
       ['rsquo', 'rdquo', 'red', 'ldquo', 'ndash', 'bull', 'black',
        'blue', 'green', 'white'],
       ['war', 'battle', 'at

In [41]:
#pickle the fitted transformed model

with open('../models/lda.pickle', 'wb') as f:
    pickle.dump(lda2, f)

## Building the Recommender

In [73]:
# write function to find index of game given the game title

def find_game_idx(df, game_title):
    try:
        return df.name[df.name.str.contains(game_title)].index[0]
    except:
        return None

In [55]:
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

In [80]:
def sort_by_distance(idx, probs, distance_funct=cosine_distances, num_games=10):
    #returns array of indices of games closest to the provided game index
    
    distances = distance_funct(probs[idx].reshape(1,-1), probs).ravel()
    return distances.argsort()[:num_games]

In [76]:
def predict_proba(model, vectorizer, text):
    if type(text) == str:
        text = [text]
    vec_text = vectorizer.transform(text)
    doc_probs = model.transform(vec_text)
    return doc_probs

In [128]:
game_probs = predict_proba(lda2, count_vectorizer, worddata.description)

In [139]:
def find_closest_games_names(sorted_distances, names):
    name_array = names.iloc[sorted_distances]
    return {name_array}

In [140]:
game_index = find_game_idx(worddata,'Catan')
game_sim = sort_by_distance(game_index, game_probs)
game_rec = find_closest_games(game_sim,worddata.name)

In [142]:
game_rec

{'Basari': 68920    Football Stars: Jogo de Cartas Ilustradas
 2501                                 Frachtexpress
 39762                         Gronewold Demolition
 71603                                   Grand Prix
 65787                                     Fracture
 69494                  Intrigue in the Royal Court
 542                                         Wongar
 80141                                   Jewel Duel
 68592                           Kingsport Festival
 Name: name, dtype: object}

In [143]:
worddata.name.reindex(game_sim)

13                                       Catan
55665                                Vocambolo
1997                            Tyranid Attack
32043                 Mit Pauken und Trompeten
58098         Dungeon Command: Heart of Cormyr
52947                         Banzuke Shoushin
56195                                      NaN
467                          Aladdin's Dragons
65526    Het grote Pardoes Spring-en-spaarspel
55375                           Eesti Jalgpall
Name: name, dtype: object

In [126]:
worddata.loc[9833]

game.id                                                    10547
name                               Betrayal at House on the Hill
description    From the press release:&#10;&#10;Betrayal at H...
category            [Adventure, Exploration, Horror, Miniatures]
mechanic       [Co-operative Play, Dice Rolling, Modular Boar...
Name: 9833, dtype: object

In [127]:
worddata.name[worddata.name.str.contains('Betrayal')].index[0]

9833