In [1]:
import psycopg2 as pg2
import pandas as pd
import pprint 
import string
import unicodedata
import numpy as np
import pickle

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import decomposition

import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
from nltk import pos_tag
from nltk import RegexpParser
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

## Import pickled cleaned data from previous project

In [3]:
# Load the (worddata, numdata) pair that was pickled by the previous project.
# NOTE: unpickling can execute arbitrary code, so only point this at a file
# you produced yourself.
with open('../data/boardgamedata.pickle', 'rb') as f:
    worddata, numdata = pickle.load(f)

In [5]:
# Preview the first rows of the text frame (name, description, category, mechanic).
worddata.head()

Unnamed: 0_level_0,details.name,details.description,category,mechanic
game.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Die Macher,Die Macher is a game about seven sequential po...,"[Economic, Negotiation, Political]","[Area Control, Area Influence, Auction/Bidding..."
2,Dragonmaster,Dragonmaster is a trick-taking card game based...,"[Card Game, Fantasy]",[Trick-taking]
3,Samurai,"Part of the Knizia tile-laying trilogy, Samura...","[Abstract Strategy, Medieval]","[Area Control, Area Influence, Hand Management..."
4,Tal der Könige,When you see the triangular box and the luxuri...,[Ancient],"[Action Point Allowance System, Area Control, ..."
11,Bohnanza,Bohnanza is the first in the Bohnanza family o...,"[Card Game, Farming, Negotiation]","[Hand Management, Set Collection, Trading]"


In [8]:
# Index the numeric frame by game id.
# The guard keeps this cell re-runnable: once 'game.id' has been moved into the
# index it is no longer a column, and an unguarded second call raises KeyError.
if 'game.id' in numdata.columns:
    numdata.set_index('game.id', drop=True, inplace=True)

In [11]:
# Summary statistics for every numeric column.
# NOTE(review): in the rendered output the details.* columns report a count of
# 90397 while the stats.* columns report 90400, and details.yearpublished has a
# minimum of -3500 -- worth checking before these columns are relied on.
numdata.describe()

Unnamed: 0,details.maxplayers,details.maxplaytime,details.minage,details.minplayers,details.minplaytime,details.playingtime,details.yearpublished,stats.average,stats.averageweight,stats.bayesaverage,stats.numcomments,stats.owned,stats.stddev,stats.trading,stats.usersrated,stats.wanting,stats.wishing
count,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0,90397.0,90400.0,90400.0,90400.0,90400.0,90400.0,90400.0,90400.0,90400.0,90400.0,90400.0
mean,5.732004,51.773731,7.121143,1.982533,46.104506,51.773731,1814.056584,4.309627,0.86614,1.146279,36.044502,234.105608,0.94554,7.860996,123.134192,9.596858,34.333208
std,53.449543,331.657193,5.056647,0.860299,316.313227,331.657193,580.656445,2.927143,1.155766,2.300701,235.494572,1483.897861,0.989744,36.53196,1048.474551,50.702656,224.203389
min,0.0,0.0,0.0,0.0,0.0,0.0,-3500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,5.0,0.0,2.0,10.0,5.0,1986.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,30.0,8.0,2.0,30.0,30.0,2005.0,5.4,0.0,0.0,2.0,8.0,0.986687,0.0,3.0,0.0,1.0
75%,6.0,60.0,12.0,2.0,60.0,60.0,2012.0,6.66667,1.75,0.0,9.0,58.0,1.535955,2.0,18.0,3.0,8.0
max,11299.0,60120.0,120.0,50.0,60120.0,60120.0,2019.0,10.0,5.0,8.48966,13841.0,95401.0,4.5,1858.0,67655.0,1838.0,9082.0


## Build LDA Model

In [12]:
from sklearn.decomposition import LatentDirichletAllocation

In [13]:
# NOTE(review): this instance is never fitted -- `lda` is rebound to a new
# LatentDirichletAllocation (random_state=1659) in a later cell before
# lda.fit(...) is called, so this cell looks like a leftover.
lda = LatentDirichletAllocation(learning_method='online', n_jobs=-2, random_state=9)

In [15]:
# Frequent words in the data that should be removed before vectorizing;
# the list was built in the previous project.
COMMON_WORDS = [
    "game", "point", "board", "player", "rule", "turn",
    "tile", "card", "deck", "hand", "points", "victory",
    "win", "lose", "defeat", "quot",
]

# Complete stop-word set: NLTK's English stop words plus the words above.
sw = set(stopwords.words('english')) | set(COMMON_WORDS)

In [17]:
#Try CountVectorizer for now

from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term count matrix from the game descriptions.
# stop_words is passed as a list rather than the `sw` set: scikit-learn
# documents the parameter as 'english', a list, or None, and releases that
# validate constructor parameters reject a set. Sorting gives a stable order.
tf_vectorizer = CountVectorizer(stop_words=sorted(sw), max_df=0.85, min_df=2, max_features=1000)
word_vec = tf_vectorizer.fit_transform(worddata['details.description'])

In [18]:
from sklearn.decomposition import LatentDirichletAllocation

# Topic model that is fitted in the next cell; random_state is fixed so the
# fit is repeatable.
# NOTE(review): n_components is not passed, so the library default number of
# topics is used -- confirm that is the intended topic count.
# NOTE(review): n_jobs=-2 presumably means "all CPU cores but one" under the
# joblib convention -- confirm against the installed scikit-learn docs.
lda = LatentDirichletAllocation(learning_method='online', n_jobs=-2, random_state=1659)

In [19]:
# Fit the topic model on the document-term count matrix built above.
lda.fit(word_vec)

LatentDirichletAllocation(learning_method='online', n_jobs=-2,
                          random_state=1659)

In [20]:
import joblib

# Persist the fitted topic model together with the vectorizer that produced its
# input matrix, so the pair can be reloaded without refitting.
# The final dump is a bare expression, so its return value is what the cell displays.
joblib.dump(lda, '../data/lda_model.joblib')
joblib.dump(tf_vectorizer, '../data/tf_vec.joblib')

['../data/tf_vec.joblib']

In [21]:
def top_topic_features(model, feature_names, num_features=10):
    """Return the highest-weighted feature names for each topic of a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing a 2-D ``components_`` array whose rows are topics and
        whose columns are features (it is sorted along axis 1).
    feature_names : sequence of str
        Names aligned with the columns of ``model.components_``. A plain list
        is accepted as well as a NumPy array.
    num_features : int, default 10
        How many names to keep per topic. If it exceeds the number of columns,
        every column is returned.

    Returns
    -------
    numpy.ndarray
        One row per topic; each row lists feature names in descending order of
        their weight in that topic.
    """
    # Coerce to an array so integer-array indexing below also works for lists.
    names = np.asarray(feature_names)
    # argsort is ascending: reverse each row to get descending weights, then
    # keep only the leading `num_features` columns.
    top_idx = np.asarray(model.components_).argsort(axis=1)[:, ::-1][:, :num_features]
    return names[top_idx]

In [22]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    # dtype=str keeps the same string dtype the list-based call produced.
    feature_names = np.array(tf_vectorizer.get_feature_names_out(), dtype=str)
else:
    feature_names = np.array(tf_vectorizer.get_feature_names())
# Bare expression: the per-topic top words are displayed as the cell output.
top_topic_features(lda, feature_names=feature_names)

array([['war', 'battle', 'rules', 'units', 'map', 'combat', 'army',
        'attack', 'counters', 'one'],
       ['tokens', 'fun', 'players', 'play', 'get', 'around', 'fast',
        'family', 'token', 'find'],
       ['players', 'rsquo', 'one', 'city', 'time', 'action', 'take',
        'may', 'end', 'must'],
       ['move', 'pieces', 'space', 'one', 'players', 'first', 'spaces',
        'two', 'piece', 'race'],
       ['dice', 'roll', 'die', '140', '231', 'bull', '184', 'rolls',
        'rolling', 'monopoly'],
       ['cards', '229', 'play', 'players', 'draw', 'played', 'pack',
        'pile', 'one', '136'],
       ['one', 'players', 'two', 'tiles', '208', 'number', 'first',
        'play', 'three', 'score'],
       ['team', 'ship', 'word', 'questions', 'answer', 'words',
        'question', 'ships', 'teams', 'players'],
       ['227', '129', '130', '131', '128', '230', '188', 'de', '139',
        '174'],
       ['new', 'expansion', 'set', 'rules', 'play', 'character', 'games',
      

### Remove numbers from vectorizer

In [23]:
def preprocess_text(text):
    """Lower-case ``text`` and delete every run of digits from it.

    Digits are removed outright rather than replaced by a space, so the
    characters on either side of a number end up adjacent ("2d6" -> "d").

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The lower-cased text with all digit runs removed.
    """
    return re.sub(r'\d+', '', text.lower())

In [24]:
# Same vocabulary limits as the first vectorizer, plus the digit-stripping
# preprocessor defined above.
# stop_words is passed as a list rather than the `sw` set: scikit-learn
# documents the parameter as 'english', a list, or None, and releases that
# validate constructor parameters reject a set.
# NOTE(review): a custom preprocessor presumably replaces the vectorizer's own
# lower-casing step, which preprocess_text performs itself -- confirm in the docs.
count_vectorizer = CountVectorizer(stop_words=sorted(sw), preprocessor=preprocess_text, max_df=0.85,
                                   min_df=2, max_features=1000)
word_vec_2 = count_vectorizer.fit_transform(worddata['details.description'])

array([['wargame', 'chooses', 'secret', 'used', 'moving', 'depending',
        'cavalry', 'challenge', 'edge', 'pawn'],
       ['track', 'higher', 'provides', 'provide', 'houses', 'center',
        'get', 'general', 'town', 'gold'],
       ['provides', 'search', 'pawn', 'cut', 'top', 'become', 'task',
        'necessary', 'find', 'operations'],
       ['often', 'powers', 'squares', 'pawn', 'provides', 'great', 'st',
        'union', 'powerful', 'remaining'],
       ['ever', 'scale', 'every', 'addition', 'attempt', 'commander',
        'amp', 'score', 'scenarios', 'object'],
       ['consists', 'attacking', 'provide', 'provides', 'experience',
        'provided', 'pirate', 'pre', 'pawn', 'active'],
       ['pawn', 'provides', 'union', 'tokens', 'around', 'pages',
        'great', 'provide', 'times', 'series'],
       ['themed', 'simulation', 'words', 'region', 'case', 'work', 'red',
        'simultaneously', 'things', 'provides'],
       ['assault', 'according', 'acquire', 'across', 'ab

In [27]:
# Second topic model, fitted on the digit-free count matrix with the same
# hyper-parameters and seed as the first one. fit() returns the estimator
# itself, so construction and fitting are chained.
lda2 = LatentDirichletAllocation(
    learning_method='online',
    n_jobs=-2,
    random_state=1659,
).fit(word_vec_2)
# Display the fitted estimator as the cell output.
lda2

LatentDirichletAllocation(learning_method='online', n_jobs=-2,
                          random_state=1659)

In [None]:
#pickle the fitted model

import pickle

with open('../models/lda.pickle', 'wb') as f:
    # pickle.dump requires the destination file object as its second argument;
    # without it the call raises TypeError and leaves an empty file behind.
    pickle.dump(lda2, f)

# NOTE(review): `worddata_clean` is not defined anywhere in the visible notebook,
# so this dump raises NameError on a fresh kernel. It also writes to
# 'boardgamedata.pickle' in the working directory, whereas the data was loaded
# from '../data/boardgamedata.pickle'. Confirm which frame and path are intended.
with open('boardgamedata.pickle','wb') as f:
    pickle.dump([worddata_clean, numdata], f)

## Building the Recommender

In [None]:
# write function to find index of game given the game title

def find_game_idx(df, game_title, name_col='details.name'):
    """Return the index label of the first row whose title equals ``game_title``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one row per game, with the titles in ``name_col``.
    game_title : str
        Exact title to look up (the comparison is case-sensitive).
    name_col : str, default 'details.name'
        Column of ``df`` that contains the game titles.

    Returns
    -------
    object
        The label from ``df.index`` of the first matching row.

    Raises
    ------
    KeyError
        If no row has that title.

    Notes
    -----
    NOTE(review): this returns the index *label*, not the row position. If the
    recommender needs a row position into a matrix, convert the label with
    ``df.index.get_loc(label)`` -- confirm which one the caller expects.
    """
    matches = df.index[df[name_col] == game_title]
    if len(matches) == 0:
        raise KeyError("no game titled {!r} in column {!r}".format(game_title, name_col))
    return matches[0]