# Steam Collection Exploration

## Import Libraries

In [430]:
import pandas as pd
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import skipgrams
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.util import ngrams
import math

In [84]:
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

In [85]:
stopWords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

## Import Steam data

In [3]:
df = pd.read_csv('data/steam_description_data.csv')

In [6]:
df = df[['steam_appid', 'detailed_description']]

In [75]:
titles = pd.read_csv('data/steam.csv')

In [86]:
titles = titles[['appid', 'name']]

In [89]:
titles['name'] = titles['name'].str.lower()

In [90]:
titles.head()

Unnamed: 0,appid,name
0,10,counter-strike
1,20,team fortress classic
2,30,day of defeat
3,40,deathmatch classic
4,50,half-life: opposing force


## Generate SkipGrams

In [93]:
def skipgram(corpus, window_size = 2):
    sg = []
    for sent in corpus:
        sent = sent.split()
        if len(sent) <= window_size:
            sg.append(sent)
        else:
            non_stop_count = 0
            for s in sent:
                if s not in stopWords:
                    non_stop_count += 1
            print(non_stop_count)
            for i in range(0, len(sent)-window_size+1):
                sg.append(sent[i: i+window_size])
    return sg

In [94]:
titles['name'].apply(lambda x: skipgram([x], 2))

3
2
3
3
3
3
3
4
3
4
4
3
3
4
3
4
4
3
4
4
4
6
3
4
5
3
5
6
4
3
4
4
3
3
4
4
2
3
4
4
3
2
3
3
2
3
4
4
4
3
5
4
5
4
6
2
5
3
6
5
5
4
5
3
3
3
4
3
3
3
3
3
5
5
8
7
7
3
5
5
5
3
3
3
3
4
2
3
4
4
7
8
6
2
2
3
5
3
3
2
3
4
4
6
4
4
4
4
4
3
3
5
4
3
3
4
5
3
3
4
5
3
3
4
3
3
2
3
5
3
4
3
3
2
2
4
4
8
4
6
4
5
4
3
2
3
6
6
4
7
4
4
6
3
3
4
4
2
5
3
4
3
3
2
4
5
3
4
4
4
3
6
3
3
4
4
6
4
5
4
3
4
5
4
4
5
5
6
3
4
3
4
6
3
2
4
4
4
4
3
4
3
4
6
3
3
3
4
5
3
2
4
4
3
3
4
6
5
4
4
5
4
6
6
4
6
5
3
5
6
3
3
5
5
4
3
6
4
3
7
7
4
3
3
5
2
5
3
3
3
4
3
3
4
5
4
3
4
4
4
5
4
4
3
3
5
4
3
3
6
3
6
3
4
4
4
3
3
3
2
3
4
6
5
2
3
4
3
7
5
6
3
3
5
5
3
3
5
3
3
4
2
3
5
4
2
7
7
5
3
3
3
4
4
4
6
3
3
5
2
3
3
2
5
4
4
5
4
3
3
3
4
4
6
8
6
3
3
4
3
2
3
2
4
2
3
3
4
3
3
3
2
5
2
6
4
4
7
4
5
5
6
5
6
5
5
5
5
4
6
5
3
3
3
4
5
3
5
6
7
7
5
7
8
6
6
7
7
5
7
4
3
4
5
3
4
8
3
3
3
2
5
3
3
4
3
5
4
7
6
3
7
3
3
9
4
3
2
6
5
4
5
2
2
4
4
4
3
4
4
3
3
4
6
7
3
3
4
3
4
5
3
4
3
2
2
3
3
4
3
6
3
2
4
4
4
11
2
3
3
4
4
4
3
4
3
5
2
2
2
3
6
7
4
4
3
5
3
3
3
4
4
3
4
3
2
6
3
3
3
3
5
4
4
3
3
4
5
4
3

4
3
4
2
3
2
6
2
4
3
3
4
2
4
4
5
2
6
2
3
3
2
4
2
4
3
5
4
2
3
3
4
2
2
2
2
5
4
4
3
4
4
6
6
2
4
3
4
3
5
2
3
4
3
2
2
3
3
3
3
3
3
4
3
5
2
2
3
4
2
3
3
2
7
3
3
2
2
3
4
4
2
4
7
5
4
3
2
4
3
3
4
3
3
2
2
2
6
3
3
2
3
2
3
3
7
5
4
2
3
3
2
4
3
3
4
4
2
2
5
4
4
4
3
3
3
2
4
3
2
3
4
2
2
3
5
3
2
3
4
3
4
2
3
3
5
2
4
3
3
3
4
4
2
3
5
3
2
2
4
6
3
2
5
3
4
2
2
4
3
3
6
2
4
6
3
7
2
3
3
2
3
4
2
4
3
4
3
6
3
2
3
4
4
2
2
2
3
2
3
3
3
3
8
2
2
3
3
6
3
2
4
3
3
3
3
5
3
6
3
3
3
3
4
4
4
3
5
3
2
5
4
3
3
3
1
3
3
3
3
6
6
3
3
6
3
4
2
5
2
5
3
4
5
3
3
2
4
3
3
5
6
2
2
3
10
4
4
3
4
6
3
4
4
3
3
4
2
4
3
4
4
3
3
2
4
8
4
3
6
5
3
3
4
3
6
4
2
4
3
2
2
4
3
3
3
5
2
7
3
2
5
4
3
6
3
2
5
2
3
6
7
4
3
3
5
5
4
3
3
3
4
3
3
3
3
3
2
2
4
2
5
2
3
2
3
5
3
1
2
3
3
3
2
4
2
2
7
7
4
4
6
7
8
2
3
3
2
3
2
3
5
5
6
6
5
3
6
3
2
2
3
4
3
3
4
2
2
2
3
4
3
3
5
4
3
2
3
3
2
3
5
4
1
4
2
4
3
6
4
3
2
4
2
7
3
3
4
3
3
5
4
5
3
3
5
4
5
4
2
2
3
4
3
3
3
3
4
3
3
3
3
2
3
4
6
5
6
3
10
8
3
3
4
3
3
3
3
3
3
4
3
3
2
2
3
3
1
4
4
3
3
5
3
2
2
4
2
3
3
3
2
3
3
3
6
3
3
5
3
1
2
3
4
2
3
3
3
3


4
3
4
4
5
3
3
3
2
4
6
3
2
3
3
4
3
2
8
3
2
6
3
3
5
2
5
6
6
5
3
3
3
2
4
3
3
2
2
4
3
2
7
4
3
2
4
6
4
1
1
3
3
2
6
2
2
3
4
3
5
6
4
4
3
3
5
2
3
2
3
3
4
3
4
3
6
3
4
4
4
5
1
3
3
4
5
7
3
3
4
4
6
3
4
4
3
9
4
4
5
3
4
3
5
5
3
2
3
2
2
3
3
2
3
4
4
5
3
3
5
5
3
3
5
4
3
3
3
3
3
3
3
4
6
2
3
3
6
2
2
3
14
3
3
4
3
4
2
3
2
4
3
3
3
3
8
2
4
2
4
8
5
2
3
3
2
6
3
6
4
3
3
3
3
4
3
5
4
8
3
5
2
3
2
3
3
3
2
3
1
2
3
4
3
5
5
6
9
4
7
5
6
3
3
3
9
3
3
3
3
2
5
5
3
2
3
4
3
3
3
5
3
3
5
3
3
4
3
4
2
4
2
5
4
4
3
2
3
3
3
2
2
2
2
5
3
4
4
6
3
4
4
3
4
2
3
5
2
3
3
4
6
4
5
5
3
3
3
3
6
2
2
5
2
3
3
3
3
3
3
4
2
4
4
5
3
8
5
4
5
4
5
3
2
2
3
6
4
2
3
3
4
4
3
3
3
3
2
2
3
3
5
5
4
3
5
3
4
3
3
3
7
3
6
3
3
10
10
10
7
7
9
7
3
3
2
2
2
4
3
4
2
3
3
3
2
2
4
2
5
3
3
4
3
8
2
4
5
2
3
4
3
2
4
2
2
3
3
5
3
3
3
3
3
4
7
11
7
3
2
3
3
4
4
3
2
4
6
3
4
6
4
3
4
4
3
2
10
2
2
2
7
2
3
6
3
4
8
2
6
4
5
5
2
3
4
3
3
5
3
3
3
3
8
3
3
4
5
2
3
7
4
2
2
3
3
3
4
3
4
3
4
3
4
3
3
3
3
2
2
3
3
2
3
2
3
3
3
2
3
5
4
3
2
3
3
5
3
3
4
2
4
2
3
2
4
3
2
4
3
9
7
5
3
2
9
3
3
3
2
1
2
6
2
4
3


0                                       [[counter-strike]]
1                  [[team, fortress], [fortress, classic]]
2                                [[day, of], [of, defeat]]
3                                  [[deathmatch, classic]]
4              [[half-life:, opposing], [opposing, force]]
                               ...                        
27070                          [[room, of], [of, pandora]]
27071                                       [[cyber, gun]]
27072                       [[super, star], [star, blast]]
27073    [[new, yankee], [yankee, 7:], [7:, deer], [dee...
27074                                       [[rune, lord]]
Name: name, Length: 27075, dtype: object

# Full Descriptions?

## Lets tokenize the sentences

In [16]:
from html.parser import HTMLParser

In [50]:
def strip_html(html):
    soup = BeautifulSoup(html, "html.parser").get_text(separator=' ')
    no_space = re.sub('\s+',' ',soup)
    return no_space.replace('\\', '')

In [57]:
df['detailed_description'] = df['detailed_description'].apply(lambda x: strip_html(x)).to_list()

In [58]:
df['sentences'] = df['detailed_description'].apply(lambda x: sent_tokenize(x))

In [522]:
df['sentences'].to_list()

[["Play the world's number 1 online action game.",
  'Engage in an incredibly realistic brand of terrorist warfare in this wildly popular team-based game.',
  'Ally with teammates to complete strategic missions.',
  'Take out enemy sites.',
  'Rescue hostages.',
  "Your role affects your team's success.",
  "Your team's success affects your role."],
 ['One of the most popular online action games of all time, Team Fortress Classic features over nine character classes -- from Medic to Spy to Demolition Man -- enlisted in a unique style of online team warfare.',
  'Each character class possesses unique weapons, items, and abilities, as teams compete online in a variety of game play modes.'],
 ['Enlist in an intense brand of Axis vs. Allied teamplay set in the WWII European Theatre of Operations.',
  'Players assume the role of light/assault/heavy infantry, sniper or machine-gunner class, each with a unique arsenal of historical weaponry at their disposal.',
  'Missions are based on key hi

## Generate N-Grams

In [292]:
def skipgram(tokens, window_size = 2):
    #print(tokens)
    sg = []
    sent = tokens
    non_stop_count = 0
    for s in sent:
        if s not in stopWords:
            non_stop_count += 1
    #print(non_stop_count)
    if window_size == 1:
        sg = [(s,) for s in sent if s not in stopWords]
    elif non_stop_count == window_size:
        sg.append(tuple(sent))
    elif non_stop_count < window_size or window_size < 1:
        return sg
    else:
        start = 0
        for i in range(0, non_stop_count-window_size+1):
            #sg.append(sent[i: i+window_size])
            gram = []
            gram_size = 0
            #print(start)
            for index, word in enumerate(sent[start:]):
                #print(index)
                gram.append(word)
                if word not in stopWords:
                    gram_size += 1
                    if(gram_size == 2):
                        start += index
                if gram_size == window_size:
                    break;
            #print(gram)
            sg.append(tuple(gram))
                        
    return sg

In [293]:
test_sent = ['day', 'of','the','dead', '2']

In [222]:
skipgram(test_sent, 2)

[('day', 'of', 'the', 'dead'), ('dead', '2')]

In [223]:
for t in ngrams(test_sent, 2):
    print(t)

('day', 'of')
('of', 'the')
('the', 'dead')
('dead', '2')


In [227]:
generate_ngrams("where is waldo's house and is it broken who where", 0)

[]

In [524]:
def create_tokens(s):
    s = s.lower()
    s = re.sub(r'’', '\'', s)
    s = re.sub(r'[^a-zA-Z0-9\s\']', ' ', s)
    return [token for token in s.split(" ") if token != ""]
def generate_ngrams(s, n):
    tokens = create_tokens(s)
    return list(skipgram(tokens, n))

In [501]:
doc_freq = {}
tri_grams = {}
def create_trigrams(sentences):
    seen_words = set()
    for sent in sentences:
        tgs = generate_ngrams(sent, 3)
        for t in tgs:
            if 'http' in t or 'https' in t or 'com' in t:
                continue
            seen_words.add(t)
            if t in tri_grams:
                tri_grams[t] = tri_grams[t]+1
            else:
                tri_grams[t] = 1
    for sw in seen_words:
        if sw in doc_freq:
            doc_freq[sw] = doc_freq[sw]+1
        else:
            doc_freq[sw] = 1

In [502]:
bi_grams = {}
def create_bigrams(sentences):
    seen_words = set()
    for sent in sentences:
        tgs = generate_ngrams(sent, 2)
        for t in tgs:
            #print(t)
            if 'http' in t or 'https' in t or 'com' in t:
                continue
            seen_words.add(t)
            if t in bi_grams:
                bi_grams[t] = bi_grams[t]+1
            else:
                bi_grams[t] = 1
    for sw in seen_words:
        if sw in doc_freq:
            doc_freq[sw] = doc_freq[sw]+1
        else:
            doc_freq[sw] = 1

In [503]:
one_grams = {}
def create_onegrams(sentences):
    seen_words = set()
    for sent in sentences:
        tgs = generate_ngrams(sent, 1)
        for t in tgs:
            #print(t)
            if 'http' in t or 'https' in t or 'com' in t:
                continue
            seen_words.add(t)
            if t in one_grams:
                one_grams[t] = one_grams[t]+1
            else:
                one_grams[t] = 1
    for sw in seen_words:
        if sw in doc_freq:
            doc_freq[sw] = doc_freq[sw]+1
        else:
            doc_freq[sw] = 1

In [504]:
df['sentences'].apply(lambda x: create_trigrams(x))
df['sentences'].apply(lambda x: create_bigrams(x))
df['sentences'].apply(lambda x: create_onegrams(x))

0        None
1        None
2        None
3        None
4        None
         ... 
27329    None
27330    None
27331    None
27332    None
27333    None
Name: sentences, Length: 27334, dtype: object

In [505]:
tri_grams = {k: v for k, v in tri_grams.items() if v != 1}

In [506]:
bi_grams = {k: v for k, v in bi_grams.items() if v != 1}

In [507]:
doc_freq = {k:v for k,v in doc_freq.items() if k in tri_grams or k in bi_grams or k in one_grams}

In [511]:
len(doc_freq)

540620

In [512]:
len(tri_grams) + len(bi_grams) + len(one_grams)

540620

In [510]:
len(tri_grams), len(bi_grams), len(one_grams)

(127713, 316126, 96781)

In [390]:
sortedx = sorted(tri_grams.items(), key=lambda kv: kv[1])
sortedx.reverse()
sortedx[:20]

[(('local', 'co', 'op'), 364),
 (('real', 'time', 'strategy'), 331),
 (('first', 'person', 'shooter'), 328),
 (('single', 'player', 'campaign'), 315),
 (('steam', 'trading', 'cards'), 278),
 (('different', 'game', 'modes'), 275),
 (('full', 'controller', 'support'), 260),
 (('turn', 'based', 'combat'), 254),
 (('turn', 'based', 'strategy'), 244),
 (('world', 'war', 'ii'), 214),
 (('point', 'and', 'click', 'adventure'), 206),
 (('co', 'op', 'mode'), 202),
 (('role', 'playing', 'game'), 200),
 (('single', 'player', 'mode'), 198),
 (('player', 'co', 'op'), 194),
 (('fast', 'paced', 'action'), 193),
 (('unreal', 'engine', '4'), 188),
 (('puzzle', 'adventure', 'game'), 187),
 (('object', 'puzzle', 'adventure'), 160),
 (('tower', 'defense', 'game'), 152)]

In [391]:
sortedx = sorted(bi_grams.items(), key=lambda kv: kv[1])
sortedx.reverse()
sortedx[:20]

[(('key', 'features'), 2990),
 (('game', 'modes'), 2139),
 (('single', 'player'), 2067),
 (('co', 'op'), 1792),
 (('real', 'time'), 1648),
 (('fast', 'paced'), 1591),
 (('store', 'steampowered'), 1552),
 (('first', 'person'), 1478),
 (('turn', 'based'), 1361),
 (('adventure', 'game'), 1216),
 (('puzzle', 'game'), 1214),
 (('game', 'features'), 1147),
 (('virtual', 'reality'), 1097),
 (('open', 'world'), 1037),
 (('early', 'access'), 1005),
 (('steam', 'achievements'), 997),
 (('power', 'ups'), 995),
 (('mini', 'games'), 896),
 (('game', 'play'), 893),
 (('procedurally', 'generated'), 870)]

In [392]:
sortedx = sorted(one_grams.items(), key=lambda kv: kv[1])
sortedx.reverse()
sortedx[:20]

[(('game',), 60183),
 (('new',), 22838),
 (('world',), 20064),
 (('play',), 18218),
 (('features',), 16201),
 (('time',), 15554),
 (('one',), 13995),
 (('different',), 13504),
 (('players',), 12229),
 (('levels',), 11774),
 (('player',), 11212),
 (('mode',), 11182),
 (('unique',), 11146),
 (('get',), 10765),
 (('story',), 10730),
 (('use',), 10107),
 (('find',), 9946),
 (('experience',), 9846),
 (('way',), 9438),
 (('like',), 9234)]

# calculating tfidf ~ P(Ci|Qt)

In [663]:
ogdf = pd.DataFrame.from_dict(one_grams, orient='index', columns=['freq'])

In [664]:
ogdf = ogdf.reset_index()

In [665]:
ogdf = ogdf.rename(columns={"index": "word"})

In [666]:
ogdf['tfidf'] = ogdf.apply(lambda x: x['freq'] * math.log2(len(df)/doc_freq[x['word']]), axis=1)

In [667]:
ogdf

Unnamed: 0,word,freq,tfidf
0,"(play,)",18218,24917.151544
1,"(world's,)",613,3451.119210
2,"(number,)",1919,8015.895258
3,"(1,)",4332,14348.208766
4,"(online,)",4509,14955.767141
...,...,...,...
96776,"(6120,)",1,14.738409
96777,"(horor,)",1,14.738409
96778,"(angelane,)",1,14.738409
96779,"(captainmarlene,)",2,29.476818


In [668]:
tfidf_sum = ogdf['tfidf'].sum()

In [669]:
ogdf['tfidf_norm'] = ogdf['tfidf'].apply(lambda x: x/ogdf['tfidf'].sum())

In [670]:
ogdf['PCiQt'] = ogdf['tfidf_norm']

In [671]:
ogdf

Unnamed: 0,word,freq,tfidf,tfidf_norm,PCiQt
0,"(play,)",18218,24917.151544,1.204736e-03,1.204736e-03
1,"(world's,)",613,3451.119210,1.668604e-04,1.668604e-04
2,"(number,)",1919,8015.895258,3.875658e-04,3.875658e-04
3,"(1,)",4332,14348.208766,6.937310e-04,6.937310e-04
4,"(online,)",4509,14955.767141,7.231063e-04,7.231063e-04
...,...,...,...,...,...
96776,"(6120,)",1,14.738409,7.125971e-07,7.125971e-07
96777,"(horor,)",1,14.738409,7.125971e-07,7.125971e-07
96778,"(angelane,)",1,14.738409,7.125971e-07,7.125971e-07
96779,"(captainmarlene,)",2,29.476818,1.425194e-06,1.425194e-06


# Probability of phrase

## OneGrams

In [672]:
ogdf_avg_freq = math.log2(ogdf['freq'].mean())
ogdf_avg_freq

5.233213074870589

In [673]:
ogdf['freqnorm'] = ogdf['freq'].apply(lambda x: x/ogdf_avg_freq)

In [674]:
ogdf_freqnorm_sum = ogdf['freqnorm'].sum()
ogdf_freqnorm_sum

695625.9850149562

In [675]:
ogdf['prob'] = ogdf['freqnorm'].apply(lambda x: x/bigdf_freqnorm_sum)

In [676]:
#ogdf['word'] = ogdf['word'].apply(lambda x: x[0])
ogdf

Unnamed: 0,word,freq,tfidf,tfidf_norm,PCiQt,freqnorm,prob
0,"(play,)",18218,24917.151544,1.204736e-03,1.204736e-03,3481.226493,5.216534e-03
1,"(world's,)",613,3451.119210,1.668604e-04,1.668604e-04,117.136450,1.755262e-04
2,"(number,)",1919,8015.895258,3.875658e-04,3.875658e-04,366.696325,5.494856e-04
3,"(1,)",4332,14348.208766,6.937310e-04,6.937310e-04,827.789723,1.240423e-03
4,"(online,)",4509,14955.767141,7.231063e-04,7.231063e-04,861.612156,1.291105e-03
...,...,...,...,...,...,...,...
96776,"(6120,)",1,14.738409,7.125971e-07,7.125971e-07,0.191087,2.863396e-07
96777,"(horor,)",1,14.738409,7.125971e-07,7.125971e-07,0.191087,2.863396e-07
96778,"(angelane,)",1,14.738409,7.125971e-07,7.125971e-07,0.191087,2.863396e-07
96779,"(captainmarlene,)",2,29.476818,1.425194e-06,1.425194e-06,0.382174,5.726791e-07


In [677]:
pciqt = dict(zip(ogdf['word'], ogdf['PCiQt']))
ppjci = dict(zip(ogdf['word'], ogdf['prob']))

In [681]:
ogdf['word'] = ogdf['word'].apply(lambda x: x[0])

## BiGrams

In [443]:
bigdf = pd.DataFrame.from_dict(bi_grams, orient='index', columns=['freq'])
bigdf = bigdf.reset_index()
bigdf = bigdf.rename(columns={"index": "word"})

In [447]:
bigdf_avg_freq = math.log2(bigdf['freq'].mean())
bigdf_avg_freq

2.244014267498927

In [448]:
bigdf['freqnorm'] = bigdf['freq'].apply(lambda x: x/bigdf_avg_freq)

In [450]:
bigdf_freqnorm_sum = bigdf['freqnorm'].sum()
bigdf_freqnorm_sum

667344.6874600659

In [452]:
bigdf['prob'] = bigdf['freqnorm'].apply(lambda x: x/bigdf_freqnorm_sum)

In [678]:
ppjci.update(dict(zip(bigdf['word'], bigdf['prob'])))

## TriGrams

In [466]:
tdf = pd.DataFrame.from_dict(tri_grams, orient='index', columns=['freq'])
tdf = tdf.reset_index()
tdf = tdf.rename(columns={"index": "word"})

In [467]:
tdf_avg_freq = math.log2(tdf['freq'].mean())
tdf_avg_freq

1.6024929800929582

In [468]:
tdf['freqnorm'] = tdf['freq'].apply(lambda x: x/tdf_avg_freq)

In [469]:
tdf_freqnorm_sum = tdf['freqnorm'].sum()
tdf_freqnorm_sum

242012.29260767365

In [470]:
tdf['prob'] = tdf['freqnorm'].apply(lambda x: x/tdf_freqnorm_sum)

In [679]:
ppjci.update(dict(zip(tdf['word'], tdf['prob'])))
ppjci

{('play',): 0.005216534361420871,
 ("world's",): 0.00017552615893901601,
 ('number',): 0.0005494856427470991,
 ('1',): 0.0012404230351122632,
 ('online',): 0.001291105139732501,
 ('action',): 0.0019313604274774274,
 ('game',): 0.017232774589603265,
 ('engage',): 0.00018697974190404152,
 ('incredibly',): 8.103409947755552e-05,
 ('realistic',): 0.0005672386963428886,
 ('brand',): 0.00026114169160258173,
 ('terrorist',): 3.836950293283548e-05,
 ('warfare',): 0.00015977748236210593,
 ('wildly',): 2.0616449337045925e-05,
 ('popular',): 0.00021160494527884636,
 ('team',): 0.0014039229319380024,
 ('based',): 0.0019588490265934886,
 ('ally',): 5.240014206499173e-05,
 ('teammates',): 5.411817950974556e-05,
 ('complete',): 0.001126746224184385,
 ('strategic',): 0.0003470435638402731,
 ('missions',): 0.0008112000134979321,
 ('take',): 0.002458225243868601,
 ('enemy',): 0.0013792977285631974,
 ('sites',): 3.8083163358709834e-05,
 ('rescue',): 0.0002562739188424459,
 ('hostages',): 2.11891284852972

In [680]:
ppjci[('single','player')]

0.0013802719275928176

## Setting up final functions

### Series with just single words

In [705]:
words = ogdf[['word']]
def complete_words(s):
    return words[words['word'].str.startswith(s)]['word'].to_list()
complete_words('ga')

['game',
 'games',
 'gaming',
 'gamer',
 'gameplay',
 'gamers',
 'gas',
 "game's",
 'gaining',
 "gamer's",
 "gallean's",
 'gallean',
 'galaxy',
 'gallop',
 'gain',
 'gates',
 'gatekeeper',
 'gatling',
 'gardna',
 'gargantuan',
 'gamespy',
 'gases',
 'gathering',
 'gauntlet',
 'garden',
 "garry's",
 "gaming's",
 'galactic',
 'gamespot',
 'gasses',
 'gap',
 'garrett',
 'gadgetry',
 'gangsters',
 'gandohar',
 "gandohar's",
 'gateways',
 'gangs',
 'gameranger',
 'gaps',
 'gamepad',
 'gateway',
 'gather',
 'gains',
 'garbage',
 'gamestates',
 'gay',
 'gasoline',
 'gameworld',
 'gambit',
 'gardes',
 'gave',
 'gallery',
 'garlic',
 'ganymede',
 "gametunnel's",
 'galore',
 'gang',
 'gangbangers',
 'gadget',
 'gags',
 'gambling',
 'gained',
 'gametypes',
 'gametype',
 'gadgets',
 'gato',
 'gabbit',
 'gabbiar',
 'gamespeak',
 'gardening',
 'gatherers',
 'galaxies',
 'gamemodes',
 'gabe',
 'gamerankings',
 'garage',
 'garnered',
 "gathering's",
 'garfield',
 "games'",
 'gaul',
 'galahad',
 'gathe

In [606]:
def PQcPi(Qc, Pi):
    if Qc not in doc_freq:
        return doc_freq[Pi]
    else:
        return doc_freq[Pi]/doc_freq[Qc]

In [521]:
PQcPi(('action', 'game'), ('online','action','game'))

0.006369426751592357

In [745]:
def remove_trailing_stopwords(tple):
    for i, e in reversed(list(enumerate(tple))):
        print(i)
        if e not in stopWords:
            break
    return tuple(list(tple)[:i+1])

In [751]:
remove_trailing_stopwords(('horror',))

0


('horror',)

In [866]:
from collections import Counter
def query_suggestions(query=''):
    candidates = {}
    tokens = create_tokens(query)
    orig_query = tuple(tokens)
    c_words = set()
    if not query.endswith(" "):
        partial_word = tokens[-1:]
        tokens = tokens[:-1]
        c_words = set(complete_words(partial_word[0]))
        c_words = c_words.difference(set(tokens))
    print("tokens: ", tokens)
    tple = tuple(tokens)
    if len(tple) > 0:
        fixed_tple = remove_trailing_stopwords(tple)
    else:
        fixed_tple = tple
    st = set(tple)
    for k,v in doc_freq.items():
        c = c_words & set(k)
        if st.issubset(k) and k!=orig_query and (len(c_words) == 0 or (c_words & set(k))):
            candidates[k] = PQcPi(fixed_tple,k)
            if len(c) > 0:
                candidates[k] *= pciqt[(list(c)[0],)] * ppjci[k]
        
    sugs = Counter(candidates).most_common(10)
        
    return sugs

In [867]:
pciqt[('world',)],pciqt[('experience',)]

(0.0013911615418473508, 0.0009773337075002426)

In [877]:
query_suggestions('screen ti')

tokens:  ['screen']
0


[(('title', 'screen'), 8.679396266018545e-11),
 (('screen', 'at', 'the', 'same', 'time'), 1.2102992064152576e-11),
 (('one', 'screen', 'at', 'a', 'time'), 7.477475367401099e-12),
 (('screen', 'at', 'a', 'time'), 4.3570771430949275e-12),
 (('returns', 'to', 'the', 'title', 'screen'), 2.992359403287946e-12),
 (('return', 'to', 'the', 'title', 'screen'), 1.3299375125724206e-12),
 (('edition', 'title', 'screen'), 6.649687562862103e-13),
 (('title', 'screen', 'and', 'includes'), 6.649687562862103e-13),
 (('button', 'on', 'the', 'title', 'screen'), 6.649687562862103e-13)]

In [869]:
set(('single', 'player')).issubset(('single','games','player'))

True

In [612]:
doc_freq[('how', )]

KeyError: ('the', 'world')

In [54]:
ogdf[ogdf['word'].str.startswith('ga')]['word'].to_list()

NameError: name 'ogdf' is not defined

In [55]:
stopwordsdf = pd.DataFrame()

In [59]:
stopwordsdf['word'] = list(stopWords)

In [62]:
words = words.append(stopwordsdf)

In [64]:
words = words.reset_index()
words

Unnamed: 0,index,word
0,0,play
1,1,world's
2,2,number
3,3,1
4,4,online
...,...,...
96955,174,was
96956,175,both
96957,176,wasn
96958,177,no


In [92]:
doc_freq_df = pd.DataFrame.from_dict(doc_freq, orient='index', columns=['freq']).reset_index().rename(columns={"index": "word"})

In [94]:
doc_freq_df['join'] = doc_freq_df['word'].apply(lambda x: ' '.join(map(str, x)))

In [138]:
doc_freq

Unnamed: 0,word,freq,join
0,"(1, online, action)",2,1 online action
1,"(team, based, game)",12,team based game
2,"(online, action, game)",3,online action game
3,"(online, action, games)",3,online action games
4,"(action, games, of, all, time)",3,action games of all time
...,...,...,...
540615,"(machinists,)",1,machinists
540616,"(horor,)",1,horor
540617,"(angelane,)",1,angelane
540618,"(captainmarlene,)",1,captainmarlene


In [141]:
doc_freq_df[doc_freq_df['join'].str.contains('single player')]['word'].to_list()

[('single', 'player', 'action'),
 ('single', 'player', 'missions'),
 ('bonus', 'single', 'player'),
 ('single', 'player', 'game'),
 ('new', 'single', 'player'),
 ('single', 'player', 'co'),
 ('addictive', 'single', 'player'),
 ('extensive', 'single', 'player'),
 ('single', 'player', 'campaigns'),
 ('4', 'single', 'player'),
 ('single', 'player', 'levels'),
 ('32', 'single', 'player'),
 ('single', 'player', 'campaign'),
 ('driven', 'single', 'player'),
 ('linear', 'single', 'player'),
 ('single', 'player', 'and', 'multiplayer'),
 ('massive', 'single', 'player'),
 ('single', 'player', 'skirmish'),
 ('through', 'a', 'rich', 'single', 'player'),
 ('single', 'player', 'experience'),
 ('a', 'cinematic', 'single', 'player'),
 ('deep', 'single', 'player'),
 ('single', 'player', 'option'),
 ('24', 'single', 'player'),
 ('enhanced', 'single', 'player'),
 ('single', 'player', 'career'),
 ('gripping', 'single', 'player'),
 ('single', 'player', 'modes'),
 ('multiplayer', 'and', 'single', 'player'),

In [146]:
phrases = doc_freq_df[['word','join']]
phrases

Unnamed: 0,word,join
0,"(1, online, action)",1 online action
1,"(team, based, game)",team based game
2,"(online, action, game)",online action game
3,"(online, action, games)",online action games
4,"(action, games, of, all, time)",action games of all time
...,...,...
540615,"(machinists,)",machinists
540616,"(horor,)",horor
540617,"(angelane,)",angelane
540618,"(captainmarlene,)",captainmarlene


# Save Data that is needed for final calculations
doc_freq, words, pciqt, ppjci

In [795]:
import pickle

In [160]:
pickle.dump(doc_freq, open("doc_freq.p", "wb")) 
pickle.dump(words, open("words.p", "wb"))
pickle.dump(pciqt, open("pciqt.p", "wb"))
pickle.dump(ppjci, open("ppjci.p", "wb"))
pickle.dump(phrases, open("phrases.p", "wb"))

# What's needed for a standalone python file

In [1]:
import pandas as pd
import pickle
from collections import Counter
import re
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

phrases = pickle.load(open("phrases.p", "rb"))
doc_freq = pickle.load(open("doc_freq.p", "rb"))
words = pickle.load(open("words.p", "rb"))
pciqt = pickle.load(open("pciqt.p", "rb"))
ppjci = pickle.load(open("ppjci.p", "rb"))

def create_tokens(s):
    return [token for token in s.split(" ") if token != ""]

def complete_words(s):
    return words[words['word'].str.startswith(s)]['word'].to_list()

def remove_trailing_stopwords(tple):
    for i, e in reversed(list(enumerate(tple))):
        if e not in stopWords:
            break
    return tuple(list(tple)[:i+1])

def PQcPi(Qc, Pi):
    if Qc not in doc_freq:
        return doc_freq[Pi]
    else:
        return doc_freq[Pi]/doc_freq[Qc]

def query_suggestions(query=''):
    if len(query) < 3:
        return []
    query = query.lower()
    query = re.sub(r'’', '\'', query)
    query = re.sub(r'[^a-zA-Z0-9\s\']', ' ', query)
    candidates = {}
    tokens = create_tokens(query)
    orig_query = tuple(tokens)
    c_words = set()
    if not query.endswith(" "):
        partial_word = tokens[-1:]
        tokens = tokens[:-1]
        c_words = set(complete_words(partial_word[0]))
        c_words = c_words.difference(set(tokens)).union({partial_word[0]})
    tple = tuple(tokens)
    if len(tple) > 0:
        fixed_tple = remove_trailing_stopwords(tple)
    else:
        fixed_tple = tple
    st = set(tple)
    for k in phrases[phrases['join'].str.contains(query)]['word'].to_list():
        c = c_words & set(k)
        if st.issubset(k) and k!=orig_query and (len(c_words) == 0 or (c_words & set(k))):
            candidates[k] = PQcPi(fixed_tple,k) * ppjci[k]
            if len(c) > 0 and (list(c)[0],) in pciqt :
                candidates[k] *= pciqt[(list(c)[0],)]
        
    sugs = Counter(candidates).most_common(10)
        
    return [' '.join(map(str, tup[0])) for tup in sugs]

In [2]:
query_suggestions('single')

['single player',
 'single player campaign',
 'single player mode',
 'single player game',
 'every single',
 'features single player',
 'single player story',
 'single player experience',
 'single player missions',
 'features single']