# Steam Descriptions

Reference: https://github.com/woctezuma/steam-descriptions

## Setup

In [1]:
# Mount Google Drive inside the Colab runtime at `mount_folder`.
from google.colab import drive

mount_folder = '/content/gdrive'
drive.mount(mount_folder)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Start from a fresh checkout: move to the Drive root, delete any existing copy of the
# repository (destructive: local changes in that folder are lost), clone it again and
# make the checkout the working directory.
%cd '/content/gdrive/My Drive/'
!rm -rf steam-descriptions/
!git clone https://github.com/woctezuma/steam-descriptions.git
%cd steam-descriptions/

/content/gdrive/My Drive
Cloning into 'steam-descriptions'...
remote: Enumerating objects: 26, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 592 (delta 10), reused 10 (delta 3), pack-reused 566[K
Receiving objects: 100% (592/592), 69.79 MiB | 13.49 MiB/s, done.
Resolving deltas: 100% (343/343), done.
Checking out files: 100% (18/18), done.
/content/gdrive/My Drive/steam-descriptions


In [3]:
# Install the packages listed in requirements.txt (resolved relative to the current working directory).
!pip install -r requirements.txt



## Load data

In [4]:
# Make sure the working directory is the repository checkout, then list the files in data/.
%cd '/content/gdrive/My Drive/steam-descriptions/'
!ls data/

/content/gdrive/My Drive/steam-descriptions
aggregate_prettyprint.json  README.md  tokens.json


In [0]:
from utils import load_raw_data
from gensim.parsing.preprocessing import strip_tags, remove_stopwords
from gensim.utils import simple_preprocess

In [0]:
# Load the raw Steam data with the helper imported from utils.
# NOTE(review): presumably a mapping app id -> record holding a 'text' field, since that is
# how it is indexed by the pre-processing loop — confirm against utils.load_raw_data.
steam_sentences = load_raw_data()

In [0]:
# Turn each raw description into a single line of plain text, keyed by app id.
pre_processed_steam_sentences = {}

# Literal substitutions applied, in this order, once surrounding whitespace is stripped:
# line breaks become spaces and three HTML entities are decoded.
# Reference: https://nicschrading.com/project/Intro-to-NLP-with-spaCy/
substitutions = (
  ('\n', ' '),
  ('\r', ' '),
  ('&amp;', 'and'),
  ('&gt;', '>'),
  ('&lt;', '<'),
)

for app_id in steam_sentences:
  # Remove HTML tags, then turn tabs into spaces and trim both ends.
  cleaned_str = str(strip_tags(steam_sentences[app_id]['text']))
  cleaned_str = cleaned_str.replace('\t', ' ').strip()

  for pattern, replacement in substitutions:
    cleaned_str = cleaned_str.replace(pattern, replacement)

  pre_processed_steam_sentences[app_id] = cleaned_str

In [8]:
# Sanity check: number of pre-processed descriptions (one entry per app id).
len(pre_processed_steam_sentences)

30885

## TensorFlow Hub

References:
-   https://tfhub.dev/google/universal-sentence-encoder/2
-   https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/semantic_similarity_with_tf_hub_universal_encoder.ipynb

In [0]:
# Install the latest Tensorflow version.
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
!pip3 install --quiet tensorflow-hub
!pip3 install --quiet seaborn

In [10]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

W0307 21:29:40.314654 140471953299328 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [0]:
# TF-Hub handle of the sentence encoder to load. The trailing #@param annotation lists the
# selectable handles and must stay on the same line as the assignment.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [0]:
# Fix a deterministic ordering (app ids sorted by their integer value) and build the
# list of descriptions in that same order, so that messages[k] belongs to app_id_list[k].
app_id_list = sorted(pre_processed_steam_sentences, key=int)
messages = [pre_processed_steam_sentences[app_id] for app_id in app_id_list]

In [13]:
# Load the Universal Sentence Encoder module selected by `module_url` from TF Hub.
embed = hub.Module(spec=module_url)

# From here on, only report log messages of level ERROR.
tf.logging.set_verbosity(tf.logging.ERROR)

Instructions for updating:
Colocations handled automatically by placer.


W0307 21:30:06.900073 140471953299328 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [0]:
def chunks(l, n):
    """Lazily split the sequence `l` into consecutive slices of at most `n` items.

    Slices are produced in order; the last one is shorter when len(l) is not a
    multiple of n, and nothing is produced for an empty sequence.
    """
    starts = range(0, len(l), n)
    for begin in starts:
        stop = begin + n
        yield l[begin:stop]

In [0]:
# Number of descriptions encoded per session.run call (and stored per .npy file).
num_elements_per_chunk = 1000

# Build the encoder graph ONCE, on a placeholder that receives a batch of strings.
# Calling embed(...) inside the loop would add a new copy of the encoder ops to the
# default graph for every chunk, making the graph grow with the number of chunks.
message_placeholder = tf.placeholder(tf.string, shape=[None])
message_encodings = embed(message_placeholder)

# A single session is shared by all chunks, so variables and lookup tables are
# initialised only once instead of once per chunk.
with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])

  for chunk_no, message_chunk in enumerate(chunks(messages, num_elements_per_chunk)):
    print('Chunk n°{}'.format(chunk_no))

    message_embeddings = session.run(message_encodings,
                                     feed_dict={message_placeholder: message_chunk})

    # Preview the first five messages of the chunk with the start of their embedding.
    # Only that slice is converted to Python lists, not the whole chunk.
    for i, message_embedding in enumerate(np.array(message_embeddings[:5]).tolist()):
      print("Message: {}".format(message_chunk[i]))
      print("Embedding size: {}".format(len(message_embedding)))
      message_embedding_snippet = ", ".join(
          (str(x) for x in message_embedding[:3]))
      print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

    # One file per chunk; the chunk number in the file name encodes the row offset.
    np.save('universal-sentence-encoder-features_'+str(chunk_no)+'.npy', message_embeddings)

In [0]:
import json

# Store the app ids in the same order as the embedding rows. The file holds the Python
# repr of the list on a single line followed by a newline (this is not JSON output).
with open('universal-sentence-encoder-appids.txt', 'w', encoding='utf-8') as f:
  f.write(str(app_id_list) + '\n')

In [22]:
# Stitch the per-chunk .npy files back into one (num_samples, num_features) matrix.
num_samples = len(messages)
num_features = 512

message_embeddings = np.zeros((num_samples, num_features))

# Walk over the row offset of each chunk; the chunk number is its position in that walk.
for chunk_no, my_start in enumerate(range(0, num_samples, num_elements_per_chunk)):
  # The last chunk may be shorter than num_elements_per_chunk.
  my_end = min(num_samples, my_start + num_elements_per_chunk)

  chunk_file_name = 'universal-sentence-encoder-features_' + str(chunk_no) + '.npy'
  message_embeddings[my_start:my_end, :] = np.load(chunk_file_name)

  print('Chunk n°{}: [{}, {}['.format(chunk_no, my_start, my_end))

np.save('universal-sentence-encoder-features.npy', message_embeddings)

Chunk n°0: [0, 1000[
Chunk n°1: [1000, 2000[
Chunk n°2: [2000, 3000[
Chunk n°3: [3000, 4000[
Chunk n°4: [4000, 5000[
Chunk n°5: [5000, 6000[
Chunk n°6: [6000, 7000[
Chunk n°7: [7000, 8000[
Chunk n°8: [8000, 9000[
Chunk n°9: [9000, 10000[
Chunk n°10: [10000, 11000[
Chunk n°11: [11000, 12000[
Chunk n°12: [12000, 13000[
Chunk n°13: [13000, 14000[
Chunk n°14: [14000, 15000[
Chunk n°15: [15000, 16000[
Chunk n°16: [16000, 17000[
Chunk n°17: [17000, 18000[
Chunk n°18: [18000, 19000[
Chunk n°19: [19000, 20000[
Chunk n°20: [20000, 21000[
Chunk n°21: [21000, 22000[
Chunk n°22: [22000, 23000[
Chunk n°23: [23000, 24000[
Chunk n°24: [24000, 25000[
Chunk n°25: [25000, 26000[
Chunk n°26: [26000, 27000[
Chunk n°27: [27000, 28000[
Chunk n°28: [28000, 29000[
Chunk n°29: [29000, 30000[
Chunk n°30: [30000, 30885[
