In [42]:
# Silence TensorFlow's native logging. The variable is set *before*
# `import tensorflow` so the filter is already in place while the import runs;
# setting it after the import (as this cell used to) is too late for that.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# standard library
import random
import re
import warnings
from collections import Counter

# third-party
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

# hide Python-level warnings for the rest of the notebook
warnings.filterwarnings("ignore")

ValueError: tf.enable_eager_execution must be called at program startup.

In [None]:
# Read 'games-features-cleaned.csv' into a DataFrame and preview the first rows.
df = pd.read_csv('games-features-cleaned.csv')
df.head()

In [None]:
# Distinct values of the Reviews column, inspected before any text cleaning is applied.
df['Reviews'].unique()

RegEx Preprocessor, Encoding and Encoding Map Functions from Cornelia's Week 10 Notebook

In [None]:
def preprocessor(text):
    """Normalise one piece of raw text.

    Processing order:
      1. remove anything that looks like an HTML tag (``<...>``);
      2. collect emoticons such as ``:)``, ``;-(``, ``=D``, ``:P`` from the
         tag-free text;
      3. lowercase the text and replace every run of non-word characters
         with a single space;
      4. append the collected emoticons (space-separated, with the ``-``
         "nose" removed) directly to the end of the cleaned text.

    Args:
        text (str): raw text. Non-string values (e.g. NaN) are not handled
            and make ``re`` raise ``TypeError``.

    Returns:
        str: the cleaned text followed by the emoticons.
    """
    # Raw strings keep the regex escapes (\), \(, \W) out of Python's own
    # string-escape processing, which otherwise flags them as invalid escapes.
    text = re.sub(r'<[^>]*>', '', text)
    # Emoticons are captured before step 3, which would otherwise erase them.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [None]:
# define function for token encoder
def encode(text_tensor, label):
    """Turn one (text, label) dataset element into (token ids, label).

    Args:
        text_tensor: eager tensor whose ``.numpy()`` result holds the raw
            text at index 0.
        label: passed through unchanged.

    Returns:
        tuple: ``(encoded_text, label)`` where ``encoded_text`` is whatever
        ``encoder.encode`` returns for the text.
    """
    text = text_tensor.numpy()[0]
    # The previous version called encode(text) here, i.e. this function itself
    # with a missing argument, which raises TypeError on first use.
    # NOTE(review): `encoder` is not defined in this part of the notebook. It is
    # assumed to be a notebook-level token encoder exposing `.encode(text)`
    # that is created before this function is first called -- confirm.
    encoded_text = encoder.encode(text)
    return encoded_text, label

# expose `encode` as a TensorFlow op
def encode_map_fn(text, label):
    """Run `encode` on one (text, label) pair through ``tf.py_function``.

    Both outputs of the wrapped call are declared as ``tf.int64``.
    """
    output_dtypes = (tf.int64, tf.int64)
    wrapped_call = tf.py_function(encode, inp=[text, label], Tout=output_dtypes)
    return wrapped_call

Data Preprocessing

In [None]:
# Clean both free-text columns with `preprocessor`, overwriting them in `df`.
for text_col in ('DetailedDescrip', 'Reviews'):
    df[text_col] = df[text_col].apply(preprocessor)

In [None]:
# Drop rows whose cleaned Reviews text is exactly a single space
# (presumably games without any review text -- confirm against the raw data).
# .copy() gives df_r its own data, so adding or overwriting columns on it is
# unambiguous and never triggers pandas' SettingWithCopyWarning.
df_r = df[df['Reviews'] != ' '].copy()
df_r['Reviews'].head()

In [None]:
# (rows, columns) of df_r, the review-filtered frame.
df_r.shape

In [None]:
# Lowest Metacritic score among the rows kept in df_r.
df_r['Metacritic'].min()

In [None]:
# Highest Metacritic score among the rows kept in df_r.
df_r['Metacritic'].max()

In [None]:
# create sentiment column: positive = 1, negative = 0
#
# A game is labelled positive when its Metacritic score is at least
# POSITIVE_MIN_SCORE, and negative otherwise (missing scores compare False and
# therefore become 0).
# The earlier version enumerated the integer ranges 38-65 and 67-93: a score of
# 66 matched neither range, and every unmatched value -- including any score
# above 93 -- went through an 'unknown' label that was then silently turned
# into 0. A single threshold gives the same labels for integer scores up to 93
# and no longer mislabels higher scores.
POSITIVE_MIN_SCORE = 67

# assign() returns a new frame, so no column is written onto a slice of `df`.
df_r = df_r.assign(
    Sentiment=df_r['Metacritic'].ge(POSITIVE_MIN_SCORE).astype('int64')
)

In [None]:
# Spot-check the first 100 rows: score, review text and the derived label side by side.
df_r[['Metacritic', 'Reviews', 'Sentiment']].head(100)

In [None]:
# Number of rows per Sentiment value, to see how balanced the classes are.
df_r['Sentiment'].value_counts()

In [None]:
# Side-by-side count histograms of Metacritic and Sentiment in df_r
cols = ['Metacritic', 'Sentiment']
nrows, ncols = 1, 2

f, axs = plt.subplots(nrows, ncols, figsize=(10,3))
for col, ax in zip(cols, axs):
    # one histogram per column, each drawn on its own axis
    sns.histplot(data=df_r, x=col, stat="count", ax=ax)

    # x-axis label, then seaborn's default despine
    ax.set(xlabel=col)
    sns.despine()

Addressing Data Imbalance

In [None]:
# Balance the classes by drawing the same number of rows from each.
# The per-class size is the size of the smaller class, so the cell keeps
# working if the class counts change (it used to hard-code 223).
positive_rows = df_r[df_r.Sentiment.eq(1)]
negative_rows = df_r[df_r.Sentiment.eq(0)]
n_per_class = min(len(positive_rows), len(negative_rows))

# random_state makes the draw reproducible across kernel restarts
temp_positive = positive_rows.sample(
    n=n_per_class,
    replace=False,
    random_state=0
)

temp_negative = negative_rows.sample(
    n=n_per_class,
    replace=False,
    random_state=0
)

df_balanced = pd.concat(
    [temp_positive, temp_negative],
    axis=0)

# shuffle df_balanced (frac=1 retains all the data). sample() returns a new
# frame, so the result has to be assigned -- previously it was discarded and
# the rows stayed grouped by class.
df_balanced = df_balanced.sample(frac=1, random_state=0).reset_index(drop=True)

print('After preprocessing, our data contains', df_balanced.shape[0], 'reviews.')

In [None]:
# Keep only the model input (Reviews) and the label (Sentiment); all other columns are dropped.
df_balanced = df_balanced[['Reviews', 'Sentiment']]
df_balanced.head(5)

In [None]:
# Split the label column off the frame. pop() removes 'Sentiment' from
# df_balanced, so this cell cannot be re-run without rebuilding df_balanced.
target = df_balanced.pop('Sentiment')

# Pair every review with its label. Reviews is selected with double brackets,
# so the text array is 2-D and each dataset element carries its text at index 0.
review_label_arrays = (df_balanced[['Reviews']].values, target.values)
data_tf = tf.data.Dataset.from_tensor_slices(review_label_arrays)

In [None]:
# set random seed
# tf.random.set_seed is the TF 2.x API; tf.random.set_random_seed is the
# TF 1.x spelling and is absent from TF 2.x, so it is only used as a fallback.
try:
    tf.random.set_seed(0)
except AttributeError:
    tf.random.set_random_seed(0)

# define splits for training, validation, test
splits=[0.6, 0.2, 0.2]

# absolute sizes, computed once; validation receives whatever remains
n_examples = df_balanced.shape[0]
n_train = int(n_examples*splits[0])
n_test = int(n_examples*splits[2])

# shuffle data once; reshuffle_each_iteration=False keeps the order fixed so
# the take/skip partitions below stay disjoint on every pass over the data
data_tf = data_tf.shuffle(
    n_examples, reshuffle_each_iteration=False)

# first n_test elements -> test, next n_train -> train, remainder -> validation
data_tf_test = data_tf.take(n_test)
data_tf_train_valid = data_tf.skip(n_test)
data_tf_train = data_tf_train_valid.take(n_train)
data_tf_valid = data_tf_train_valid.skip(n_train)

In [None]:
# Build the tokenizer from tfds.features.text when that module is available,
# otherwise fall back to tfds.deprecated.text.
try:
    tokenizer = tfds.features.text.Tokenizer()
except AttributeError:
    tokenizer = tfds.deprecated.text.Tokenizer()

# Count how often each token occurs in the training split only
token_counts = Counter()

for text_tensor, _label in data_tf_train:
    # the raw review text sits at index 0 of the element's text array
    token_counts.update(tokenizer.tokenize(text_tensor.numpy()[0]))

print('Size of training vocabulary:', len(token_counts))