<a href="https://colab.research.google.com/github/susandong/w266_final_project_game_sentiment/blob/master/w266_Final_Project_Game_Review_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final Project: Game Review Sentiment Analysis Over Time
## Research Question: 
* Can we use sentiment analysis score to predict the active user base for video games over time

## Dataset: 
* Game Review: twitter/reddit/discord/steam reviews
* active user base: steam

## Algorithm: 
* Baseline(logistic Regression); 
* Transformer(Elmo/Bert)


In [131]:
#Load libraries
import pandas as pd
import nltk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [132]:
""" Download Data: There are 4 games with review data
Fall Guys (fg)
PlayerUnknown Battlegrounds (pubg)
Dota 2 (dota2)
Counterstrike Source: Go (csgo)

Review data has the following columns:
app: ID for the game
useful: how many users voted the review as useful
funny: how many users voted the review as funny
username: username of the person who wrote the review
games_owned: how many games the reviewer owns on Steam
num_reviews: how many reviews the reviewer has written on Steam
recommend: 1 for recommend (thumbs up), -1 for do not recommend (thumbs down)
hours_played: number of hours the reviewer played before writing the review
date: date review was written
text: text of the review
"""
#Fall Guys
fg_url = 'https://raw.githubusercontent.com/susandong/w266_final_project_game_sentiment/master/data/fallguys_reviews.csv'
fg_df = pd.read_csv(fg_url, error_bad_lines=False)
fg_df = fg_df.dropna()

#CS: Go
csgo_url = 'https://raw.githubusercontent.com/susandong/w266_final_project_game_sentiment/master/data/csgo_reviews.csv'
csgo_df = pd.read_csv(csgo_url, error_bad_lines=False)
csgo_df = csgo_df.dropna()

#PUBG
pubg_url = 'https://raw.githubusercontent.com/susandong/w266_final_project_game_sentiment/master/data/pubg_reviews.csv'
pubg_df = pd.read_csv(pubg_url, error_bad_lines=False)
pubg_df = pubg_df.dropna()

#dota2_url = 'https://raw.githubusercontent.com/susandong/w266_final_project_game_sentiment/master/data/dota2_reviews.csv'
#dota2_df = pd.read_csv(dota2_url, error_bad_lines=False)
#player_url = 'https://raw.githubusercontent.com/susandong/w266_final_project_game_sentiment/master/data/PlayerCountData.csv'
#player_df = pd.read_csv(player_url, error_bad_lines=False)

In [133]:
# Data Preprocessing
!pip install unidecode
import unidecode
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
#from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

tknzr = TweetTokenizer()
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
#lemma = WordNetLemmatizer()

#Convert accented characters
def remove_accents(text):
  try:
    text = unidecode.unidecode(text)
  except:
    pass
  return text

#Remove digits and punctuation
def remove_nonletters(text):
  try:
    #Remove digits AND punctuation
    #text = re.sub('[^a-zA-Z]', ' ', text)
    
    #Remove just digits that are by themselves
    text = re.sub('^\d+\s|\s\d+\s|\s\d+$', ' ', text)
  except:
    pass
  return text


#Use Tweet Tokenizer for some built-in emoji support
def tweet_tokenization(text):
  try:
    return tknzr.tokenize(text)
  except:
    pass

def remove_stopwords(token):
  try:
    return [item for item in token if item not in stop_words]
  except:
    pass

def remove_stemmer(token):
  try:
    return [stemmer.stem(i) for i in token]
  except:
    pass

def remove_lemmatizer(token):
  try:
    return [lemma.lemmatize(word=w, pow='v') for w in token]
  except:
    pass
  
def remove_shortwords(token):
  try:
    return [i for i in token if len(i) > 1]
  except:
    pass

def token_to_string(listTokens):
  return ' '.join(listTokens)

#Process text from dataframe. df = dataframe to clean, text = name of column with text
def process_text(df, text):  
  #Create new column for cleaned text
  df['cleaned'] = df[text]

  #Lower case all text
  df['cleaned'] = df['cleaned'].str.lower()

  #Clean URLs
  df['cleaned'] = df['cleaned'].str.replace('http\S+|www.\S+', '', case=False)

  #Remove accents from text
  df['cleaned'] = df['cleaned'].apply(remove_accents)

  #Remove numbers and punctuation from text
  df['cleaned'] = df['cleaned'].apply(remove_nonletters)
  
  #Tokenize
  df['cleaned'] = df['cleaned'].apply(tweet_tokenization)

  #Remove stopwords
  df['cleaned'] = df['cleaned'].apply(remove_stopwords)
  
  #Remove short words
  df['cleaned'] = df['cleaned'].apply(remove_shortwords)

  #Stemming - can decide to use or not
  #df['cleaned'] = df['cleaned'].apply(remove_stemmer)

  #Convert tokens back to string
  df['cleaned'] = df['cleaned'].apply(token_to_string)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [134]:
#preprocess all the datasets for all 3 games
process_text(fg_df, 'text')
#len(fg_df['cleaned'][11]) < 2
process_text(csgo_df, 'text')
process_text(pubg_df, 'text')

In [135]:
#concatenate all 3 games data to one large dataset
all_df=pd.concat([fg_df,csgo_df,pubg_df])
all_clean_df=all_df[["recommend","cleaned"]]


In [136]:
from nltk.tokenize import RegexpTokenizer
def remove_blankrow(df,column):
  tokenizer = RegexpTokenizer(r'\w+')
  df['lens'] =[len(tokenizer.tokenize(utterance)) for utterance in df[column]]
  df_clean=df[df['lens'] !=0]
  return df_clean

In [137]:
df_final=remove_blankrow(all_clean_df,'cleaned')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [138]:
df_final.shape

(440882, 3)

In [139]:
all_clean_df.shape

(452858, 3)

In [140]:
#check the length of reviews
print(df_final.lens.describe())
print(df_final['lens'].quantile(0.999))
#remove the extremely long reviews
df_final=df_final[df_final.lens<(df_final['lens'].quantile(0.999))]

count    440882.000000
mean         16.772234
std          47.195142
min           1.000000
25%           2.000000
50%           5.000000
75%          14.000000
max        7677.000000
Name: lens, dtype: float64
498.11900000006426


In [141]:
#check negative data vs positive data
df_final.recommend.value_counts()

 1    320468
-1    119973
Name: recommend, dtype: int64

In [142]:
#balance the negative and positive data
from sklearn.utils import resample

df_majority = df_final[df_final.recommend==1]
df_minority = df_final[df_final.recommend==-1]

df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=320468,    # to match majority class
                                 random_state=235) # reproducible results
 
# Combine majority class with upsampled minority class
df_balanced = pd.concat([df_majority, df_minority_upsampled])
 

#replace outcome label -1 with 0
df_balanced['recommend'] = df_balanced['recommend'].replace([-1],0)
# Display new class counts
df_balanced.recommend.value_counts()

1    320468
0    320468
Name: recommend, dtype: int64

In [143]:
# Build model

In [144]:
# Simple CNN
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np


max_features = 200000  # Only consider the top 20k words
maxlen = 500
embedding_dim = 16


In [145]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_balanced.cleaned,df_balanced.recommend,
test_size=0.2)


In [146]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re
layer = TextVectorization()




In [147]:
#layer.adapt(X_train.values)
vectorized_X_train = layer(X_train.values)
vectorized_X_test = layer(X_test.values)
print(vectorized_X_train)


tf.Tensor(
[[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]], shape=(512748, 492), dtype=int64)


In [148]:
X_train.values

array(['fun except team gamemodes hopefully gone soon',
       'good fpr first month two gets repetitive looting dying etc ... want good game play guys dead daylight instead steep learing curve get used game soo much fun',
       'game runs poorly good way practice shooting actual combat servers deathmatch unless want explore regions 100 ping practice means nothing hard game get better without spending thousands hours',
       ...,
       'many issues game sure released 1.0 half game still broken ... would recommend',
       'fun play friends lulz bugged af dsync make even worst',
       'game fun got first win hours lvl'], dtype=object)

In [149]:
from tensorflow.keras import layers

model = tf.keras.models.Sequential()
model.add(layers.Embedding(max_features + 1, embedding_dim))
#model.layers[0].trainable = False
#model.add(layers.Dropout(0.05))
model.add(layers.GlobalAveragePooling1D())
#model.add(layers.Dropout(0.05))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

model.compile(loss="binary_crossentropy", optimizer='adam', metrics="accuracy")

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 16)          3200016   
_________________________________________________________________
global_average_pooling1d_3 ( (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 3,200,033
Trainable params: 3,200,033
Non-trainable params: 0
_________________________________________________________________


In [150]:
#train model
#epochs = 10
#history = model.fit(
 #   vectorized_X_train,y_train,
 #   epochs=epochs,batch_size=32)

In [151]:
# evaluate model
#model.evaluate(vectorized_X_test, y_test, batch_size=512)

In [152]:
#BERT model

In [154]:
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer


In [155]:
tf.config.set_soft_device_placement(True)
tf.debugging.set_log_device_placement(True)

In [157]:


import json
import pandas as pd
import numpy as np
import os
import sys
import tensorflow as tf
from time import time
import io
import re

import pickle
from csv import reader
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

from tensorflow.keras import layers
from tensorflow.keras.backend import sparse_categorical_crossentropy
from tensorflow.keras.layers import Dense, Flatten

from datetime import datetime



In [158]:
!pip install transformers
from transformers import BertTokenizer, TFBertModel



In [160]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [204]:
class GameReviewData:
    DATA_COLUMN = "cleaned"
    LABEL_COLUMN = "recommend"

    def __init__(self, train, test, tokenizer, classes, max_seq_len=192):
        self.tokenizer = tokenizer
        self.max_seq_len = 0
        self.classes = classes
    
        ((self.train_x, self.train_y), (self.test_x, self.test_y)) = map(self._prepare, [train, test])

        print("max seq_len", self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])

    def _prepare(self, df):
        x, y = [], []
    
        for _, row in tqdm(df.iterrows()):
            text, label = row[GameReviewData.DATA_COLUMN], row[GameReviewData.LABEL_COLUMN]
            tokens = self.tokenizer.tokenize(text)
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            self.max_seq_len = max(self.max_seq_len, len(token_ids))
            x.append(token_ids)
            y.append(self.classes.index(label))

        return np.array(x), np.array(y)

    def _pad(self, ids):
        x = []
        for input_ids in ids:
            input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
        return np.array(x)




In [195]:
from sklearn.model_selection import train_test_split

train,test= train_test_split(df_balanced,
test_size=0.2)


In [201]:
classes = df_balanced.recommend.unique().tolist()
classes

[1, 0]

In [206]:
from tqdm import tqdm
processed_data= GameReviewData(train,test, tokenizer, [0,1], max_seq_len=500)

512748it [04:56, 1728.76it/s]
128188it [01:14, 1719.26it/s]


max seq_len 2661


In [220]:
processed_data.train_x[0]

array([  101,  1342, 15445,  3209,  6159, 23179,  3447,   119, 14609,
        1160,  1201,  1718, 11030,  1403,  1921,  1352,  1308,  1346,
        2469, 10118,  1346,  1736,  1677,   122,   120,   126,  1265,
        2789,  2860,  1948,  1649,  1712,  1774,  1336, 24545,  1363,
        2383,  1214,  1712,  1280,   102,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [207]:
bert_layer = TFBertModel.from_pretrained('bert-base-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=526681800.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [251]:
def create_model(max_seq_len):
    
  input_ids      = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
  
    
  bert_inputs = [input_ids]
  output         = bert_layer(bert_inputs)[0]

  cls_out = keras.layers.Dropout(0.5)(output)
  logits = keras.layers.Dense(units=256, activation="relu")(cls_out)
  logits = keras.layers.Dropout(0.5)(logits)
  logits = keras.layers.Dense(units=2, activation="softmax")(logits)

  model = keras.Model(inputs=input_ids, outputs=logits)
  model.build(input_shape=(None, max_seq_len))



  model.compile(optimizer=keras.optimizers.Adam(),
              loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

  model.summary()

  return model

In [252]:
model = create_model(processed_data.max_seq_len)

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 500)]             0         
_________________________________________________________________
tf_bert_model (TFBertModel)  ((None, 500, 768), (None, 108310272 
_________________________________________________________________
dropout_41 (Dropout)         (None, 500, 768)          0         
_________________________________________________________________
dense_6 (Dense)              (None, 500, 256)          196864    
_________________________________________________________________
dropout_42 (Dropout)         (None, 500, 256)          0         
_________________________________________________________________
dense_7 (Dense)              (None, 500, 2)            514       
Total params: 108,507,650
Trainable params: 108,507,650
Non-trainable params: 0
________________________________________

In [None]:
import datetime
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
log_dir = "log/feedback_detection/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%s")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

history = model.fit(
  x=processed_data.train_x, 
  y=processed_data.train_y,
  validation_split=0.1,
  batch_size=128,
  shuffle=True,
  epochs=5,
  callbacks=[tensorboard_callback]
)

Epoch 1/5
