<a href="https://colab.research.google.com/github/susandong/w266_final_project_game_sentiment/blob/master/w266_Final_Project_Game_Review_Sentiment_Analysis_sj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final Project: Game Review Sentiment Analysis Over Time
## Research Question: 
* Can we use sentiment analysis score to predict the active user base for video games over time

## Dataset: 
* Game Review: twitter/reddit/discord/steam reviews
* active user base: steam

## Algorithm: 
* Baseline(logistic Regression); 
* Transformer(Elmo/Bert)


In [1]:
#Load libraries
import pandas as pd
import nltk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
""" Download Data: There are 4 games with review data
Fall Guys (fg)
PlayerUnknown Battlegrounds (pubg)
Dota 2 (dota2)
Counterstrike Source: Go (csgo)

Review data has the following columns:
app: ID for the game
useful: how many users voted the review as useful
funny: how many users voted the review as funny
username: username of the person who wrote the review
games_owned: how many games the reviewer owns on Steam
num_reviews: how many reviews the reviewer has written on Steam
recommend: 1 for recommend (thumbs up), -1 for do not recommend (thumbs down)
hours_played: number of hours the reviewer played before writing the review
date: date review was written
text: text of the review
"""
#Fall Guys
fg_url = 'https://raw.githubusercontent.com/susandong/w266_final_project_game_sentiment/master/data/fallguys_reviews.csv'
fg_df = pd.read_csv(fg_url, error_bad_lines=False)
fg_df = fg_df.dropna()

#CS: Go
csgo_url = 'https://raw.githubusercontent.com/susandong/w266_final_project_game_sentiment/master/data/csgo_reviews.csv'
csgo_df = pd.read_csv(csgo_url, error_bad_lines=False)
csgo_df = csgo_df.dropna()

#PUBG
pubg_url = 'https://raw.githubusercontent.com/susandong/w266_final_project_game_sentiment/master/data/pubg_reviews.csv'
pubg_df = pd.read_csv(pubg_url, error_bad_lines=False)
pubg_df = pubg_df.dropna()

#dota2_url = 'https://raw.githubusercontent.com/susandong/w266_final_project_game_sentiment/master/data/dota2_reviews.csv'
#dota2_df = pd.read_csv(dota2_url, error_bad_lines=False)
#player_url = 'https://raw.githubusercontent.com/susandong/w266_final_project_game_sentiment/master/data/PlayerCountData.csv'
#player_df = pd.read_csv(player_url, error_bad_lines=False)

In [3]:
# Data Preprocessing
!pip install unidecode
import unidecode
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
#from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

tknzr = TweetTokenizer()
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
#lemma = WordNetLemmatizer()

#Convert accented characters
def remove_accents(text):
  try:
    text = unidecode.unidecode(text)
  except:
    pass
  return text

#Remove digits and punctuation
def remove_nonletters(text):
  try:
    #Remove digits AND punctuation
    text = re.sub('[^a-zA-Z]', ' ', text)
    
    #Remove just digits that are by themselves
    #text = re.sub('^\d+\s|\s\d+\s|\s\d+$', ' ', text)
  except:
    pass
  return text


#Use Tweet Tokenizer for some built-in emoji support
def tweet_tokenization(text):
  try:
    return tknzr.tokenize(text)
  except:
    pass

def remove_stopwords(token):
  try:
    return [item for item in token if item not in stop_words]
  except:
    pass

def remove_stemmer(token):
  try:
    return [stemmer.stem(i) for i in token]
  except:
    pass

def remove_lemmatizer(token):
  try:
    return [lemma.lemmatize(word=w, pow='v') for w in token]
  except:
    pass
  
def remove_shortwords(token):
  try:
    return [i for i in token if len(i) > 1]
  except:
    pass

def token_to_string(listTokens):
  return ' '.join(listTokens)

#Process text from dataframe. df = dataframe to clean, text = name of column with text
def process_text(df, text):  
  #Create new column for cleaned text
  df['cleaned'] = df[text]

  #Lower case all text
  df['cleaned'] = df['cleaned'].str.lower()

  #Clean URLs
  df['cleaned'] = df['cleaned'].str.replace('http\S+|www.\S+', '', case=False)

  #Remove accents from text
  #df['cleaned'] = df['cleaned'].apply(remove_accents)

  #Remove numbers and punctuation from text
  df['cleaned'] = df['cleaned'].apply(remove_nonletters)
  
  #Tokenize
  df['cleaned'] = df['cleaned'].apply(tweet_tokenization)

  #Remove stopwords
  #df['cleaned'] = df['cleaned'].apply(remove_stopwords)
  
  #Remove short words
  df['cleaned'] = df['cleaned'].apply(remove_shortwords)

  #Stemming - can decide to use or not
  #df['cleaned'] = df['cleaned'].apply(remove_stemmer)

  #Convert tokens back to string
  df['cleaned'] = df['cleaned'].apply(token_to_string)


Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |█▍                              | 10kB 27.6MB/s eta 0:00:01[K     |██▊                             | 20kB 3.5MB/s eta 0:00:01[K     |████▏                           | 30kB 4.6MB/s eta 0:00:01[K     |█████▌                          | 40kB 4.9MB/s eta 0:00:01[K     |██████▉                         | 51kB 4.0MB/s eta 0:00:01[K     |████████▎                       | 61kB 4.5MB/s eta 0:00:01[K     |█████████▋                      | 71kB 4.9MB/s eta 0:00:01[K     |███████████                     | 81kB 5.4MB/s eta 0:00:01[K     |████████████▍                   | 92kB 5.6MB/s eta 0:00:01[K     |█████████████▊                  | 102kB 5.4MB/s eta 0:00:01[K     |███████████████▏                | 112kB 5.4MB/s eta 0:00:01[K     |████████████████▌               | 122kB 5.4MB/

In [4]:
#preprocess all the datasets for all 3 games
process_text(fg_df, 'text')
#len(fg_df['cleaned'][11]) < 2
process_text(csgo_df, 'text')
process_text(pubg_df, 'text')

In [5]:
fg_df

Unnamed: 0,app,useful,funny,username,games_owned,num_reviews,recommend,hours_played,date,text,cleaned
0,1097150,0,0,7.65612E+16,51,16,1,17.1,"11 October, 2020",ow i fell:( thats a sad face btwincase you did...,ow fell thats sad face btwincase you didnt kno...
1,1097150,0,0,7.65612E+16,1,1,1,50.4,"11 October, 2020",yes,yes
2,1097150,0,0,7.65612E+16,64,3,-1,8.1,"11 October, 2020",This Game is not fun. If your looking for a ga...,this game is not fun if your looking for game ...
3,1097150,0,0,floolp,1,1,1,15.3,"11 October, 2020",Fun but VERY HARD game!this is a very fun game...,fun but very hard game this is very fun game y...
4,1097150,0,0,7.65612E+16,6,1,1,34.9,"11 October, 2020",its fun,its fun
...,...,...,...,...,...,...,...,...,...,...,...
90653,1097150,3,0,7.65612E+16,0,1,-1,2.7,"8 September, 2020",A TOTAL GARBAGE!!i wish i never bought it and ...,total garbage wish never bought it and now wan...
90654,1097150,0,0,quazwaz,15,6,1,16.5,"8 September, 2020",this game is so simple my mom likes it 10/10,this game is so simple my mom likes it
90655,1097150,1,0,Elbourne,407,1,-1,18.7,"8 September, 2020","ruined by hackers and refused a refund, avoid.",ruined by hackers and refused refund avoid
90656,1097150,0,0,7.65612E+16,8,1,1,14.7,"8 September, 2020",very good game and fun to play,very good game and fun to play


In [6]:
#concatenate all 3 games data to one large dataset
all_df=pd.concat([fg_df,csgo_df,pubg_df])
all_clean_df=all_df[["recommend","cleaned"]]


In [7]:
from nltk.tokenize import RegexpTokenizer
def remove_blankrow(df,column):
  tokenizer = RegexpTokenizer(r'\w+')
  df['lens'] =[len(tokenizer.tokenize(utterance)) for utterance in df[column]]
  df_clean=df[df['lens'] !=0]
  return df_clean

In [8]:
df_final=remove_blankrow(all_clean_df,'cleaned')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [9]:
df_final.shape

(433197, 3)

In [10]:
all_clean_df.shape

(452858, 3)

In [11]:
#check the length of reviews
print(df_final.lens.describe())
print(df_final['lens'].quantile(0.999))
#remove the extremely long reviews
df_final=df_final[df_final.lens<(df_final['lens'].quantile(0.999))]

count    433197.000000
mean         24.593351
std          60.585161
min           1.000000
25%           2.000000
50%           6.000000
75%          21.000000
max        2667.000000
Name: lens, dtype: float64
727.8040000000619


In [12]:
#check negative data vs positive data
df_final.recommend.value_counts()

 1    315457
-1    117306
Name: recommend, dtype: int64

In [14]:
#balance the negative and positive data
from sklearn.utils import resample

df_majority = df_final[df_final.recommend==1]
df_minority = df_final[df_final.recommend==-1]

df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=315457,    # to match majority class
                                 random_state=235) # reproducible results
 
# Combine majority class with upsampled minority class
df_balanced = pd.concat([df_majority, df_minority_upsampled])
 

#replace outcome label -1 with 0
df_balanced['recommend'] = df_balanced['recommend'].replace([-1],0)
# Display new class counts
df_balanced.recommend.value_counts()

1    315457
0    315457
Name: recommend, dtype: int64

In [15]:
#save the balanced data to the disk
df_balanced.to_csv("../data/balanced_clean_data_1028.csv")

FileNotFoundError: ignored

In [None]:
# Build model

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_balanced.cleaned,df_balanced.recommend,
test_size=0.2)


In [None]:
#Bag of Words + XGBoost
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
#Start with Bag of Words
vectorizer = CountVectorizer(analyzer = "word", max_features=1500, min_df=.01, max_df=0.7) 
bow_train = vectorizer.fit_transform(X_train).toarray()

In [None]:
#XGB classifier
BOW_XGB = xgb.XGBClassifier(max_depth=7, n_estimators=300, objective="binary:logistic", 
                            random_state=1, tree_method='gpu_hist', predictor='gpu_predictor')
BOW_XGB_scores = cross_val_score(BOW_XGB, bow_train, y_train, cv=3, n_jobs=-1)
print("Averaged CV Accuracy: %0.5f" % BOW_XGB_scores.mean())

BOW_XGB.fit(bow_train, y_train)


Averaged CV Accuracy: 0.80878


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=1,
              nthread=None, objective='binary:logistic',
              predictor='gpu_predictor', random_state=1, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=1, tree_method='gpu_hist', verbosity=1)

In [None]:
#bow_test = (vectorizer.transform(X_test)).toarray()
XGB_pred = BOW_XGB.predict(bow_test)

NameError: ignored

In [None]:
BOW_XGB_scores_test = cross_val_score(BOW_XGB, bow_test, y_test, cv=3, n_jobs=-1)
print("Averaged CV Accuracy: %0.5f (+/- %0.5f)" % (BOW_XGB_scores_test.mean(), BOW_XGB_scores_test.std() * 2))

In [None]:
# Simple CNN
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np


max_features = 200000  # Only consider the top 20k words
maxlen = 500
embedding_dim = 16


In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re
layer = TextVectorization()




In [None]:
#layer.adapt(X_train.values)
vectorized_X_train = layer(X_train.values)
vectorized_X_test = layer(X_test.values)
print(vectorized_X_train)


In [None]:
X_train.values

In [None]:
from tensorflow.keras import layers

model = tf.keras.models.Sequential()
model.add(layers.Embedding(max_features + 1, embedding_dim))
#model.layers[0].trainable = False
#model.add(layers.Dropout(0.05))
model.add(layers.GlobalAveragePooling1D())
#model.add(layers.Dropout(0.05))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

model.compile(loss="binary_crossentropy", optimizer='adam', metrics="accuracy")

In [None]:
#train model
#epochs = 10
#history = model.fit(
#   vectorized_X_train,y_train,
#   epochs=epochs,batch_size=32)

In [None]:
# evaluate model
#model.evaluate(vectorized_X_test, y_test, batch_size=512)

In [None]:
#BERT model

In [None]:
tf.config.set_soft_device_placement(True)
tf.debugging.set_log_device_placement(True)

In [None]:


import json
import pandas as pd
import numpy as np
import os
import sys
import tensorflow as tf
from time import time
import io
import re

import pickle
from csv import reader
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

from tensorflow.keras import layers
from tensorflow.keras.backend import sparse_categorical_crossentropy
from tensorflow.keras.layers import Dense, Flatten

from datetime import datetime



In [None]:
!pip install transformers
from transformers import BertTokenizer, TFBertModel



In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
class GameReviewData:
    DATA_COLUMN = "cleaned"
    LABEL_COLUMN = "recommend"

    def __init__(self, train, test, tokenizer, classes, max_seq_len=192):
        self.tokenizer = tokenizer
        self.max_seq_len = 0
        self.classes = classes
    
        ((self.train_x, self.train_y), (self.test_x, self.test_y)) = map(self._prepare, [train, test])

        print("max seq_len", self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])

    def _prepare(self, df):
        x, y = [], []
    
        for _, row in tqdm(df.iterrows()):
            text, label = row[GameReviewData.DATA_COLUMN], row[GameReviewData.LABEL_COLUMN]
            tokens = self.tokenizer.tokenize(text)
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            self.max_seq_len = max(self.max_seq_len, len(token_ids))
            x.append(token_ids)
            y.append(self.classes.index(label))

        return np.array(x), np.array(y)

    def _pad(self, ids):
        x = []
        for input_ids in ids:
            input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
        return np.array(x)




In [None]:
from sklearn.model_selection import train_test_split

train,test= train_test_split(df_balanced,
test_size=0.2)


In [None]:
classes = df_balanced.recommend.unique().tolist()
classes

[1, 0]

In [None]:
from tqdm import tqdm
processed_data= GameReviewData(train,test, tokenizer, [0,1], max_seq_len=50)

503252it [03:33, 2358.49it/s]
125814it [00:53, 2373.34it/s]


max seq_len 1488


In [None]:
processed_data.train_x[0]

array([  101,  1307,  1148,  1338,  1541,  4927,  1894, 13798, 18268,
        2222,  2052,  1400,  1270,  1452,  1156,  1231, 14566,  2354,
        1181,   102,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0])

In [None]:
bert_layer = TFBertModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
import keras as keras
def create_model(max_seq_len,train_layers):
    
  input_ids      = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
  
   
  bert_inputs = [input_ids]
# Freeze layers, i.e. only train number of layers specified, starting from the top
    
  if not train_layers == -1:
        
    retrain_layers = []
    
    for retrain_layer_number in range(train_layers):

      layer_code = '_' + str(11 - retrain_layer_number)
      retrain_layers.append(layer_code)

    for w in bert_layer.weights:
      if not any([x in w.name for x in retrain_layers]):
              w._trainable = False

     # End of freezing section

  output         = bert_layer(bert_inputs)[0]
  print(output.shape)
  #cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
  #cls_out = keras.layers.Dropout(0.5)(cls_out)
  logits = keras.layers.Dense(units=256, activation="relu")(output)
  logits = keras.layers.Dropout(0.1)(logits)
  logits = keras.layers.Dense(units=1, activation="sigmoid")(logits)

  model = keras.Model(inputs=input_ids, outputs=logits)
  model.build(input_shape=(None, max_seq_len))



  model.compile(optimizer=keras.optimizers.Adam(learning_rate=2e-5),
                loss="binary_crossentropy",
                metrics=["accuracy"])

  model.summary()

  return model

In [None]:
model = create_model(processed_data.max_seq_len,train_layers=0)

(None, 50, 768)
Model: "functional_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 50)]              0         
_________________________________________________________________
tf_bert_model (TFBertModel)  ((None, 50, 768), (None,  108310272 
_________________________________________________________________
dense_6 (Dense)              (None, 50, 256)           196864    
_________________________________________________________________
dropout_40 (Dropout)         (None, 50, 256)           0         
_________________________________________________________________
dense_7 (Dense)              (None, 50, 1)             257       
Total params: 108,507,393
Trainable params: 108,507,393
Non-trainable params: 0
_________________________________________________________________


In [None]:

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

tf.config.set_soft_device_placement(True)
tf.debugging.set_log_device_placement(True)

Num GPUs Available:  1


In [None]:
import datetime
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
log_dir = "log/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%s")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

history = model.fit(
  x=processed_data.train_x, 
  y=processed_data.train_y,
  validation_split=0.1,
  batch_size=128,
  shuffle=True,
  epochs=2,
  callbacks=[tensorboard_callback]
)

Epoch 1/2
Epoch 2/2


In [None]:
model.evaluate(processed_data.test_x,processed_data.test_y,batch_size=1048)



[0.6033444404602051, 0.6690759062767029]

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir log