In [1]:
import pandas as pd
import numpy as np
import string
import nltk
import matplotlib.pyplot as plt
import warnings
from nltk import word_tokenize, pos_tag, FreqDist, SnowballStemmer
from nltk.corpus import wordnet as wn, stopwords
from gensim import corpora, models
from operator import itemgetter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from transformers import pipeline
from wordcloud import WordCloud
from nltk.util import ngrams

# Notebook-wide display configuration: lift every pandas truncation limit
# (rows, columns, cell width) so frames and long comments render in full.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# Silence all Python warnings for the rest of the session.
warnings.simplefilter("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder holding the three Metacritic user-comment exports.
# Change this single line to run the notebook on another machine.
DATA_DIR = 'C:/Users/Michael/Documents/GitHub/EBAC/dataSources/videoGames'

userComments1 = pd.read_csv(f'{DATA_DIR}/metacritic_game_user_comments (0-100k).csv')
userComments2 = pd.read_csv(f'{DATA_DIR}/metacritic_game_user_comments (100-200k).csv')
userComments3 = pd.read_csv(f'{DATA_DIR}/metacritic_game_user_comments (200-300k).csv')

# Cast the third export's numeric columns to float64 before stacking
# (presumably to match the dtypes of the other two exports -- confirm).
userComments3['Unnamed: 0'] = userComments3['Unnamed: 0'].astype('float64')
userComments3['Userscore'] = userComments3['Userscore'].astype('float64')

# Stack the three exports. ignore_index=True gives every row a unique label;
# without it each file contributes its own 0..n index and labels collide.
# 'Unnamed: 0' is the row counter saved with the CSV and carries no information.
userComments = (
    pd.concat([userComments1, userComments2, userComments3], axis = 0, ignore_index = True)
    .drop(columns = 'Unnamed: 0')
    .dropna(how = 'all')
)

# Collapse individual consoles into platform families.
# Any Platform value missing from this map becomes NaN in 'platformCondensed';
# inspect userComments.Platform.unique() when adding new data.
platformMap = {
    'Nintendo64': 'Nintendo',
    'GameBoyAdvance': 'Nintendo',
    '3DS': 'Nintendo',
    'DS': 'Nintendo',
    'Wii': 'Nintendo',
    'Switch': 'Nintendo',
    'WiiU': 'Nintendo',
    'GameCube': 'Nintendo',
    'PlayStation': 'PlayStation',
    'PlayStation3': 'PlayStation',
    'PlayStation2': 'PlayStation',
    'PlayStation4': 'PlayStation',
    'PlayStationVita': 'PlayStation',
    'PSP': 'PlayStation',
    'Xbox360': 'Xbox',
    'Xbox': 'Xbox',
    'XboxOne': 'Xbox',
    'PC': 'PC',
    'Dreamcast': 'Others',
    'not specified': 'Others',
}

userComments['platformCondensed'] = userComments.Platform.map(platformMap)

In [3]:
# Extra tokens removed by preprocess_text on top of NLTK's English stop words.
# A few entries ("thing", "year") appear more than once; that is harmless because
# the list is only used for membership tests.
filter_list = (
    # modal verbs, directions, clock suffixes and tokenizer fragments
    ["would", "could", "left", "right", "a.m.", "p.m.", "'s", "! ! !", "...", ":", ";", "n't"]
    # generic gaming vocabulary and praise words
    + ["game", "games", "play", "fun", "much", "one", "great", "perfect", "time", "year", "lot", "thing", "etc"]
    # durations, vague nouns and review boilerplate
    + ["hour", "hours", "way", "ways", "everything", "anything", "thing", "review", "year", "years", "feel", "feels"]
    + ["thing", "nothing", "problem", "end", "begin", "kind", "piece", "work", "call", "anyone", "minute", "minutes"]
    # generic insults
    + ["waste", "crap", "garbage"]
)

# One stemmer shared by every call; building a SnowballStemmer per token is pure overhead.
_STEMMER = SnowballStemmer('english')

# Cache of the exclusion set, keyed by the filter_list contents it was built from.
_EXCLUDED_TOKENS_CACHE = {}


def _excluded_tokens():
    """Return NLTK English stop words plus filter_list as a frozenset, rebuilt only when filter_list changes."""
    key = tuple(filter_list)
    if key not in _EXCLUDED_TOKENS_CACHE:
        _EXCLUDED_TOKENS_CACHE.clear()
        _EXCLUDED_TOKENS_CACHE[key] = frozenset(stopwords.words('english')).union(key)
    return _EXCLUDED_TOKENS_CACHE[key]


def preprocess_text(tokens, needtokenizeBoolean = True, grams = False,  ngramsNumber = 2, furtherPreProcessNgrams = False):
    """Tokenize (optionally), build n-grams (optionally) and normalise text.

    Parameters
    ----------
    tokens : str or list
        Raw text when ``needtokenizeBoolean`` is True; otherwise an already
        tokenized sequence (word strings, or n-gram tuples when ``grams`` is True).
    needtokenizeBoolean : bool
        Run ``nltk.word_tokenize`` on ``tokens`` first.
    grams : bool
        Produce n-grams, each joined into a single space-separated string.
    ngramsNumber : int
        Size of the n-grams when ``grams`` is True.
    furtherPreProcessNgrams : bool
        When ``grams`` is True, also run the joined n-grams through the
        cleaning steps below instead of returning them as they are.

    Returns
    -------
    list of str
        With ``grams`` and not ``furtherPreProcessNgrams``: the joined n-grams.
        Otherwise: lower-cased tokens with stop words, ``filter_list`` entries,
        punctuation and numeric tokens removed, then Snowball-stemmed.
    """
    if needtokenizeBoolean:
        tokens = nltk.word_tokenize(tokens)
        if grams:
            tokens = list(ngrams(tokens, ngramsNumber))
    if grams:
        tokens = [' '.join(gram) for gram in tokens]
        if not furtherPreProcessNgrams:
            return tokens

    # Build the exclusion set once per call: the original rebuilt the whole
    # stop-word list for every single token, which dominated the run time.
    excluded = _excluded_tokens()

    cleaned = []
    for token in tokens:
        token = token.lower()
        if token in excluded:
            continue
        # Substring test against string.punctuation, exactly as before.
        if token in string.punctuation:
            continue
        if token.isnumeric():
            continue
        cleaned.append(_STEMMER.stem(token))

    return cleaned

def _noun_documents(texts, universal_tagset, preprocess):
    """POS-tag each text, keep only its nouns and return one ', '-joined string per text."""
    documents = []
    for text in texts:
        if universal_tagset:
            tagged = pos_tag(word_tokenize(text), tagset='universal')
            nouns = [word for word, tag in tagged if tag == 'NOUN']
        else:
            tagged = pos_tag(word_tokenize(text))
            nouns = [word for word, tag in tagged if tag in ('NN', 'NNS')]
        if preprocess:
            nouns = preprocess_text(tokens = nouns, needtokenizeBoolean = False)
        documents.append(', '.join(nouns))
    return documents


def wc(df, columnName, preProcessingFunctionBoolean = True, tfidfVectorizerBoolean = True, vectorizerMinDf = 2, vectorizerMaxDf = 0.7, countVectorizerBinary = True,
       nounTaggingBoolean = False, universalNounTagsetBoolean = False, ngrams = False, ngramsNumber = 2, furtherPreProcessNgrams = False,
       top = 10, features_improvements = 'Word Cloud', platform = 'All'):
    """Vectorize a text column, draw a word cloud of the term weights and print the top terms.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. It is only read, never modified.
    columnName : str
        Name of the text column in ``df``.
    preProcessingFunctionBoolean : bool
        Clean tokens with ``preprocess_text``.
    tfidfVectorizerBoolean : bool
        True uses ``TfidfVectorizer``; False uses ``CountVectorizer``.
    vectorizerMinDf, vectorizerMaxDf
        Passed to the vectorizer as ``min_df`` / ``max_df``.
    countVectorizerBinary : bool
        Passed as ``binary`` to ``CountVectorizer`` (ignored for TF-IDF).
    nounTaggingBoolean : bool
        Keep only nouns before vectorizing. Ignored when ``ngrams`` is True.
    universalNounTagsetBoolean : bool
        Use the universal tagset ('NOUN') instead of the default tags ('NN', 'NNS').
    ngrams : bool
        Vectorize n-grams built by ``preprocess_text``; requires
        ``preProcessingFunctionBoolean`` to be True.
    ngramsNumber : int
        Size of the n-grams.
    furtherPreProcessNgrams : bool
        Also clean the joined n-grams in ``preprocess_text``.
    top : int
        Number of highest-weighted terms to print.
    features_improvements, platform : str
        Used only to build the figure title.

    Returns
    -------
    None
        Shows the figure and prints a list of ``(term, rounded weight)`` pairs.

    Raises
    ------
    NotImplementedError
        If ``ngrams`` is True while ``preProcessingFunctionBoolean`` is False.
        That combination previously fell through every branch and failed with
        an UnboundLocalError.
    ValueError
        Whatever the vectorizer or the word-cloud generator raises is propagated unchanged.
    """
    if ngrams and not preProcessingFunctionBoolean:
        raise NotImplementedError("ngrams = True requires preProcessingFunctionBoolean = True")

    # Pick the vectorizer class and the keyword arguments shared by every mode.
    vectorizer_kwargs = {'min_df': vectorizerMinDf, 'max_df': vectorizerMaxDf}
    if tfidfVectorizerBoolean:
        vectorizer_class = TfidfVectorizer
    else:
        vectorizer_class = CountVectorizer
        vectorizer_kwargs['binary'] = countVectorizerBinary

    # Choose the documents and tokenizer for the requested mode.
    if ngrams:
        documents = df[columnName]
        further = bool(furtherPreProcessNgrams)
        vectorizer_kwargs['tokenizer'] = lambda text: preprocess_text(
            text, grams = True, ngramsNumber = ngramsNumber, furtherPreProcessNgrams = further)
    elif nounTaggingBoolean:
        # Nouns are kept in a local list: writing a 'Text_NounOnly' column into
        # df would mutate the (usually sliced) frame owned by the caller.
        documents = _noun_documents(df[columnName], universalNounTagsetBoolean, preProcessingFunctionBoolean)
    else:
        documents = df[columnName]
        if preProcessingFunctionBoolean:
            vectorizer_kwargs['tokenizer'] = preprocess_text

    tdm = vectorizer_class(**vectorizer_kwargs)
    tdmMatrix = tdm.fit_transform(documents)

    # Sum each term's column on the sparse matrix; densifying the whole
    # document-term matrix first needs documents x vocabulary memory.
    term_weights = np.asarray(tdmMatrix.sum(axis=0)).ravel()
    feature_names = tdm.get_feature_names_out()
    fd_tfidf = FreqDist(dict(zip(feature_names, term_weights)))

    cloud = WordCloud(background_color="white").generate_from_frequencies(fd_tfidf)
    plt.figure()
    plt.suptitle(f"{features_improvements} ({platform})", fontsize = 20, x = 0.5, y = 0.85, fontweight = 'bold', fontname = 'Calibri')
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    displayList = [(term, round(weight, 2)) for term, weight in fd_tfidf.most_common(top)]

    plt.show()
    print(displayList)

In [4]:
# Dimensions (rows, columns) of the combined comment table.
userComments.shape

(283983, 6)

In [5]:
# Shuffled working copy of the comments.
# SAMPLE_SIZE = None keeps every row; set an integer to iterate faster on a subset.
# The fixed seed makes the shuffle (and any subset) reproducible across runs.
SAMPLE_SIZE = None
RANDOM_SEED = 42

userCommentsTEST = userComments.sample(
    n = len(userComments) if SAMPLE_SIZE is None else min(SAMPLE_SIZE, len(userComments)),
    random_state = RANDOM_SEED,
).copy()

In [6]:
# Number of comments per user score, listed from the highest score down to the lowest.
userCommentsTEST['Userscore'].value_counts().sort_index(ascending = False)

Userscore
10.0    112531
9.0      53489
8.0      30271
7.0      17052
6.0      12036
5.0      10113
4.0       7952
3.0       7118
2.0       5983
1.0       7318
0.0      20120
Name: count, dtype: int64

In [7]:
# Assumption: game creators care more about fixing what players dislike than about
# polishing what is already good (possibly weight low user scores more heavily?).

# Bin boundaries: scores above HIGH_SCORE_ABOVE are 'High', scores below
# LOW_SCORE_BELOW are 'Low', everything in between (inclusive) is 'Medium'.
LOW_SCORE_BELOW = 3
HIGH_SCORE_ABOVE = 9

# .copy() makes this an independent frame, so the column assignment below
# is not a write into a slice of userCommentsTEST.
userCommentsTESTNotNull = userCommentsTEST[userCommentsTEST.Comment.notnull()].copy()

isHigh = userCommentsTESTNotNull.Userscore > HIGH_SCORE_ABOVE
isLow = userCommentsTESTNotNull.Userscore < LOW_SCORE_BELOW
isMedium = (userCommentsTESTNotNull.Userscore >= LOW_SCORE_BELOW) & (userCommentsTESTNotNull.Userscore <= HIGH_SCORE_ABOVE)

# Rows with a missing Userscore match none of the masks and keep a NaN scoreBin.
userCommentsTESTNotNull.loc[isHigh, 'scoreBin'] = 'High'
userCommentsTESTNotNull.loc[isMedium, 'scoreBin'] = 'Medium'
userCommentsTESTNotNull.loc[isLow, 'scoreBin'] = 'Low'

userCommentsTESTHigh = userCommentsTESTNotNull[userCommentsTESTNotNull.scoreBin == 'High']
userCommentsTESTLow = userCommentsTESTNotNull[userCommentsTESTNotNull.scoreBin == 'Low']

In [8]:
# Platform families that occur among the comments not binned as 'Medium'.
platformCondensed_list = list(
    userCommentsTESTNotNull.loc[userCommentsTESTNotNull.scoreBin != 'Medium', 'platformCondensed'].unique()
)
userCommentsTESTExtreme_list = [userCommentsTESTHigh, userCommentsTESTLow]

# Resolve each frame's bin label and per-platform counts once, instead of
# resetting the index and re-filtering the frame for every platform.
# An empty frame gets the label 'n/a' rather than raising on the first-row lookup.
extremeSummaries = [
    (frame.scoreBin.iloc[0] if len(frame) else 'n/a', frame.platformCondensed.value_counts())
    for frame in userCommentsTESTExtreme_list
]

for platform in platformCondensed_list:
    for scoreBinLabel, commentCounts in extremeSummaries:
        print(f"{platform}, {scoreBinLabel} Userscore; No. of comments: {commentCounts.get(platform, 0)}")

PC, High Userscore; No. of comments: 43259
PC, Low Userscore; No. of comments: 18236
PlayStation, High Userscore; No. of comments: 34452
PlayStation, Low Userscore; No. of comments: 7781
Nintendo, High Userscore; No. of comments: 16357
Nintendo, Low Userscore; No. of comments: 1009
Xbox, High Userscore; No. of comments: 18096
Xbox, Low Userscore; No. of comments: 6371
Others, High Userscore; No. of comments: 358
Others, Low Userscore; No. of comments: 19


In [9]:
%%time

# The parameters of this call can be tuned to iterate on experiments; the score bins,
# sample size and filter list defined in earlier cells can be adjusted as well.
# If ngrams = True, preProcessingFunctionBoolean needs to be True.

# Figure title prefix for each score bin.
SCORE_BIN_TITLES = {'High': 'Key Features', 'Low': 'Improvements Needed'}

for df in userCommentsTESTExtreme_list:
    # The bin label depends only on the frame, so resolve it once per frame
    # rather than once (or twice) per platform. An unexpected label falls back
    # to its own text instead of leaving the title undefined or stale.
    scoreBinLabel = df.scoreBin.max()
    features_improvements = SCORE_BIN_TITLES.get(scoreBinLabel, str(scoreBinLabel))

    for platform in platformCondensed_list:
        try:
            wc(df = df[df.platformCondensed == platform], columnName = 'Comment',
               preProcessingFunctionBoolean = True, vectorizerMinDf = 2, vectorizerMaxDf = 0.7,
               countVectorizerBinary = True, tfidfVectorizerBoolean = True,
               ngrams = False, ngramsNumber = 3, furtherPreProcessNgrams = False,
               nounTaggingBoolean = True, universalNounTagsetBoolean = False,
               top = 20, features_improvements = features_improvements, platform = platform)
        except ValueError:
            # Best effort: report the platform that could not be vectorized and keep going.
            print(f"\n\nNo Word Cloud for '{features_improvements} ({platform})' due to insufficient sample size (No. of comments = 0 or < vectorizerMinDf).\n\n")
            