In [72]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on. Each call is a no-op
# when the package is already present locally (see the log output below).
# The final call is left as a bare expression, so its return value is what
# the cell displays.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/viviancai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/viviancai/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/viviancai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [291]:
# Load the raw review table from a CSV in the working directory.
# Later cells read its 'language' and 'review' columns.
data = pd.read_csv('10_CounterStrike.csv')

In [292]:
# Keep only the rows whose language is 'english', then discard the
# identifier, timestamp, purchase and author columns listed below.
data = (
    data.loc[data['language'] == 'english']
    .drop(
        columns=[
            'recommendationid',
            'timestamp_created',
            'timestamp_updated',
            'steam_purchase',
            'received_for_free',
            'written_during_early_access',
            'author.steamid',
            'author.num_games_owned',
            'author.num_reviews',
            'author.last_played',
        ]
    )
)

In [300]:
# Normalise the raw review text into lower-cased, space-separated tokens.
#  1. Missing reviews become '' *before* the string cast; casting first
#     would turn NaN into the literal token "nan", which is long enough to
#     survive the length filter below.
#  2. Everything except ASCII letters and '#' is replaced by a space.
#     regex=True is passed explicitly: pandas >= 2.0 defaults to
#     regex=False, which would search for the pattern as literal text and
#     leave the reviews unchanged.
#  3. Tokens of one or two characters are dropped and the rest lower-cased.
reviews = (
    data['review']
    .fillna('')
    .astype(str)
    .str.replace(r"[^a-zA-Z#]", " ", regex=True)
    .apply(lambda text: ' '.join(w.lower() for w in text.split() if len(w) > 2))
)

In [301]:
# Keep nouns and adjectives
def is_noun_or_adj(pos):
    """Return True when the POS tag starts with 'NN' or 'JJ'."""
    return pos[:2] in ('NN', 'JJ')
# Single shared lemmatizer instance used by noun_adj_processor.
lemmatizer = WordNetLemmatizer()


def noun_adj_processor(word, pos):
    """Lemmatize ``word`` according to its POS tag.

    Tags starting with 'NN' use the lemmatizer's default part of speech;
    every other tag is lemmatized with pos='a'.
    """
    if not pos.startswith('NN'):
        return lemmatizer.lemmatize(word, pos='a')
    return lemmatizer.lemmatize(word)

def keep_noun_and_adj(rev):
    """POS-tag the token list ``rev`` and keep only nouns and adjectives.

    Each kept token is lemmatized via ``noun_adj_processor``; the result is
    returned as one space-joined string.
    """
    kept = []
    for token, tag in nltk.pos_tag(rev):
        if is_noun_or_adj(tag):
            kept.append(noun_adj_processor(token, tag))
    return " ".join(kept)

In [302]:
# Stop words: NLTK's English list followed by extra words specific to this
# notebook (filler, generic praise, URL fragments, and similar terms).
stop_words = stopwords.words('english') + [
    'player', 'play', 'people', 'game', 'steam', 'review',
    'day', 'year', 'hour', 'minute', 'time', 'moment',
    'world', 'yes', 'lol', 'lmao', 'cool', 'love', 'get',
    'good', 'great', 'nice', 'best', 'fun', 'awesome',
    'ever', 'kinda', 'shit', 'yeah', 'new', 'old',
    'big', 'small', 'high', 'low', 'many', 'much', 'lot',
    'others', 'thank', 'http', 'thing', 'everyone', 'anyone', 'anything',
    'everything', 'cant', 'dont', 'guy', 'hello',
    'youtube', 'something', 'someone', 'pro', 'con',
    'haha', 'hehe', 'end', 'nothing', 'no', 'one',
    'fine', 'first', 'last', 'epic', 'english', 'bit',
    'terrible', 'overall', 'original', 'life', 'bad',
    'today', 'fps', 'gameplay', 'favorite', 'com',
    'man', 'word', 'version', 'pure', 'experience', 'www',
    'please', 'thanks', 'little', 'least',
]

In [303]:
def remove_stopwords(rev):
    """Join the tokens of ``rev`` that are not in the module-level ``stop_words``.

    ``rev`` is an iterable of tokens; the result is a space-joined string.
    """
    kept = filter(lambda token: token not in stop_words, rev)
    return " ".join(kept)

In [304]:
# First pass: per review, keep lemmatized nouns/adjectives, then strip stop words.
reviews = [
    remove_stopwords(keep_noun_and_adj(text.split()).split())
    for text in reviews
]

# Second pass over the already-reduced text (same two steps again).
reviews = [
    remove_stopwords(keep_noun_and_adj(text.split()).split())
    for text in reviews
]

In [305]:
# Drop reviews that ended up empty, then keep each word at most once per
# review. dict.fromkeys de-duplicates while preserving first-occurrence
# order; list(set(...)) yields the same words, but in an order that changes
# between kernel restarts because string hashing is randomised per process.
reviews = [list(dict.fromkeys(r.split())) for r in reviews if r]

In [306]:
# Flatten the per-review token lists into one list of words.
all_words = [word for tokens in reviews for word in tokens]

# Count occurrences, keep non-stop-words seen more than 5 times, and sort
# by count (ties broken by word), both descending.
freq_words = sorted(
    (
        (word, count)
        for word, count in nltk.FreqDist(all_words).items()
        if count > 5 and word not in stop_words
    ),
    key=lambda pair: (pair[1], pair[0]),
    reverse=True,
)
freq_words

[('classic', 94),
 ('strike', 76),
 ('counter', 67),
 ('server', 40),
 ('gold', 40),
 ('csgo', 40),
 ('childhood', 40),
 ('nostalgia', 34),
 ('graphic', 32),
 ('school', 31),
 ('shooter', 30),
 ('memory', 27),
 ('legendary', 27),
 ('community', 27),
 ('friend', 23),
 ('way', 19),
 ('hard', 19),
 ('nostalgic', 18),
 ('map', 15),
 ('source', 14),
 ('competitive', 14),
 ('offensive', 13),
 ('gun', 13),
 ('global', 13),
 ('bot', 13),
 ('simple', 12),
 ('real', 12),
 ('multiplayer', 12),
 ('mod', 12),
 ('computer', 12),
 ('skill', 11),
 ('online', 11),
 ('jogo', 11),
 ('history', 11),
 ('half', 11),
 ('free', 11),
 ('easy', 11),
 ('valve', 10),
 ('team', 10),
 ('que', 10),
 ('point', 10),
 ('perfect', 10),
 ('door', 10),
 ('active', 10),
 ('sure', 9),
 ('series', 9),
 ('popular', 9),
 ('kid', 9),
 ('full', 9),
 ('dead', 9),
 ('zombie', 8),
 ('weapon', 8),
 ('son', 8),
 ('reason', 8),
 ('person', 8),
 ('match', 8),
 ('lan', 8),
 ('guess', 8),
 ('veteran', 7),
 ('update', 7),
 ('system', 7),


''