In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [2]:
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [t for t in tokens if t not in stop_words]
    return filtered_tokens

def remove_punctuation(tokens): # PROBLEMO
    filtered_tokens = [t for t in tokens if t not in string.punctuation and t != "..."]
    return filtered_tokens

def convert_to_lower(tokens):
    converted_tokens = [t.lower() for t in tokens]
    return converted_tokens

def preprocess(review):
    review_tokens = word_tokenize(review, language='english', preserve_line=False)
    intermediate_tokens = remove_stopwords(review_tokens)
    intermediate_tokens = remove_punctuation(intermediate_tokens)
    preprocessed_tokens = convert_to_lower(intermediate_tokens)
    
    return preprocessed_tokens

def display_reviews(data):
    for row in range(data.shape[0]):
        print(data.loc[row, 'review'], '\n\n')

In [3]:
# load data
review_data = pd.read_csv('data/review_data.csv')

In [4]:
# process data - look further into stemming / lemmatization / noise removal (?)
preprocessed_reviews = review_data.copy()
preprocessed_reviews['review'] = preprocessed_reviews['review'].astype(str).apply(preprocess)

In [5]:
# write back processed data
preprocessed_reviews.to_csv('data/preprocessed_reviews.csv')

In [11]:
# data exploration
print("There are a total of {0} reviews.\n".format(preprocessed_reviews.shape[0]))
print("Was recommended:\n", preprocessed_reviews['voted_up'].value_counts(), sep='')

display(preprocessed_reviews.head())
display_reviews(preprocessed_reviews.head())

There are a total of 10000 reviews.

Was recommended:
True     9074
False     926
Name: voted_up, dtype: int64


Unnamed: 0,steam_id,review,timestamp_created,voted_up
0,76561198066184692,"[survived, deerclop, raid, killed, random, fro...",1434424743,True
1,76561198101781146,"[do, n't, starve, back, fun, ever, here, posit...",1434337736,True
2,76561198041162914,"[favorite, survival, game, i, almost, i, starv...",1466010582,True
3,76561198056008417,"[easily, enjoyable, game, i, ever, played, eac...",1452657532,True
4,76561198143487092,"[great, group, kids, work, together, gives, so...",1467918656,True


['survived', 'deerclop', 'raid', 'killed', 'random', 'frog', '10frograin/10'] 


['do', "n't", 'starve', 'back', 'fun', 'ever', 'here', 'positives', '-now', 'get', 'die', 'friends', '-you', 'get', 'murdered', "'friends", 'great', 'game', 'dorito+mtn', 'dew/9,000', 'p.s', 'you', 'also', 'haunted', 'friend', 'die', 'insanity', 'well', 'friend', 'die', "'s", 'ok'] 


['favorite', 'survival', 'game', 'i', 'almost', 'i', 'starved', 'death'] 


['easily', 'enjoyable', 'game', 'i', 'ever', 'played', 'each', 'time', 'start', 'new', 'world', 'make', 'slightly', 'last', 'time', 'die', 'new', 'aspect', 'never', 'knew', 'takes', 'several', 'times', 'get', 'mechanic', 'finally', 'start', 'learning', 'aspects', 'game', 'crafting', 'fighting', 'playing', 'friends', 'makes', 'game', '1000x', 'enjoyable', 'this', 'game', 'deal', 'breaker', 'i', 'see', 'potential', 'losing', 'friends', 'game', 'p', '11/10', 'would', 'recommend'] 


['great', 'group', 'kids', 'work', 'together', 'gives', 'something', 'ne