In [1]:
import os.path
from pathlib import Path
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [2]:
def initialize_preprocessed_data():
    """Create an empty preprocessed-review CSV if one does not exist yet.

    The file is written to ``data/preprocessed_review_data.csv`` and contains
    only a header row (``steam_id``, ``review``, ``timestamp_created``,
    ``voted_up``). An existing file is left untouched.

    Args:
        None

    Returns:
        None
    """

    file = Path('data/preprocessed_review_data.csv')

    if file.exists():
        print("preprocessed data already exists")
        return

    # to_csv does not create missing parent directories, so make sure the
    # data directory exists before writing (e.g. on a fresh checkout).
    file.parent.mkdir(parents=True, exist_ok=True)

    column_names = ['steam_id', 'review', 'timestamp_created', 'voted_up']
    pd.DataFrame(columns=column_names).to_csv(file, index=False)
    print("preprocessed data initialized")
    
# English stopword set, filled on first use so the corpus is read once rather
# than once per review.
_STOP_WORDS_CACHE = {}


def remove_stopwords(tokens):
    """Drop English stopwords from a list of tokens.

    Membership is an exact string match against NLTK's English stopword list,
    so callers should lowercase the text first (as preprocess_review does).

    Args:
        tokens (list): the tokens to filter

    Returns:
        list: the tokens that are not stopwords, in their original order
    """
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    stop_words = _STOP_WORDS_CACHE['english']
    return [t for t in tokens if t not in stop_words]

def remove_punctuation(tokens):
    """Drop tokens that consist solely of ASCII punctuation characters.

    A token is removed when every one of its characters is in
    ``string.punctuation``; this covers single marks as well as
    multi-character tokens such as "...", "``", "''" or "--". Tokens that mix
    punctuation with other characters (e.g. "n't", "-now") are kept. Empty
    tokens are removed as well.

    Args:
        tokens (list): the tokens to filter

    Returns:
        list: the remaining tokens, in their original order
    """
    # Test character by character: `t in string.punctuation` would be a
    # substring check and would miss repeated marks like "!!" or "''".
    punctuation_chars = set(string.punctuation)
    return [t for t in tokens if not all(ch in punctuation_chars for ch in t)]

def preprocess_review(review):
    """Turn a raw review string into a cleaned list of tokens.

    The text is lowercased and tokenized; stopwords are then filtered out,
    followed by punctuation tokens.

    Args:
        review (str): the raw user review

    Returns:
        list: the tokens left after both filters, in their original order
    """

    lowered_text = review.lower()
    tokens = word_tokenize(lowered_text, language='english', preserve_line=False)
    return remove_punctuation(remove_stopwords(tokens))

def preprocess(data):
    """Return a copy of ``data`` whose ``review`` column holds token lists.

    Each review is converted to ``str`` and passed through preprocess_review;
    the input frame itself is not modified.

    Args:
        data (DataFrame): frame with a ``review`` column

    Returns:
        DataFrame: a new frame with the same columns and a tokenized ``review``
    """
    # NOTE(review): because of the str conversion, a missing review would
    # presumably be tokenized as the literal text 'nan' -- confirm that the
    # upstream cleansing removes such rows.
    tokenized_reviews = data['review'].astype(str).apply(preprocess_review)
    return data.assign(review=tokenized_reviews)


def update_preprocessed_data():
    """Back up the preprocessed reviews, then preprocess and append new rows.

    Rows of ``data/review_data.csv`` positioned after the number of rows
    already stored in ``data/preprocessed_review_data.csv`` are treated as
    new. When both files have the same number of rows nothing is written.
    Otherwise the current preprocessed file is first copied to
    ``data/backup_data/preprocessed_review_data_backup.csv`` and then
    rewritten with the newly preprocessed rows appended.

    Args:
        None

    Returns:
        None
    """

    preprocessed_path = Path('data/preprocessed_review_data.csv')
    backup_path = Path('data/backup_data/preprocessed_review_data_backup.csv')

    preprocessed_data = pd.read_csv(preprocessed_path)
    preprocessed_shape = preprocessed_data.shape[0]

    cleansed_data = pd.read_csv('data/review_data.csv')
    cleansed_shape = cleansed_data.shape[0]

    if preprocessed_shape == cleansed_shape:
        print("no new data to preprocess")
        return None

    # The backup directory may not exist yet; to_csv will not create it.
    backup_path.parent.mkdir(parents=True, exist_ok=True)
    preprocessed_data.to_csv(backup_path, index=False)

    # Positional slice: everything after the rows that were already processed.
    new_preprocessed_data = preprocess(cleansed_data.iloc[preprocessed_shape:])

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    updated_data = pd.concat([preprocessed_data, new_preprocessed_data], ignore_index=True)
    updated_data.to_csv(preprocessed_path, index=False)

def display_reviews(data):
    """Print every entry of the ``review`` column, each followed by a blank line.

    Args:
        data (DataFrame): frame with a ``review`` column

    Returns:
        None
    """
    # Iterate over the values instead of looking rows up by label with a
    # positional counter, which raises KeyError for any frame whose index is
    # not 0..n-1 (e.g. a filtered frame or the result of .tail()).
    for review in data['review']:
        print(review, '\n')

In [3]:
# Create the empty preprocessed-data CSV on first run; if the file already
# exists this only prints a notice.
initialize_preprocessed_data()

preprocessed data already exists


In [4]:
# Preprocess the rows of review_data.csv that are not yet in the preprocessed
# CSV; the current preprocessed file is backed up before it is rewritten.
update_preprocessed_data()

In [5]:
# Reload the preprocessed reviews from disk and show a small sample, without
# re-running the processing step.
# NOTE(review): after the CSV round trip the review column presumably holds the
# text form of each token list rather than a Python list -- confirm before
# using it downstream.
preprocessed_reviews = pd.read_csv('data/preprocessed_review_data.csv')

display(preprocessed_reviews.head())
display_reviews(preprocessed_reviews.head())

Unnamed: 0,steam_id,review,timestamp_created,voted_up
0,76561198066184692,"['survived', 'deerclop', 'raid', 'killed', 'ra...",1434424743,True
1,76561198101781146,"[""n't"", 'starve', 'back', 'fun', 'ever', 'posi...",1434337736,True
2,76561198041162914,"['favorite', 'survival', 'game', 'almost', 'st...",1466010582,True
3,76561198056008417,"['easily', 'enjoyable', 'game', 'ever', 'playe...",1452657532,True
4,76561198143487092,"['great', 'group', 'kids', 'work', 'together',...",1467918656,True


['survived', 'deerclop', 'raid', 'killed', 'random', 'frog', '10frograin/10'] 

["n't", 'starve', 'back', 'fun', 'ever', 'positives', '-now', 'get', 'die', 'friends', '-you', 'get', 'murdered', "'friends", 'great', 'game', 'dorito+mtn', 'dew/9,000', 'p.s', 'also', 'haunted', 'friend', 'die', 'insanity', 'well', 'friend', 'die', "'s", 'ok'] 

['favorite', 'survival', 'game', 'almost', 'starved', 'death'] 

['easily', 'enjoyable', 'game', 'ever', 'played', 'time', 'start', 'new', 'world', 'make', 'slightly', 'last', 'time', 'die', 'new', 'aspect', 'never', 'knew', 'takes', 'several', 'times', 'get', 'mechanic', 'finally', 'start', 'learning', 'aspects', 'game', 'crafting', 'fighting', 'playing', 'friends', 'makes', 'game', '1000x', 'enjoyable', 'game', 'deal', 'breaker', 'see', 'potential', 'losing', 'friends', 'game', 'p', '11/10', 'would', 'recommend'] 

['great', 'group', 'kids', 'work', 'together', 'gives', 'something', 'new', 'besides', 'minecraft', 'fun', 'whole', 'family'] 

