In [1]:
# Study: Natural Language Processing with Deep Learning
# Dataset: Dead By Daylight Steam Reviews
# Authors: Willian Oliveira and Julierme Silva
# Start: 09/04/2023
# Study Motivation: Train a machine to classify products based on user reviews
# Notebook Motivation: The purpose of this notebook is to perform data cleaning and pre-processing on Dead By Daylight game reviews collected from the Steam platform.
# Study Status: In Progress

In [2]:
# Importing libraries and setting up the environment

import pandas as pd
import numpy as np
import os
import random
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from IPython.display import Markdown, display

# Fetch every NLTK resource this notebook relies on, so it also runs on a fresh environment.
nltk.download('stopwords')  # stopword lists read through nltk.corpus.stopwords
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer
# nltk.word_tokenize needs the Punkt tokenizer models; recent NLTK releases look
# for 'punkt_tab', older ones for 'punkt', so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')

SEED = 0


def set_seeds(seed=SEED):
    """
    Seed Python's `random` module and NumPy's global random generator, and
    export the seed through the PYTHONHASHSEED environment variable.

    :param seed: int, seed value (defaults to SEED)
    :return: None
    """
    # NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so this
    # presumably only affects processes launched afterwards -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_generator in (random.seed, np.random.seed):
        seed_generator(seed)


set_seeds()  # Seed everything once, before any random operation runs

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\willi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\willi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Creating useful functions for this notebook

# Compiled once: matches every character that is not an ASCII letter.
_NON_LETTERS_RE = re.compile('[^a-zA-Z]')

# Filled on first use by _get_preprocess_resources().
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Build the English stopword set and the lemmatizer once and reuse them afterwards."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stopwords'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_review(review):
    """
    This function takes a raw review text as input and performs the following preprocessing steps:
    1. Replace every character that is not an ASCII letter with a space and lowercase the text
    2. Tokenize the text
    3. Remove English stopwords
    4. Lemmatize the remaining words
    5. Join the words back into a single space-separated string

    The stopword set and the lemmatizer are created on the first call only, so
    applying this function to a whole column does not reload them for every review.

    :param review: str
    :return: clean_review: str (empty when no non-stopword token is left)
    """
    stopwords_list, lemmatizer = _get_preprocess_resources()

    review = _NON_LETTERS_RE.sub(' ', review).lower()
    words = nltk.word_tokenize(review)
    # Stopwords are filtered before lemmatizing, so the lemma itself is never checked.
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords_list]
    clean_review = ' '.join(words)

    return clean_review

def display_missing_values(df):
    """
    Show, as a markdown table, the percentage of missing values found in each
    column of a Pandas DataFrame, sorted from the highest percentage to the lowest.

    :param df: pandas.DataFrame to inspect
    :return: None
    """
    missing_share = (
        df.isnull()
        .sum()
        .div(len(df))
        .mul(100)
        .sort_values(ascending=False)
        .rename("% Missing Values")  # becomes the header of the value column
    )
    display(Markdown(missing_share.to_markdown()))

def unique_values_table(df, uv=3):
    """
    Display a markdown table with, for each column of a DataFrame, the column
    name and its number of unique values. For columns with at most `uv`
    unique values (3 by default), the relative frequency of each value is
    listed as well.

    :param df: pandas.DataFrame to summarise
    :param uv: int, largest number of unique values for which the value
        distribution is still listed
    :return: None
    """
    rows = ['|Column Name|Unique Values||\n|---|---|---|\n']
    for col_name, unique_values in df.nunique().items():
        if unique_values > uv:
            # Too many distinct values: report the count only.
            rows.append('|{}|{}|\n'.format(col_name, unique_values))
            continue

        shares = df[col_name].value_counts(normalize=True)
        # The backslash is doubled so the output contains a literal "\%"
        # without relying on an invalid escape sequence in the source.
        md_unique_str = ' '.join(
            f'{name}: {value*100:.2f}\\%' for name, value in shares.items()
        )
        rows.append('|{}|{}|{}\n'.format(col_name, unique_values, md_unique_str))

    display(Markdown(''.join(rows)))

In [4]:
# Read the raw Steam reviews from the CSV file and preview the first rows

df = pd.read_csv(filepath_or_buffer='data/raw/dbd_english_reviews.csv')
df.head(n=5)

Unnamed: 0,review_id,review_text,recommended
0,136024101,good game\n,True
1,136022433,BHVR implemented dc penalty cos they apparentl...,False
2,136022116,เกมหมาๆ คิลเก่งก็เก่งไปเลย กากก็กากสัส กดโซโล่...,False
3,136019421,This game is very fun unless you get fucked by...,True
4,136019209,Why?,False


In [5]:
# Quick overview of the dataset: number of rows, share of missing values per
# column and number of unique values per column

display('Reviews on DataFrame {}'.format(len(df)))
display_missing_values(df)
unique_values_table(df)

'Reviews on DataFrame 173648'

|             |   % Missing Values |
|:------------|-------------------:|
| review_text |           0.237262 |
| review_id   |           0        |
| recommended |           0        |

|Column Name|Unique Values||
|---|---|---|
|review_id|173648|
|review_text|140299|
|recommended|2|True: 81.03\% False: 18.97\%


In [6]:
# Drop reviews whose text is missing (review_text is the only column the call below checks).
# The rows are removed from df in place, so every later cell works on the filtered frame.

df.dropna(subset=['review_text'], inplace=True)

In [7]:
# Clean every review with preprocess_review and store the result in a new column.
# NOTE(review): reviews without ASCII letters, or made only of stopwords, end up
# as empty strings and are kept in the frame -- confirm this is intended.

df['preproc_review_text'] = df['review_text'].map(preprocess_review)
df.head()

Unnamed: 0,review_id,review_text,recommended,preproc_review_text
0,136024101,good game\n,True,good game
1,136022433,BHVR implemented dc penalty cos they apparentl...,False,bhvr implemented dc penalty co apparently dont...
2,136022116,เกมหมาๆ คิลเก่งก็เก่งไปเลย กากก็กากสัส กดโซโล่...,False,
3,136019421,This game is very fun unless you get fucked by...,True,game fun unless get fucked michael myers fun a...
4,136019209,Why?,False,


In [8]:
# Split the pre-processed reviews into train (80%), validation (10%) and test (10%) sets

# Features are the cleaned review texts; the target is the recommendation flag
X = df['preproc_review_text']
y = df['recommended']

# Step 1: hold out 20% of the rows as a temporary pool (test + validation)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Step 2: cut the pool in half -- one half is the test set, the other the validation set
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SEED
)

# Report how many samples and labels ended up in each set
for split_name, split_texts, split_labels in (
    ('Train', X_train, y_train),
    ('Validation', X_val, y_val),
    ('Test', X_test, y_test),
):
    print(f'{split_name} set: {len(split_texts)}, {len(split_labels)}')

Train set: 138588, 138588
Validation set: 17324, 17324
Test set: 17324, 17324


In [9]:
# Exporting the pre-processed splits to npz files

# Create the output folder first so the export also works when it does not exist yet
os.makedirs('data/processed', exist_ok=True)

# NOTE(review): the review texts are presumably stored as object arrays, in which
# case reading them back needs np.load(..., allow_pickle=True) -- confirm.
np.savez_compressed('data/processed/dbd_proc_train.npz', X_train=X_train, y_train=y_train)
np.savez_compressed('data/processed/dbd_proc_val.npz', X_val=X_val, y_val=y_val)
np.savez_compressed('data/processed/dbd_proc_test.npz', X_test=X_test, y_test=y_test)
