In [1]:
# Study: Natural Language Processing with Deep Learning
# Dataset: Dead By Daylight Steam Reviews
# Author: Willian Oliveira and Julierme Silva
# Start: 10/04/2023
# Study Motivation: Train a machine to classify products based on user reviews
# Notebook Motivation: The purpose of this notebook is to train a Support Vector Machine model to classify the reviews, using averaged Word2Vec word embeddings as features.
# Study Status: In Progress

In [2]:
# Importing the libraries and setting up the environment

import os
import random
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

SEED = 0

def set_seeds(seed=SEED):
    """
    Seed the random number generators this notebook relies on.

    Seeds NumPy's global generator and Python's `random` module, and stores
    the seed (as a string) in the PYTHONHASHSEED environment variable.

    :param seed: int, value applied to every generator (defaults to SEED)
    :return: None
    """
    np.random.seed(seed)
    random.seed(seed)
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment presumably only reaches processes launched from this kernel and
    # does not change hash randomization in the running kernel -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seeds() # Setting seed for reproducible code

In [3]:
# Creating useful functions for this notebook

def review_to_vector(review, model):
    """
    Turn one preprocessed review into a single fixed-length vector.

    The review is split on whitespace, every token present in the model's
    vocabulary is looked up, and the element-wise mean of those word vectors
    is returned. A review with no known token (including an empty review)
    yields an all-zero vector of length `model.vector_size`.

    :param review: str, whitespace-separated preprocessed review text
    :param model: object exposing `wv` (with `key_to_index` and item lookup)
                  and `vector_size`, e.g. a trained gensim Word2Vec model
    :return: review_vector: numpy.ndarray
    """
    embeddings = model.wv

    # Keep only the tokens the model has a vector for; unknown words are skipped.
    known_vectors = []
    for token in review.split():
        if token in embeddings.key_to_index:
            known_vectors.append(embeddings[token])

    if known_vectors:
        return np.mean(known_vectors, axis=0)

    # Nothing to average: fall back to a zero vector of the model's dimensionality.
    return np.zeros(model.vector_size)

In [4]:
# Importing the dataset

# Paths are assembled with os.path.join so they resolve on every operating
# system and contain no backslash escape sequences inside string literals.
PROCESSED_DIR = os.path.join('data', 'processed')

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only load archives that come from a trusted source.
# Each archive is opened once and closed as soon as both of its arrays are read.
with np.load(os.path.join(PROCESSED_DIR, 'dbd_proc_train.npz'), allow_pickle=True) as train_archive:
    X_train, y_train = train_archive['X_train'], train_archive['y_train']

with np.load(os.path.join(PROCESSED_DIR, 'dbd_proc_val.npz'), allow_pickle=True) as val_archive:
    X_val, y_val = val_archive['X_val'], val_archive['y_val']

with np.load(os.path.join(PROCESSED_DIR, 'dbd_proc_test.npz'), allow_pickle=True) as test_archive:
    X_test, y_test = test_archive['X_test'], test_archive['y_test']


In [5]:
# Verifying the shape of the dataset

# One row per split: (feature label, feature array, target label, target array).
shape_report = [
    ('X_train shape: ', X_train, 'y_train shape: ', y_train),
    ('X_val shape: ', X_val, 'y_val shape: ', y_val),
    ('X_test shape: ', X_test, 'y_test shape: ', y_test),
]
for x_caption, split_features, y_caption, split_labels in shape_report:
    print(x_caption, split_features.shape, y_caption, split_labels.shape)

X_train shape:  (138588,) y_train shape:  (138588,)
X_val shape:  (17324,) y_val shape:  (17324,)
X_test shape:  (17324,) y_test shape:  (17324,)


In [6]:
# Verifying the first 5 reviews and their labels

for sample_idx in range(5):
    sample_review = X_train[sample_idx]
    print('Review: ', sample_review)
    sample_label = y_train[sample_idx]
    # The trailing '\n' argument leaves a blank line between consecutive samples.
    print('Label:  ', sample_label, '\n')


Review:  played many game life none left feeling horrible large chunk playerbase play torment others way transcending game anything match sole purpose making miserable throw insult mock match abide rule play nice likely loose flame anyway survivor killer player stuck gameplay loop inherently unfun side higher level play optimal also boring frustrating everyone many perk greatly unbalanced dominating game killer feel absolutely powerless helpless survivor dance front mock survivor get targetted frustrated killer tunneled game early vicious cycle hatred u v mentality frustration many people call toxic wear badge honour acting accordingly devs history acknowledging state community leaving feature exploit game specifically bully player nothing actual gameplay even told people issue play another game one past stream therefore unlikely ever get better issue extends accessibility issue macroes people use purely annoy others cause rapidly flashing light persisting drilling sound may cause seve

In [7]:
# Training the Word2Vec model

# Word2Vec expects an iterable of token lists, so each review is split on whitespace.
tokenized_train_reviews = [review.split() for review in X_train]

# sg=1 selects the skip-gram training algorithm; min_count=2 drops words seen only once.
# The notebook-wide SEED is passed explicitly because Word2Vec takes its RNG seed
# from its own `seed` argument rather than from the generators seeded by set_seeds().
# NOTE(review): per the gensim documentation a fully deterministic run also needs
# workers=1 (and PYTHONHASHSEED set before the interpreter starts); workers=4 is
# kept here for training speed, so small run-to-run differences remain possible.
word2vec_model = Word2Vec(
    tokenized_train_reviews,
    window=5,
    min_count=2,
    workers=4,
    sg=1,
    seed=SEED,
)

In [8]:
# Vectorizing all preprocessed datasets

def _reviews_to_matrix(reviews):
    """Stack the averaged word vector of every review into one 2-D array, one row per review."""
    return np.vstack([review_to_vector(text, word2vec_model) for text in reviews])

# Every split is embedded with the same Word2Vec model that was fit on the training reviews.
X_train_vectors = _reviews_to_matrix(X_train)
X_test_vectors = _reviews_to_matrix(X_test)
X_val_vectors = _reviews_to_matrix(X_val)

In [9]:
# Verifying the first 5 vectorized reviews and their labels

for sample_idx in range(5):
    review_vector = X_train_vectors[sample_idx]
    review_label = y_train[sample_idx]
    # Each line pairs the averaged embedding with the label it will be trained against.
    print(f'Vectorized Review: {review_vector} -> Label {review_label}.\n')

Vectorized Review: [-0.11937132  0.02813475  0.02784893 -0.03650415  0.07002902 -0.3202045
  0.28634495  0.56452775 -0.23548026 -0.34144157  0.02482489 -0.26737189
  0.07036278  0.11755116  0.18514377 -0.15203267  0.20474285 -0.23744814
 -0.00765318 -0.37633148  0.17365935  0.11473239  0.10359175 -0.05636572
  0.00326007 -0.06139879 -0.2322841   0.05987832 -0.20943126 -0.05327559
  0.29092532  0.03262205  0.14835322 -0.30664673 -0.06608938  0.28154269
  0.09727833 -0.09791467 -0.0960035  -0.16280629  0.10677461 -0.1471927
 -0.06468121  0.05507277  0.22330019 -0.20338844 -0.10090797 -0.00578005
  0.19479306  0.16803268  0.09098385 -0.1530385  -0.03914877 -0.02920726
 -0.00336284  0.09817076  0.18606591  0.07089808 -0.07527822  0.18075196
  0.09734793 -0.02270308  0.05672899  0.06289259 -0.26193926  0.26059395
  0.07441698  0.3301622  -0.26471892  0.22633351  0.09501598  0.1125366
  0.23544538  0.05687068  0.3217434   0.09747277  0.10794196  0.02190048
 -0.02241277  0.04657407 -0.2082409