# Initialisation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Base folder for saved model artefacts; reassigned to a run-specific
# sub-folder further down, once the Word2Vec settings are known.
OUTPUT_FOLDER = 'model/'

# Random state used when sampling/shuffling the dataset so runs are repeatable.
seed = 88
    


In [2]:
# Read the Steam reviews CSV, using its first column as the row index
df = pd.read_csv("dataset/steam_dataset_200000.csv", index_col= 0)

def reduce_sample(df_sample, frac, random_state):
    """Draw a random fraction of the rows and renumber them from zero.

    Parameters
    ----------
    df_sample : pandas.DataFrame
        Frame to sample from; it is not modified.
    frac : float
        Fraction of rows to keep, forwarded to ``DataFrame.sample``.
    random_state : int
        Seed forwarded to ``DataFrame.sample`` so the draw is repeatable.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, in sampled order, with a fresh 0..n-1 index.
    """
    sampled_rows = df_sample.sample(frac=frac, random_state=random_state)
    return sampled_rows.reset_index(drop=True)
# Fraction of the dataset to keep; 1 keeps every row (in sampled order).
# The value is also embedded in the output file names further down.
frac_pop = 1
df = reduce_sample(df, frac_pop, seed)

Columns in dataset

In [3]:
print("Columns in the original dataset:\n")
print(df.columns)

Columns in the original dataset:

Index(['app_id', 'app_name', 'review_text', 'review_score', 'review_votes'], dtype='object')


Example of a Row in dataset

In [4]:
df.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,322330,Don't Starve Together,Early Access Review,1,0
1,218620,PAYDAY 2,"PAYDAY 2 is an action-packed, four-player co-o...",1,0
2,372800,RPG MO,Early Access Review,1,0
3,453090,Parkitect,Early Access Review,1,0
4,236390,War Thunder,This is my first diccusion and hope it don't h...,-1,0


In [5]:
df['review_score'].value_counts()

review_score
 1    100000
-1    100000
Name: count, dtype: int64

In [6]:
df.describe()

Unnamed: 0,app_id,review_score,review_votes
count,200000.0,200000.0,200000.0
mean,235172.063805,0.0,0.18333
std,123389.931619,1.000003,0.386938
min,10.0,-1.0,0.0
25%,206420.0,-1.0,0.0
50%,244210.0,0.0,0.0
75%,311240.0,1.0,0.0
max,562710.0,1.0,1.0


Clean data

In [7]:
def clean_dataset(df_to_clean, drop_columns):
    """Remove incomplete and duplicated rows, then drop unwanted columns.

    The steps run in this order: rows containing any missing value are
    removed, exact duplicate rows are removed, the columns listed in
    ``drop_columns`` are dropped, and the index is renumbered from zero.

    Duplicates are detected on the full set of columns, i.e. *before*
    ``drop_columns`` are removed, so rows that only become identical once
    those columns are gone are all kept.

    Parameters
    ----------
    df_to_clean : pandas.DataFrame
        Frame to clean; it is not modified.
    drop_columns : list
        Column labels to remove from the result.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame with a 0..n-1 index.
    """
    # The former bare ``describe()`` call was removed: its result was thrown
    # away, and it raised ValueError when every column had been dropped.
    return (
        df_to_clean
        .dropna()
        .drop_duplicates()
        .drop(columns=drop_columns)
        .reset_index(drop=True)
    )

df_cleaned = clean_dataset(df, ["app_id", "app_name", "review_votes"])

In [8]:
df_cleaned.head()

Unnamed: 0,review_text,review_score
0,Early Access Review,1
1,"PAYDAY 2 is an action-packed, four-player co-o...",1
2,Early Access Review,1
3,Early Access Review,1
4,This is my first diccusion and hope it don't h...,-1


## Tokenisation

In [9]:
def tokenize_text(df_to_tokenise, text_column, tokenized_text_column):
    """Strip punctuation from a text column and add a column of token lists.

    NOTE: ``df_to_tokenise`` is modified in place. ``text_column`` is
    overwritten with the punctuation-free text and ``tokenized_text_column``
    is added; the same frame object is then returned.

    Parameters
    ----------
    df_to_tokenise : pandas.DataFrame
        Frame holding the raw text; mutated by this function.
    text_column : str
        Column containing the text to process.
    tokenized_text_column : str
        Column that receives the token list of each row.

    Returns
    -------
    pandas.DataFrame
        The very same frame that was passed in.
    """
    import string
    from gensim.utils import simple_preprocess

    def blank_out_punctuation(text):
        # Every ASCII punctuation character becomes a space, so words that
        # were glued together by punctuation ("co-op") split apart.
        return "".join(" " if ch in string.punctuation else ch for ch in text)

    cleaned_lines = [blank_out_punctuation(text) for text in df_to_tokenise[text_column]]
    df_to_tokenise[text_column] = cleaned_lines

    # Tokens come from gensim's simple_preprocess with accent removal enabled.
    token_lists = [simple_preprocess(text, deacc=True) for text in df_to_tokenise[text_column]]
    df_to_tokenise[tokenized_text_column] = token_lists
    return df_to_tokenise
    
df_cleaned = tokenize_text(df_cleaned, 'review_text', 'tokenized_text')
df_cleaned['tokenized_text'].head()

0                              [early, access, review]
1    [payday, is, an, action, packed, four, player,...
2                              [early, access, review]
3                              [early, access, review]
4    [this, is, my, first, diccusion, and, hope, it...
Name: tokenized_text, dtype: object

# Stemming & Lemmatisation

In [10]:
df_to_be_stemmed = df_cleaned.copy()

### Porter Stemmer

In [11]:
def porter_stemmer_on_text(df_to_be_stemmed, token_text_column, stemmed_text_column):
    """Add a column of Porter-stemmed tokens to a copy of the frame.

    Parameters
    ----------
    df_to_be_stemmed : pandas.DataFrame
        Frame with a column of token lists; it is not modified.
    token_text_column : str
        Column holding the token list of each row.
    stemmed_text_column : str
        Column that receives the stemmed token list of each row.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with ``stemmed_text_column`` added.
    """
    from gensim.parsing.porter import PorterStemmer

    stemmer = PorterStemmer()
    result = df_to_be_stemmed.copy()

    # Stem row by row, keeping one output list per input list.
    stemmed_rows = []
    for tokens in result[token_text_column]:
        stemmed_rows.append([stemmer.stem(word) for word in tokens])

    result[stemmed_text_column] = stemmed_rows
    return result



### Lancaster

In [12]:
def lancaster_stemmer_on_text(df_to_be_stemmed, token_text_column, stemmed_text_column):
    """Add a column of Lancaster-stemmed tokens to a copy of the frame.

    Parameters
    ----------
    df_to_be_stemmed : pandas.DataFrame
        Frame with a column of token lists; it is not modified.
    token_text_column : str
        Column holding the token list of each row.
    stemmed_text_column : str
        Column that receives the stemmed token list of each row.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with ``stemmed_text_column`` added.
    """
    from nltk.stem.lancaster import LancasterStemmer

    stemmer = LancasterStemmer()
    result = df_to_be_stemmed.copy()
    token_rows = result[token_text_column]
    # One stemmed list per row, same order as the source tokens.
    result[stemmed_text_column] = [
        [stemmer.stem(token) for token in row] for row in token_rows
    ]
    return result


### Snowball

In [13]:
def snowball_stemmer_on_text(df_to_be_stemmed, token_text_column, stemmed_text_column):
    """Add a column of Snowball (English) stemmed tokens to a copy of the frame.

    Parameters
    ----------
    df_to_be_stemmed : pandas.DataFrame
        Frame with a column of token lists; it is not modified.
    token_text_column : str
        Column holding the token list of each row.
    stemmed_text_column : str
        Column that receives the stemmed token list of each row.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with ``stemmed_text_column`` added.
    """
    from nltk.stem.snowball import EnglishStemmer

    stemmer = EnglishStemmer()

    def stem_row(tokens):
        # Stem each token of a single review, preserving order.
        return [stemmer.stem(token) for token in tokens]

    result = df_to_be_stemmed.copy()
    result[stemmed_text_column] = [stem_row(tokens) for tokens in result[token_text_column]]
    return result


### Lemmatisation

In [14]:
def lemmatize_text(df_to_be_stemmed, token_text_column, lemmatized_text_column):
    """Add a column of WordNet-lemmatized tokens to a copy of the frame.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` on its own,
    without a part-of-speech argument.

    Parameters
    ----------
    df_to_be_stemmed : pandas.DataFrame
        Frame with a column of token lists; it is not modified.
    token_text_column : str
        Column holding the token list of each row.
    lemmatized_text_column : str
        Column that receives the lemmatized token list of each row.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with ``lemmatized_text_column`` added.
    """
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    result = df_to_be_stemmed.copy()

    lemma_rows = []
    for tokens in result[token_text_column]:
        lemma_rows.append([lemmatizer.lemmatize(token) for token in tokens])
    result[lemmatized_text_column] = lemma_rows

    return result


In [15]:
df_potter_stemmed = porter_stemmer_on_text(df_to_be_stemmed, 'tokenized_text', 'stemmed_text')
df_potter_stemmed.head(10)

Unnamed: 0,review_text,review_score,tokenized_text,stemmed_text
0,Early Access Review,1,"[early, access, review]","[earli, access, review]"
1,PAYDAY 2 is an action packed four player co o...,1,"[payday, is, an, action, packed, four, player,...","[paydai, is, an, action, pack, four, player, c..."
2,Early Access Review,1,"[early, access, review]","[earli, access, review]"
3,Early Access Review,1,"[early, access, review]","[earli, access, review]"
4,This is my first diccusion and hope it don t h...,-1,"[this, is, my, first, diccusion, and, hope, it...","[thi, is, my, first, diccus, and, hope, it, do..."
5,Early Access Review,1,"[early, access, review]","[earli, access, review]"
6,Uplay More like Udont,-1,"[uplay, more, like, udont]","[uplai, more, like, udont]"
7,This game doesn t even start Don t bother wit...,-1,"[this, game, doesn, even, start, don, bother, ...","[thi, game, doesn, even, start, don, bother, w..."
8,Early Access Review,-1,"[early, access, review]","[earli, access, review]"
9,Its like someone played Guacamelee and made an...,-1,"[its, like, someone, played, guacamelee, and, ...","[it, like, someon, plai, guacamele, and, made,..."


## Split into Train and Test Sets

- Train data (subset of the data used to train the ML model): ~70%
- Test data (subset of the data used to evaluate the model trained on the train data): ~30%

In [16]:
from sklearn.model_selection import train_test_split

def split_train_test(data, sentiment_value_col, tokenised_text_col, test_size=0.3, shuffle_state=True, random_state=15):
    """Split token lists and sentiment labels into train and test frames.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the feature and the label column.
    sentiment_value_col : str
        Name of the label column.
    tokenised_text_col : str
        Name of the feature column (token lists).
    test_size : float, default 0.3
        Share of rows assigned to the test split.
    shuffle_state : bool, default True
        Whether rows are shuffled before splitting.
    random_state : int, default 15
        Seed forwarded to ``train_test_split``. It used to be hard-coded;
        the default reproduces the earlier behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, Y_train, Y_test)``. Every frame has a fresh
        0..n-1 index and keeps the original row label in an ``index``
        column next to the feature/label column.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        data[tokenised_text_col],
        data[sentiment_value_col],
        shuffle=shuffle_state,
        test_size=test_size,
        random_state=random_state,
    )

    # Report the class balance of both splits.
    print("Value counts for Train sentiments")
    print(Y_train.value_counts())
    print("Value counts for Test sentiments")
    print(Y_test.value_counts())
    print(type(X_train))
    print(type(Y_train))

    # reset_index() without drop=True turns each Series into a two-column
    # frame: the original row label ("index") plus the values.
    X_train = X_train.reset_index()
    X_test = X_test.reset_index()
    Y_train = Y_train.to_frame().reset_index()
    Y_test = Y_test.to_frame().reset_index()

    print(X_train.head())
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = split_train_test(df_potter_stemmed, 'review_score', 'stemmed_text')

Value counts for Train sentiments
review_score
 1    58126
-1    54644
Name: count, dtype: int64
Value counts for Test sentiments
review_score
 1    24887
-1    23444
Name: count, dtype: int64
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
    index                                       stemmed_text
0   79666  [quit, possibl, the, best, propaganda, in, the...
1   39512  [best, game, ever, for, real, am, pretti, sure...
2   27798                                        [top, game]
3   68251                                    [steam, econom]
4  110354  [back, in, dai, decid, to, bui, lego, game, to...


# Word2Vec 

## Skip-gram approach

### Generate model

vector_size (int, optional) – Dimensionality of the word vectors.

window (int, optional) – Maximum distance between the current and predicted word within a sentence.

min_count (int, optional) – Ignores all words with total frequency lower than this.

workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines).

sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.

In [17]:
def generate_word2vec_model(stemmed_df, file_name_code, stem_column_name, sg, vector_size, window, min_count, workers):
    """Train a Word2Vec model on a token column and save its word vectors.

    The vectors are written to the notebook-level ``OUTPUT_FOLDER``; the file
    name is built from ``file_name_code``, ``window``, ``min_count`` and
    ``workers``.

    Parameters
    ----------
    stemmed_df : pandas.DataFrame
        Frame containing the token lists to train on.
    file_name_code : str
        Prefix of the saved file name.
    stem_column_name : str
        Column holding one token list per row.
    sg : int
        Training algorithm flag passed to ``Word2Vec``.
    vector_size : int
        Dimensionality of the word vectors.
    window : int
        Context window size.
    min_count : int
        Minimum total frequency for a word to be kept.
    workers : int
        Number of worker threads.

    Returns
    -------
    tuple
        ``(word_vectors, path)``: the model's ``wv`` object and the path it
        was saved to.
    """
    from gensim.models import Word2Vec

    filename = f"{file_name_code}_wind_{window}_min_{min_count}_workers_{workers}.wordvectors"
    sentences = pd.Series(stemmed_df[stem_column_name]).values

    # Train on the token lists; only the word vectors are kept afterwards.
    trained_model = Word2Vec(
        sentences,
        min_count=min_count,
        vector_size=vector_size,
        workers=workers,
        window=window,
        sg=sg,
        cbow_mean=1,
    )
    word_vectors = trained_model.wv

    output_path = OUTPUT_FOLDER + filename
    word_vectors.save(output_path)
    return word_vectors, output_path

import os

# Word2Vec settings for this run.
vector_size = 100  # dimensionality of the word vectors
sg = 1  # 1 selects skip-gram; any other value selects CBOW

# Run identifier embedded in the output folder and file names.
file_name_code = f"_cs3244_steam_vec_sz_{vector_size}_sg_{sg}_frac_pop_{frac_pop}"
OUTPUT_FOLDER = 'model/' + file_name_code + '/'

# exist_ok=True makes re-running the cell safe and avoids the gap between
# checking for the folder and creating it.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)


# print(w2v_model_cbow_wv.most_similar('good'))
# print(w2v_model_sg_wv.most_similar('good'))


### Load model

In [18]:
from gensim.models import KeyedVectors
# Load the model from the model file

def load_word2vec_model(reduced_mode_file, stem_column_name):
    """Load previously saved word vectors from disk.

    Parameters
    ----------
    reduced_mode_file : str
        Path of the saved word-vector file, passed to ``KeyedVectors.load``.
    stem_column_name : str
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    The object returned by ``KeyedVectors.load``.
    """
    word_vectors = KeyedVectors.load(reduced_mode_file)
    return word_vectors
# output_name = OUTPUT_FOLDER + f"cs3244_steam_vec_sz_{vector_size}_sg_{sg}_frac_pop_{frac_pop}.wordvectors"
# w2v_model_wv = load_word2vec_model(output_name, 'stemmed_text')

In [21]:
def save_word2vec_to_csv(X_set, sg_w2v_model_wv, stem_col, word2vec_filename):
    """Write one averaged word vector per review to a CSV file.

    The first line is a header ``0,1,...,dim-1``. Each following line is the
    element-wise mean of the vectors of the review's tokens. Tokens that the
    model does not know are skipped; a review with no known tokens (including
    an empty token list) is written as a row of zeros.

    Parameters
    ----------
    X_set : pandas.DataFrame
        Frame with one token list per row in ``stem_col``.
    sg_w2v_model_wv : mapping-like
        Word vectors supporting ``token in wv`` and ``wv[token]``.
    stem_col : str
        Column holding the token lists.
    word2vec_filename : str
        Destination path; an existing file is overwritten.
    """
    # Prefer the width reported by the vectors themselves so header and zero
    # rows always match the data; fall back to the notebook-level setting for
    # vector objects that do not expose it.
    try:
        dim = sg_w2v_model_wv.vector_size
    except AttributeError:
        dim = vector_size

    header = ",".join(str(column) for column in range(dim))
    zero_line = ",".join("0" for _ in range(dim))

    with open(word2vec_filename, 'w') as word2vec_file:
        # Written up front: the header must not depend on a row happening to
        # carry the index label 0.
        word2vec_file.write(header + "\n")
        for tokens in X_set[stem_col]:
            # Skipping unknown tokens avoids a KeyError for words the model
            # dropped or never saw.
            known_vectors = [sg_w2v_model_wv[token] for token in tokens if token in sg_w2v_model_wv]
            if known_vectors:
                mean_vector = np.mean(known_vectors, axis=0).tolist()
                line = ",".join(str(value) for value in mean_vector)
            else:
                line = zero_line
            word2vec_file.write(line + "\n")
            
def save_word2vec_to_csv_y(Y_set, sentiment_col, word2vec_filename):
    """Write the sentiment labels to a one-column CSV file.

    The first line is always the header ``review_score`` (a fixed string,
    whatever ``sentiment_col`` is called); one label per row follows.

    Parameters
    ----------
    Y_set : pandas.DataFrame
        Frame containing the label column.
    sentiment_col : str
        Column holding the labels.
    word2vec_filename : str
        Destination path; an existing file is overwritten.
    """
    with open(word2vec_filename, 'w') as word2vec_file:
        # Written up front: the header must not depend on a row happening to
        # carry the index label 0.
        word2vec_file.write("review_score\n")
        # iterrows() is kept so each value is formatted exactly as before.
        for _, row in Y_set.iterrows():
            word2vec_file.write(str(row[sentiment_col]) + "\n")

# Label files go into the run-specific output folder, one file per split.
train_Y_word2vec_filename = OUTPUT_FOLDER +  f"train_Y_{file_name_code}.csv"
test_Y_word2vec_filename = OUTPUT_FOLDER + f"test_Y_{file_name_code}.csv"

# Write the sentiment labels of the train and test splits to disk.
save_word2vec_to_csv_y(Y_train, 'review_score', train_Y_word2vec_filename)
save_word2vec_to_csv_y(Y_test, 'review_score', test_Y_word2vec_filename)

# train_X_word2vec_filename = OUTPUT_FOLDER +  f"train_X_{file_name_code}.csv"
# test_X_word2vec_filename = OUTPUT_FOLDER + f"test_X_{file_name_code}.csv"
# 
# 
# save_word2vec_to_csv(X_train, w2v_model_wv, "stemmed_text", train_X_word2vec_filename)
# save_word2vec_to_csv(X_test, w2v_model_wv, "stemmed_text", test_X_word2vec_filename)

# save_word2vec_to_csv(X_train, w2v_model_cbow_wv, "stemmed_text", train_X_word2vec_cbow_filename)
# save_word2vec_to_csv(X_test, w2v_model_cbow_wv, "stemmed_text", test_X_word2vec_cbow_filename)
