In [1]:
import pandas as pd
import re
import nltk
import pickle

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag


In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Toshiba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Toshiba\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Toshiba\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Toshiba\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
df = pd.read_csv("../dataset/ROBLOX_REVIEWS.csv")

df = df[['review_text']]
df.dropna(inplace=True)

df.head()


Unnamed: 0,review_text
0,Because to lag still
1,Best game ever
2,It's a good game and i want dough in blox fruit
3,Is the best game i ever had
4,Wow this game is very nice ☺️ my grand daughte...


In [4]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['clean_text'] = df['review_text'].apply(clean_text)


In [5]:
df['tokens'] = df['clean_text'].apply(word_tokenize)
df[['clean_text', 'tokens']].head()


Unnamed: 0,clean_text,tokens
0,because to lag still,"[because, to, lag, still]"
1,best game ever,"[best, game, ever]"
2,its a good game and i want dough in blox fruit,"[its, a, good, game, and, i, want, dough, in, ..."
3,is the best game i ever had,"[is, the, best, game, i, ever, had]"
4,wow this game is very nice my grand daughter e...,"[wow, this, game, is, very, nice, my, grand, d..."


In [12]:
stop_words = set(stopwords.words('english'))

custom_stopwords = {
    'app', 'game', 'games',
    'play', 'playing', 'played',
    'please', 'pls', 'cant', 'dont',
    'im', 'ive', 'sooo'
}

stop_words = stop_words.union(custom_stopwords)

df['tokens'] = df['tokens'].apply(
    lambda tokens: [t for t in tokens if t not in stop_words and len(t) > 2]
)


In [7]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Toshiba\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [8]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


In [9]:
lemmatizer = WordNetLemmatizer()

def lemmatize_with_pos(tokens):
    pos_tags = pos_tag(tokens)
    return [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in pos_tags
    ]

df['tokens'] = df['tokens'].apply(lemmatize_with_pos)


In [10]:
df['clean_review'] = df['tokens']
df[['review_text', 'clean_review']].head()


Unnamed: 0,review_text,clean_review
0,Because to lag still,"[lag, still]"
1,Best game ever,"[best, ever]"
2,It's a good game and i want dough in blox fruit,"[good, want, dough, blox, fruit]"
3,Is the best game i ever had,"[best, ever]"
4,Wow this game is very nice ☺️ my grand daughte...,"[wow, nice, grand, daughter, enjoy, take, robux]"


In [11]:
with open("../dataset/roblox_clean_text.pkl", "wb") as f:
    pickle.dump(df['clean_review'].tolist(), f)
