In [76]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting regex>=2021.8.3
  Downloading regex-2022.9.13-cp39-cp39-macosx_10_9_x86_64.whl (293 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.9/293.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: tqdm, regex, nltk
Successfully installed nltk-3.7 regex-2022.9.13 tqdm-4.64.1


In [1]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate

In [2]:
df = pd.read_csv('../data/clean_data.csv')
df.head(3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,user,rating,comment,ID,name
0,4108284,4108284,PegasusGamesNYC,2.0,Played it once and didn't like it,9209,Ticket to Ride
1,9412682,9412682,captaincomic,4.0,Several things to dislike: Player elimination;...,24068,Shadow Hunters
2,3448962,3448962,jblomquist,4.0,Seems like an ok game but it doesn't really wo...,150376,Dead of Winter: A Crossroads Game


In [3]:
df.shape

(100000, 7)

In [4]:
df = df[['rating','comment']]
df.head()

Unnamed: 0,rating,comment
0,2.0,Played it once and didn't like it
1,4.0,Several things to dislike: Player elimination;...
2,4.0,Seems like an ok game but it doesn't really wo...
3,3.0,Flux and Munchkin mixed but worse. Way way way...
4,3.0,The game open the player to be sieged by group...


In [5]:
df.isnull().sum()

rating     0
comment    0
dtype: int64

No nulls

In [6]:
df = df.drop_duplicates()

In [7]:
duplicate_rows = df[df.duplicated()]
duplicate_rows

Unnamed: 0,rating,comment


In [8]:
df.isnull().sum()

rating     0
comment    0
dtype: int64

In [9]:
df.shape

(97885, 2)

In [10]:
df['rating'].value_counts()

4.0     24409
8.0     13559
3.0     12968
7.0     11863
6.0     10485
2.0      7353
9.0      5286
5.0      4354
1.0      4114
10.0     3492
0.0         2
Name: rating, dtype: int64

# Text preprocesing

In [11]:
# remove chars

def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

# Apply to the DF series
df['comment_pp'] = df['comment'].apply(remove_punctuations)


In [12]:
stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = text.rstrip()
    text = text.lstrip()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    lemmatized = [lemmatizer.lemmatize(word, pos = 'n') for word in text] # Lemmatize
    lemmatized = [lemmatizer.lemmatize(word, pos = 'v') for word in lemmatized] # Lemmatize
    lemmatized = [lemmatizer.lemmatize(word, pos = 'a') for word in lemmatized] # Lemmatize
    lemmatized = [lemmatizer.lemmatize(word, pos = 'r') for word in lemmatized] # Lemmatize
    words_only = [word for word in lemmatized if word.isalpha()]
    text = [word for word in words_only if not word in stop_words]
    text = " ".join(text)
    return text


In [13]:
df['comment_pp'] = df['comment_pp'].apply(lambda x: clean_text(x))

In [14]:
df

Unnamed: 0,rating,comment,comment_pp
0,2.0,Played it once and didn't like it,play didnt like
1,4.0,Several things to dislike: Player elimination;...,several thing dislike player elimination even ...
2,4.0,Seems like an ok game but it doesn't really wo...,seem like ok game doesnt really work group dyn...
3,3.0,Flux and Munchkin mixed but worse. Way way way...,flux munchkin mix bad way way way long
4,3.0,The game open the player to be sieged by group...,game open player sieged group ease king maker
...,...,...,...
99994,5.0,"At its heart, this is a Scrabble variant. I li...",heart scrabble variant like scrabble however p...
99995,5.0,as you get a little more familiar with it you ...,get little familiar realize make game good nee...
99996,8.0,"This is a fun, fast, and clever game. It feel...",fun fast clever game feel like tricktaking gam...
99998,10.0,"Nice one :) First off, i bought it to play sol...",nice one first buy play solitaire wife like ga...


In [15]:
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True
    
df_eng = df[df['comment_pp'].apply(isEnglish)]

In [16]:
df_eng

Unnamed: 0,rating,comment,comment_pp
0,2.0,Played it once and didn't like it,play didnt like
1,4.0,Several things to dislike: Player elimination;...,several thing dislike player elimination even ...
2,4.0,Seems like an ok game but it doesn't really wo...,seem like ok game doesnt really work group dyn...
3,3.0,Flux and Munchkin mixed but worse. Way way way...,flux munchkin mix bad way way way long
4,3.0,The game open the player to be sieged by group...,game open player sieged group ease king maker
...,...,...,...
99994,5.0,"At its heart, this is a Scrabble variant. I li...",heart scrabble variant like scrabble however p...
99995,5.0,as you get a little more familiar with it you ...,get little familiar realize make game good nee...
99996,8.0,"This is a fun, fast, and clever game. It feel...",fun fast clever game feel like tricktaking gam...
99998,10.0,"Nice one :) First off, i bought it to play sol...",nice one first buy play solitaire wife like ga...


In [17]:
labels1 = ['Bad','Average','High']
df_eng['y'] = pd.qcut(df['rating'],q=3, labels=labels1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eng['y'] = pd.qcut(df['rating'],q=3, labels=labels1)


In [18]:
df_eng

Unnamed: 0,rating,comment,comment_pp,y
0,2.0,Played it once and didn't like it,play didnt like,Bad
1,4.0,Several things to dislike: Player elimination;...,several thing dislike player elimination even ...,Bad
2,4.0,Seems like an ok game but it doesn't really wo...,seem like ok game doesnt really work group dyn...,Bad
3,3.0,Flux and Munchkin mixed but worse. Way way way...,flux munchkin mix bad way way way long,Bad
4,3.0,The game open the player to be sieged by group...,game open player sieged group ease king maker,Bad
...,...,...,...,...
99994,5.0,"At its heart, this is a Scrabble variant. I li...",heart scrabble variant like scrabble however p...,Average
99995,5.0,as you get a little more familiar with it you ...,get little familiar realize make game good nee...,Average
99996,8.0,"This is a fun, fast, and clever game. It feel...",fun fast clever game feel like tricktaking gam...,High
99998,10.0,"Nice one :) First off, i bought it to play sol...",nice one first buy play solitaire wife like ga...,High


In [19]:
df_eng.rating.value_counts()/len(df)

4.0     0.243091
8.0     0.133085
3.0     0.128763
7.0     0.116943
6.0     0.103468
2.0     0.073402
9.0     0.051887
5.0     0.043429
1.0     0.041079
10.0    0.034500
0.0     0.000020
Name: rating, dtype: float64

# Model

In [20]:
y = df_eng['y']
X = df_eng['comment_pp']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [22]:
X_train.size

75932

In [23]:
X_train

52819    really like game love bunch dice choose determ...
11071    blah euro one play end three way tie tie break...
95957                            game hate player let know
36155                            silly keep around coaster
2126     throw dice hope best someone ruin work random ...
                               ...                        
22347    meh awful mechanical one game go much like las...
77027                        fun party game play nongamers
52557    blind bid tile lay area majority single game s...
5359     read bgg definition see defy description game ...
81674    fantastic mystery solve game doe require right...
Name: comment_pp, Length: 75932, dtype: object

In [24]:
from sklearn.pipeline import make_pipeline

# Pipeline vectorizer + Naive Bayes
pipeline_naive_bayes = make_pipeline(TfidfVectorizer(), 
                                     MultinomialNB())

# Cross-validation
cv_results = cross_validate(pipeline_naive_bayes, X_train, y_train, cv = 5, scoring = ["accuracy"])
average_accuracy = cv_results["test_accuracy"].mean()
np.round(average_accuracy,2)

0.54

In [25]:
from sklearn.model_selection import GridSearchCV

# Define the grid of parameters
parameters = {
    'tfidfvectorizer__ngram_range': ((1,1), (2,2)),
    'multinomialnb__alpha': (0.25,0.05,0.1),}

# Perform Grid Search
grid_search = GridSearchCV(pipeline_naive_bayes,parameters, scoring = "accuracy",
                           cv = 5, n_jobs=-1, verbose=1)

grid_search.fit(X_train,y_train)

# Best score
print(f"Best Score = {grid_search.best_score_}")

# Best params
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Score = 0.5926882808222921
Best params = {'multinomialnb__alpha': 0.05, 'tfidfvectorizer__ngram_range': (1, 1)}


In [26]:
# vectorize
vectorizer = TfidfVectorizer(ngram_range=(1,1))

In [27]:
train_vectors = pd.DataFrame(vectorizer.fit_transform(X_train).toarray(),
                 columns = vectorizer.get_feature_names_out())

In [28]:
train_vectors

Unnamed: 0,aa,aaaaa,aaaaaaaaaaaaaaaaarrrrrrrrrgggggggggggggghhhhhhhhhhhhhhhhhhhhhh,aaaaaaaaaaaarrrrrrrrrrrrggggggggggggggghhhhhhhhhhhhhhhh,aaaaaaah,aaaaaaand,aaaaaages,aaaaaagggghhhhhhhh,aaaaaall,aaaaaand,...,zzzzz,zzzzzz,zzzzzzz,zzzzzzzz,zzzzzzzzz,zzzzzzzzzz,zzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75930,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# transform the test daya
test_vectors = vectorizer.transform(X_test)

In [None]:
model = MultinomialNB(alpha=0.05)
model.fit(train_vectors, y_train)

In [None]:
cv_results = cross_validate(model, train_vectors, y_train, cv = 5, scoring = ["accuracy"])
cv_results['test_accuracy'].mean()