In [1]:
import pandas as pd
# Load the review dataset; the relative path is resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer="game_review.csv")

In [2]:
# Pull out the "comment" column as the model input and the "sentiment" column as the label.
features, target = (data[column] for column in ("comment", "sentiment"))

In [20]:
# Sweep over the maximum n-gram length (unigrams up to 5-grams) and min_df (4 to 6)
# for a TF-IDF + Multinomial Naive Bayes classifier, printing the accuracy on both
# splits and the vocabulary size for every combination.
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# A fixed random_state keeps the train/test split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=3000)

# NOTE(review): the configurations below are compared on the test split, so the
# best reported test accuracy is an optimistic estimate. Consider a separate
# validation split or cross-validation for choosing ngram_range / min_df.
for max_ngram in range(1, 6):
    for min_df in range(4, 7):

        # Build the vocabulary from the training data only and encode the
        # training documents in the same pass (avoids tokenizing X_train twice).
        vect = TfidfVectorizer(min_df=min_df, ngram_range=(1, max_ngram))
        X_train_vectorized = vect.fit_transform(X_train)

        # Encode the test documents with the vocabulary learned on the training set.
        X_test_vectorized = vect.transform(X_test)

        # Train the classifier on the TF-IDF features.
        model = MultinomialNB(alpha=0.5).fit(X=X_train_vectorized, y=y_train)

        print('Includes phrases length of:', list(range(1, max_ngram + 1)))
        print('Min_df:', min_df)
        print("Classification accuracy on training set: ", model.score(X_train_vectorized, y_train))
        print("Classification accuracy on testing set: ", model.score(X_test_vectorized, y_test))
        # vocabulary_ maps each kept term to its column index, so its size is the
        # feature count. get_feature_names() was removed in scikit-learn 1.2.
        print("Number of features used: ", len(vect.vocabulary_), '\n')

Includes phrases length of: [1]
Min_df: 4
Classification accuracy on training set:  0.8811618489037844
Classification accuracy on testing set:  0.8
Number of features used:  10198
Includes phrases length of: [1]
Min_df: 5
Classification accuracy on training set:  0.875366568914956
Classification accuracy on testing set:  0.802303664921466
Number of features used:  8369
Includes phrases length of: [1]
Min_df: 6
Classification accuracy on training set:  0.8699204021784667
Classification accuracy on testing set:  0.806282722513089
Number of features used:  7212
Includes phrases length of: [1, 2]
Min_df: 4
Classification accuracy on training set:  0.9252897639994414
Classification accuracy on testing set:  0.829738219895288
Number of features used:  36069
Includes phrases length of: [1, 2]
Min_df: 5
Classification accuracy on training set:  0.9162826420890937
Classification accuracy on testing set:  0.8272251308900523
Number of features used:  28068
Includes phrases length of: [1, 2]
Min_d

In [None]:
'''
The combination that yielded the best result used unigrams, bigrams, and trigrams with
a minimum document frequency (min_df) of 5:

Includes phrases length of: [1, 2, 3]
Min_df: 5
Classification accuracy on training set:  0.920262533165759
Classification accuracy on testing set:  0.8312041884816754
Number of features used:  37412

As the maximum n-gram length increased, there was more overfitting: the model performed far better
on the training set than on the testing set.

Slightly increasing min_df had only a very small effect on the training set accuracy.
'''