In [1]:
from ipynb.fs.full.data_loader import load_data
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sys import maxsize

In [2]:
# Load the dataset with load_data(), the helper imported from the
# data_loader notebook in the imports cell.
df = load_data()

In [3]:
# Keep only the rows that actually carry a "starRating" value: the rating is
# the prediction target further down, so rows without it cannot be used.
df_filtered = df.loc[df["starRating"].notna()]

In [None]:
# 5-fold cross-validation of a bag-of-words random forest that predicts the
# integer star rating from the "bodyText" column. The per-fold test accuracy
# is collected in `accuracies`.
accuracies = []

kf = KFold(n_splits=5, shuffle=True, random_state=1731)
for fold, (train_indices, test_indices) in enumerate(kf.split(df_filtered), start=1):
    df_train = df_filtered.iloc[train_indices, :]
    df_test = df_filtered.iloc[test_indices, :]

    # The vocabulary is fitted on the training fold only and merely applied
    # to the held-out fold, so no test-fold text influences the features.
    vectorizer = CountVectorizer(stop_words=None, ngram_range=(1, 1), min_df=1)
    X_train = vectorizer.fit_transform(df_train["bodyText"])
    X_test = vectorizer.transform(df_test["bodyText"])

    y_train = df_train["starRating"].astype(int)
    y_test = df_test["starRating"].astype(int)

    # verbose=0 keeps the forest quiet. A huge verbosity level makes
    # scikit-learn/joblib emit a message for every tree and task during both
    # fit and predict, which buries the notebook in log lines; a single
    # summary line per fold is printed at the end of the loop body instead.
    classifier = RandomForestClassifier(
        n_estimators=100,
        criterion="gini",
        n_jobs=-1,
        random_state=1731,
        verbose=0,
        class_weight=None,
    )
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"Fold {fold}/{kf.get_n_splits()}: accuracy = {accuracy:.4f}")

In [22]:
# Show the per-fold accuracy scores appended by the random forest
# cross-validation loop.
accuracies

[0.4997238222993766,
 0.4993292827270575,
 0.48903093434343436,
 0.5056029040404041,
 0.4895833333333333]

In [4]:
# 5-fold cross-validation of a bag-of-words MLP classifier that predicts the
# integer star rating from the "bodyText" column. The same KFold seed as the
# random forest cell is used, and one accuracy per fold is collected in
# `mlp_accuracies`.
kf = KFold(n_splits=5, shuffle=True, random_state=1731)
mlp_accuracies = []

for train_indices, test_indices in kf.split(df_filtered):
    # Positional row selection for this fold.
    df_train = df_filtered.iloc[train_indices]
    df_test = df_filtered.iloc[test_indices]

    # Targets: star ratings cast to int.
    y_train = df_train["starRating"].astype(int)
    y_test = df_test["starRating"].astype(int)

    # Features: unigram counts; the vocabulary is learned from the training
    # fold and only applied to the held-out fold.
    vectorizer = CountVectorizer(
        stop_words=None,
        ngram_range=(1, 1),
        min_df=1,
    )
    X_train = vectorizer.fit_transform(df_train["bodyText"])
    X_test = vectorizer.transform(df_test["bodyText"])

    # fit() returns the estimator itself, so prediction can be chained.
    classifier = MLPClassifier(random_state=1731, verbose=True)
    y_pred = classifier.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    mlp_accuracies.append(accuracy)

Iteration 1, loss = 1.08057698
Iteration 2, loss = 0.55313937
Iteration 3, loss = 0.27400285
Iteration 4, loss = 0.13132005
Iteration 5, loss = 0.06009351
Iteration 6, loss = 0.03443100
Iteration 7, loss = 0.02449263
Iteration 8, loss = 0.02498031
Iteration 9, loss = 0.02757077
Iteration 10, loss = 0.02541079
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
Iteration 1, loss = 1.06117998
Iteration 2, loss = 0.56193173
Iteration 3, loss = 0.28279077
Iteration 4, loss = 0.12037070
Iteration 5, loss = 0.04991012
Iteration 6, loss = 0.02983878
Iteration 7, loss = 0.02346298
Iteration 8, loss = 0.02203106
Iteration 9, loss = 0.02347802
Iteration 10, loss = 0.02170192
Iteration 11, loss = 0.02545230
Iteration 12, loss = 0.03853238
Iteration 13, loss = 0.03648917
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
Iteration 1, loss = 1.05590665
Iteration 2, loss = 0.54832545
Iteration 3, loss = 0.25931748
Iterati

In [5]:
# Show the per-fold accuracy scores appended by the MLP cross-validation loop.
mlp_accuracies

[0.5432020831689418,
 0.5455693206028565,
 0.5272253787878788,
 0.5403251262626263,
 0.5294349747474747]