In [1]:
import data_cleaning as dc
import review_score_analysis as rs
import text_analysis as ml

cuML: Installed accelerator for sklearn.
cuML: Successfully initialized accelerator.


In [2]:
# Load dataframes
# Both JSON files are read through dc.load with the same chunk_size argument
# (presumably the number of records read per chunk — confirm in data_cleaning.load).
chunk_size = 100_000
# NOTE(review): restaurants_df is not referenced by any later cell visible in this notebook.
restaurants_df = dc.load("data/filtered_restaurants.json", chunk_size)
reviews_df = dc.load("data/filtered_reviews.json", chunk_size)
# Derived from reviews_df; the training cells and the 3-class test cell read from this frame.
avg_scores_df = rs.calculate_average_review_score(reviews_df)

In [None]:
# Sandbox code
# Class-balance check: count of rows for each value of the 'stars' column in avg_scores_df.
avg_scores_df['stars'].value_counts()


stars
5    1955603
4    1058020
3     503164
1     484827
2     369634
Name: count, dtype: int64

In [3]:
# Train and save large 2 label classifier
# The four values returned by ml.train_binary_model are handed straight to ml.save_model
# under the name "binary"; the evaluation cell later reloads them with ml.load_model("binary").
# NOTE(review): 50_000 is presumably the number of training samples — confirm in text_analysis.
train_X, train_y, binary_tfidf, binary_classifier = ml.train_binary_model(avg_scores_df, 50_000)
ml.save_model(train_X, train_y, binary_tfidf, binary_classifier, "binary")

SVC with the linear kernel can be much faster using the specialized solver provided by LinearSVC. Consider switching to LinearSVC if tranining takes too long.


In [5]:
# Load trained model
# Unpacked in the same order as the 3-class load (train_X, train_y, tfidf, classifier).
# train_y must be bound here: the kernel cross-validation cell reads train_X / train_y,
# and on a fresh kernel that skips the training cell nothing else defines it.
train_X, train_y, binary_tfidf, binary_classifier = ml.load_model("binary")

# Evaluate classifier with test data (using data points unused in training)
# NOTE(review): test data is drawn from reviews_df here, whereas training and the 3-class
# test cell use avg_scores_df — confirm this difference is intended.
test_X, test_y = ml.create_binary_test_data(reviews_df, 5_000, binary_tfidf)
# Print the baseline accuracy first, then the classifier's accuracy on the same test set.
ml.benchmark(test_X, test_y)
ml.evaluate_classifier(binary_classifier, test_X, test_y)

Benchmark accuracy for our model to beat: 0.6950609878024395
Model Accuracy: 0.9000199960007998


In [5]:
# Cross validate kernel for binary classification
# Reads train_X and train_y left in the kernel by the earlier binary-model cells.
# The recorded output shows per-split accuracy for the linear, rbf, poly and sigmoid kernels
# over four splits; the helper's return value is reported as the winning kernel.
best_kernel = ml.binary_kernel_cross_validation(train_X, train_y)

print(f'the {best_kernel} kernel gives the highest accuracy')

Accuracy of linear kernel on split 0: 0.8976440942362306
Accuracy of linear kernel on split 1: 0.89516
Accuracy of linear kernel on split 2: 0.89684
Accuracy of linear kernel on split 3: 0.89336
Accuracy of rbf kernel on split 0: 0.8984840606375745
Accuracy of rbf kernel on split 1: 0.89776
Accuracy of rbf kernel on split 2: 0.90084
Accuracy of rbf kernel on split 3: 0.8974
Accuracy of poly kernel on split 0: 0.8250069997200112
Accuracy of poly kernel on split 1: 0.81712
Accuracy of poly kernel on split 2: 0.82616
Accuracy of poly kernel on split 3: 0.82536
Accuracy of sigmoid kernel on split 0: 0.8974441022359105
Accuracy of sigmoid kernel on split 1: 0.89464
Accuracy of sigmoid kernel on split 2: 0.89636
Accuracy of sigmoid kernel on split 3: 0.89344
the rbf kernel gives the highest accuracy


In [6]:
# Train and save 3-label classifier
# GPU acceleration doesn't work on multiclass SVM, so we use fewer datapoints
# NOTE(review): the size argument here (50_000) is the same value passed to
# ml.train_binary_model — confirm whether it was meant to be reduced.
# This rebinds train_X / train_y, replacing the binary-model values held in the kernel.
train_X, train_y, three_tfidf, three_classifier = ml.train_3_class_model(avg_scores_df, 50_000)
ml.save_model(train_X, train_y, three_tfidf, three_classifier, "3_class")

In [7]:
# Benchmark 3-way classifier
# Reload the artifacts saved under "3_class"; this rebinds train_X / train_y for the
# cross-validation cell that follows.
train_X, train_y, three_tfidf, three_classifier = ml.load_model("3_class")

# Evaluate classifier with test data
# The loaded three_tfidf is passed to the test-data builder (presumably so the test text is
# vectorized with the already-fitted vocabulary — confirm in text_analysis).
test_X, test_y = ml.create_multiclass_test_data(avg_scores_df, 5_000, three_tfidf)
# Print the baseline accuracy first, then the classifier's accuracy on the same test set.
ml.benchmark(test_X, test_y)
ml.evaluate_classifier(three_classifier, test_X, test_y)

Benchmark accuracy for our model to beat: 0.41831633673265345
Model Accuracy: 0.7034593081383723


In [None]:
# Cross validate kernel for 3-class classification
# Reads the train_X / train_y bound by the 3-class cells above.
# NOTE(review): the recorded output prints "Model Accuracy: ..." followed by
# "Accuracy of linear kernel on split 0: None" — it looks like the helper formats a value
# that is None (accuracy printed rather than returned?); check ml.three_way_cross_validation.
best_kernel = ml.three_way_cross_validation(train_X, train_y)

print(f'the {best_kernel} kernel gives the highest accuracy')

Model Accuracy: 0.7136229101671866
Accuracy of linear kernel on split 0: None
