In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, confusion_matrix

In [3]:
# Load the reviews here so this cell works on a fresh kernel: the quantiles
# below need `review_df`, which the original notebook only created in a
# later cell (Restart & Run All raised NameError).
review_df = pd.read_csv('Data/reviews.csv')

# Quartile cut-offs of the review score, used as class boundaries by
# binary_classification (median) and finer_classification (all three).
q1, q2, q3 = review_df.score.quantile([0.25, 0.5, 0.75]).tolist()

def binary_classification(score, threshold=None):
    """Map a review score to a binary label.

    Parameters
    ----------
    score : number
        The review score to classify.
    threshold : number, optional
        Inclusive upper bound of the "bad" class. When omitted, the
        notebook-level median ``q2`` is used, which matches the original
        single-argument behaviour.

    Returns
    -------
    int
        0 ("bad") if ``score <= threshold``, otherwise 1 ("great").
    """
    if threshold is None:
        # Fall back to the global median computed from review_df.score.
        threshold = q2
    if score <= threshold:
        return 0 #bad
    else:
        return 1 #great
def finer_classification(score, cutoffs=None):
    """Map a review score to one of four ordered classes.

    Parameters
    ----------
    score : number
        The review score to classify.
    cutoffs : sequence of three numbers, optional
        Inclusive upper bounds ``(low, mid, high)`` of classes 0, 1 and 2.
        When omitted, the notebook-level quartiles ``(q1, q2, q3)`` are
        used, which matches the original single-argument behaviour.

    Returns
    -------
    int
        0 ("bad") if ``score <= low``, 1 ("poor") if ``score <= mid``,
        2 ("good") if ``score <= high``, otherwise 3 ("great").
    """
    if cutoffs is None:
        # Fall back to the global quartiles computed from review_df.score.
        cutoffs = (q1, q2, q3)
    low, mid, high = cutoffs
    if score <= low:
        return 0 #bad
    elif score <= mid:
        return 1 #poor
    elif score <= high:
        return 2 #good
    else:
        return 3 #great

In [4]:
# Read the review bodies and the review scores from their CSV exports.
# NOTE(review): the two files are assumed to be row-aligned (row i of
# content.csv belongs to row i of reviews.csv) -- confirm.
content_df = pd.read_csv('Data/content.csv')
review_df = pd.read_csv('Data/reviews.csv')

# Missing review bodies are replaced by a single space before extracting
# the raw array of documents.
text = content_df['content'].fillna(' ').values

# Binary target: one label per review score.
labels = review_df['score'].apply(binary_classification)

In [5]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the
# partition identical between runs.
text_train, text_test, y_train, y_test = train_test_split(
    text,
    labels,
    test_size=0.2,
    random_state=1000,
)

In [6]:
# Tokenize Review Content
# Bag-of-words counts: the vocabulary is learned from the training
# documents only, and the held-out documents are encoded with that same
# vocabulary.
tokenizer = CountVectorizer()
x_train = tokenizer.fit_transform(raw_documents=text_train)
x_test = tokenizer.transform(raw_documents=text_test)

In [None]:
import xgboost as xgb

# Gradient-boosted tree classifier on the bag-of-words features.
classifier = xgb.XGBClassifier(use_label_encoder=False)

# Fit on the sparse matrix directly rather than on x_train.toarray():
#  * the evaluation cell predicts on the sparse x_test, and XGBoost treats
#    absent entries of a sparse matrix as missing but zeros of a dense array
#    as real values, so training and prediction inputs must share a format;
#  * densifying the full document-term matrix needs n_docs * vocab_size
#    cells of memory for no benefit.
classifier.fit(x_train, y_train)

In [None]:
# Held-out log loss from the predicted class probabilities.
xgb_log_loss = log_loss(y_test, classifier.predict_proba(x_test))
print(f'\nxgb Log Loss: {xgb_log_loss}')

# Held-out confusion matrix from the hard class predictions.
xgb_confusion = confusion_matrix(y_test, classifier.predict(x_test))
print(f'\nxgb Confusion Matrix:\n {xgb_confusion}')

In [138]:
#Finer Case
# Build the 4-class (quartile) targets inside this cell. The labels split
# earlier in the notebook come from binary_classification, so reusing
# y_train / y_test here only produced a 4-class result when the kernel
# happened to hold stale state from a manual edit.
finer_labels = review_df.score.apply(finer_classification)

# train_test_split picks rows from the sample count, test_size and
# random_state only, so repeating the text split's settings yields targets
# that line up row-for-row with x_train / x_test.
y_train_finer, y_test_finer = train_test_split(
    finer_labels, test_size=0.2, random_state=1000
)
assert y_train_finer.shape[0] == x_train.shape[0]
assert y_test_finer.shape[0] == x_test.shape[0]

# Uniform-random baseline; seeded so the confusion matrix is reproducible.
classifier = DummyClassifier(strategy = 'uniform', random_state = 1000)
classifier.fit(x_train, y_train_finer)

print(f'\nRandom Log Loss: {log_loss(y_test_finer, classifier.predict_proba(x_test))}')
print(f'\nRandom Confusion Matrix:\n {confusion_matrix(y_test_finer, classifier.predict(x_test))}')

# Multiclass logistic regression on the raw token counts.
classifier = LogisticRegression(max_iter = 1000)
classifier.fit(x_train, y_train_finer)
print(f'\nLR Log Loss: {log_loss(y_test_finer, classifier.predict_proba(x_test))}')
print(f'\nLR Confusion Matrix:\n {confusion_matrix(y_test_finer, classifier.predict(x_test))}')


Random Log Loss: 1.3862943611198904

Random Confusion Matrix:
 [[220 233 242 243]
 [232 226 217 260]
 [223 227 243 233]
 [214 235 205 222]]

LR Log Loss: 2.2102926354774817

LR Confusion Matrix:
 [[538 217 109  74]
 [226 328 248 133]
 [106 269 341 210]
 [ 64 140 242 430]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [134]:
#Binary Case
# Uniform-random baseline on the binary targets. The seed makes the
# baseline's sampled predictions (and so its confusion matrix) repeatable;
# without it every run printed a different matrix.
classifier = DummyClassifier(strategy = 'uniform', random_state = 1000)
classifier.fit(x_train, y_train)

print(f'\nRandom Log Loss: {log_loss(y_test, classifier.predict_proba(x_test))}')
print(f'\nRandom Confusion Matrix:\n {confusion_matrix(y_test, classifier.predict(x_test))}')

# Logistic regression on the raw token counts.
classifier = LogisticRegression(max_iter = 1000)
classifier.fit(x_train, y_train)
print(f'\nLR Log Loss: {log_loss(y_test, classifier.predict_proba(x_test))}')
print(f'\nLR Confusion Matrix:\n {confusion_matrix(y_test, classifier.predict(x_test))}')


Random Log Loss: 0.6931471805599452

Random Confusion Matrix:
 [[960 913]
 [903 899]]

LR Log Loss: 0.993460480086278

LR Confusion Matrix:
 [[1335  538]
 [ 546 1256]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
