### Loading Dataset

In [1]:
import pandas as pd
import numpy as np
import os
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [2]:
data_dir = 'data_reviews'
x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

N, n_cols = x_train_df.shape
print("Shape of x_train_df: (%d, %d)" % (N,n_cols))
print("Shape of y_train_df: %s" % str(y_train_df.shape))

# Print out the first five rows and last five rows
tr_text_list = x_train_df['text'].values.tolist()
rows = np.arange(0, 5)
for row_id in rows:
    text = tr_text_list[row_id]
    print("row %5d | y = %d | %s" % (row_id, y_train_df.values[row_id,0], text))

print("...")
rows = np.arange(N - 5, N)
for row_id in rows:
    text = tr_text_list[row_id]
    print("row %5d | y = %d | %s" % (row_id, y_train_df.values[row_id,0], text))

Shape of x_train_df: (2400, 2)
Shape of y_train_df: (2400, 1)
row     0 | y = 0 | Oh and I forgot to also mention the weird color effect it has on your phone.
row     1 | y = 0 | THAT one didn't work either.
row     2 | y = 0 | Waste of 13 bucks.
row     3 | y = 0 | Product is useless, since it does not have enough charging current to charge the 2 cellphones I was planning to use it with.
row     4 | y = 0 | None of the three sizes they sent with the headset would stay in my ears.
...
row  2395 | y = 1 | The sweet potato fries were very good and seasoned well.
row  2396 | y = 1 | I could eat their bruschetta all day it is devine.
row  2397 | y = 1 | Ambience is perfect.
row  2398 | y = 1 | We ordered the duck rare and it was pink and tender on the inside with a nice char on the outside.
row  2399 | y = 1 | Service was good and the company was better!


In [3]:
bow_preprocessor = CountVectorizer(binary=False)
lr_model = LogisticRegression()
pipeline = sklearn.pipeline.Pipeline([('my_bow_feature_extractor', bow_preprocessor), 
                                      ('my_classifier', lr_model)])

In [26]:
my_param_grid = dict()
my_param_grid['my_bow_feature_extractor__min_df'] = [1, 2, 3, 4]
my_param_grid['my_bow_feature_extractor__max_df'] = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
my_param_grid['my_classifier__C'] = np.logspace(-6, 6, 12)

In [27]:
grid_search = sklearn.model_selection.GridSearchCV(
    pipeline,
    my_param_grid,
    scoring='roc_auc',
    cv=5,
    refit=False)

In [28]:
y_train_array = y_train_df.to_numpy()
y_train_array = y_train_array.reshape((2400,))  #converting input y df to array
x_train_array = np.asarray(tr_text_list) #converting input x list to array

In [29]:
%%capture
grid_search.fit(x_train_array, y_train_array)

In [30]:
grid_search.best_score_

0.871875

In [31]:
grid_search.best_params_

{'my_bow_feature_extractor__max_df': 0.5,
 'my_bow_feature_extractor__min_df': 1,
 'my_classifier__C': 3.5111917342151275}

In [10]:
pipeline.set_params(my_bow_feature_extractor__max_df=0.4, my_bow_feature_extractor__min_df=1, my_classifier__C=3.5111917342151275)

In [11]:
pipeline.fit(x_train_array, y_train_array)
yhat_train = pipeline.predict(x_train_array)
yhat_train_probas = pipeline.predict_proba(x_train_array)


### Loading Testing Set for prediction

In [13]:
x_test_df = pd.read_csv(os.path.join(data_dir, 'x_test.csv'))
x_test_array = x_test_df['text'].to_numpy()

In [39]:
y_test = pipeline.predict(x_test_array)
y_test_probas = pipeline.predict_proba(x_test_array)
y_test_probas = np.asarray(y_test_probas)


In [40]:
#writing probas to a file
try:
    file = open('yproba1_test.txt', 'w')
    y_test_probas[:, 1].tofile(file, sep="\n")
except Exception as e: 
    print("error: " + str(e))
finally: 
    file.close()  

In [25]:
#Getting Vocabulary Siz
bow_vectorizer = pipeline.steps[0][1]


4509

#### Grid Search over C only

In [40]:
%%capture
bow_preprocessor_max_mindf= CountVectorizer(binary=False, max_df=0.4, min_df=1)
lr_model2 = LogisticRegression()
pipeline2 = sklearn.pipeline.Pipeline([('bow_feature_extractor', bow_preprocessor_max_mindf), ('classifier', lr_model2)])
my_param_grid2 = dict()
my_param_grid2['classifier__C'] = my_param_grid['my_classifier__C']

grid_search2 = sklearn.model_selection.GridSearchCV(
    pipeline2,
    my_param_grid2,
    scoring='roc_auc',
    cv = 5,
    refit=False,
    return_train_score=True
)

grid_search2.fit(x_train_array, y_train_array)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [41]:
grid_search2.cv_results_

{'mean_fit_time': array([0.02851381, 0.02559648, 0.03182087, 0.03899345, 0.05563397,
        0.08057051, 0.08562164, 0.09136195, 0.09021249, 0.09075012,
        0.11935306, 0.09866986]),
 'std_fit_time': array([0.00241748, 0.00025617, 0.00198993, 0.01185433, 0.01487249,
        0.01989871, 0.01866491, 0.00382586, 0.00302678, 0.00293882,
        0.02505023, 0.00806945]),
 'mean_score_time': array([0.00565677, 0.00501633, 0.00585661, 0.00715785, 0.0099772 ,
        0.0119514 , 0.00596972, 0.00620403, 0.00601177, 0.00654607,
        0.00756536, 0.00719037]),
 'std_score_time': array([0.00026887, 0.00040281, 0.00065932, 0.00253386, 0.00246961,
        0.00309644, 0.0004568 , 0.00067258, 0.00048203, 0.00115177,
        0.00195764, 0.00186498]),
 'param_classifier__C': masked_array(data=[1e-06, 1.2328467394420658e-05, 0.0001519911082952933,
                    0.001873817422860383, 0.02310129700083158,
                    0.2848035868435799, 3.5111917342151275,
                    43.2876128

In [None]:
# C indices: 6: Best, Underfitting 1, 4, Overfitting: 8, 11


In [7]:
# Creating separate dataframes for each website in case necessary
amazon_reviews_df = x_train_df[x_train_df['website_name'] == 'amazon']
yelp_reviews_df = x_train_df[x_train_df['website_name'] == 'yelp']
imdb_reviews_df = x_train_df[x_train_df['website_name'] == 'imdb']
