In [25]:
import pandas as pd 
import numpy as np 
import os 
import sklearn 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [12]:
# Load the review sentences (features) and their labels from the data folder.
data_dir = 'data_reviews'
x_train_path = os.path.join(data_dir, 'x_train.csv')
y_train_path = os.path.join(data_dir, 'y_train.csv')
x_train_df = pd.read_csv(x_train_path)
y_train_df = pd.read_csv(y_train_path)

# Row / column counts of the feature frame.
N = x_train_df.shape[0]
n_cols = x_train_df.shape[1]

# Raw sentences as a plain Python list, taken from the 'text' column.
tr_text_list = x_train_df['text'].tolist()


In [13]:
# Bag-of-words featurizer: raw term counts (binary=False); max_df=0.4 is a
# document-frequency proportion used to drop very common terms, min_df=1
# keeps every term that appears at least once.
bow_preprocessor = CountVectorizer(min_df=1, max_df=0.4, binary=False)

# Logistic-regression classifier. C is the inverse regularization strength;
# NOTE(review): this specific value presumably came from a hyperparameter
# search that is not part of this notebook -- confirm.
lr_model = LogisticRegression(C=3.5111917342151275)

# Chain featurizer and classifier so they are fit and applied together.
pipeline = Pipeline(
    steps=[
        ('my_bow_feature_extractor', bow_preprocessor),
        ('my_classifier', lr_model),
    ]
)

In [16]:
# Convert the label frame to a flat 1-D array. The target length is taken
# from the frame itself rather than a hard-coded row count, so the cell keeps
# working when the CSV size changes. As before, the reshape raises if the
# frame has more than one column (element count would not match).
y_array = y_train_df.to_numpy().reshape((len(y_train_df),))

# Sentences as a NumPy array so train_test_split returns arrays that can be
# indexed positionally alongside the labels.
x_array = np.asarray(tr_text_list)

['Oh and I forgot to also mention the weird color effect it has on your phone.'
 "THAT one didn't work either." 'Waste of 13 bucks.' ...
 'Ambience is perfect.'
 'We ordered the duck rare and it was pink and tender on the inside with a nice char on the outside.'
 'Service was good and the company was better!']


In [46]:
# Hold out 20% of the labelled sentences for validation; the fixed
# random_state keeps the split reproducible across runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_array,
    y_array,
    test_size=0.2,
    random_state=10,
)

["Save your money.... I've had this item for 11 months now."
 'I especially liked the non-cliche choices with the parents; in other movies, I could predict the dialog verbatim, but the writing in this movie made better selections.  '
 'Worked perfectly!' ...
 "Plus, with the movie's rather modest budget and fast running time, it does an amazing job!  "
 'Excellent product, I am very satisfied with the purchase.'
 'It was clear that she had the range and ability to pull off this part.  ']


In [43]:
# Fit the bag-of-words + classifier pipeline on the training split.
pipeline.fit(x_train, y_train)

# Learned term -> column-index mapping of the fitted vectorizer.
# NOTE(review): this prints the whole vocabulary and can be very long.
print(bow_preprocessor.vocabulary_)

# Hard class predictions and per-class probabilities on the held-out split.
yhat_valid = pipeline.predict(x_valid)
yhat_probas = pipeline.predict_proba(x_valid)

# Ensure the labels are 1-D without hard-coding the validation-set size, so
# the cell keeps working if the dataset or test_size changes.
y_valid = y_valid.reshape(-1)

# ROC AUC scored with the probability in column 1 of predict_proba
# (the later cells treat label 1 as the positive class).
print(roc_auc_score(y_valid, yhat_probas[:,1]))

0.8993230920314254


In [23]:
# List every misclassified validation sentence:
#   FN = true label 1 predicted as 0, FP = true label 0 predicted as 1.
for sentence, truth, guess in zip(x_valid, y_valid, yhat_valid):
    if guess == 0 and truth == 1:
        print("FN: " + sentence)
    elif guess == 1 and truth == 0:
        print("FP:"+ sentence)

FP:I give it 2 thumbs down
FP:It's a shame to see good actors like Thomerson and James make a living in a mess like this.  
FP:I can't see how this movie can be an inspiration to anyone to come out or overcome fear and rejection.  
FN: The acting, as you'd expect from this cast, is top notch.  
FN: * Comes with a strong light that you can use to light up your camera shots, and even flash SOS signals (seriously!
FN: While you don't yet hear Mickey speak, there are tons of sound effects and music throughout the film--something we take for granted now but which was a huge crowd pleaser in 1928.  
FN: I gave it 5 stars then, and I'm giving it 5 stars now.
FP:But even the talented Carrell can't save this.  
FP:Excellent starter wireless headset.
FP:For a product that costs as much as this one does, I expect it to work far better and with greater ease than this thing does.
FN: If you're not familiar, check it out.
FP:Can't store anything but phone numbers to SIM.
FP:I have two more years lef

In [32]:
# List every validation sentence the model predicted as 1, tagged by whether
# the true label agrees (TP) or is 0 (FP).
for sentence, truth, guess in zip(x_valid, y_valid, yhat_valid):
    if guess != 1:
        continue  # only predicted-1 rows are reported in this cell
    if truth == 1:
        print("TP: " + sentence)
    elif truth == 0:
        print("FP:"+ sentence)

TP: The pancake was also really good and pretty large at that.
FP:I give it 2 thumbs down
TP: This is an extraordinary film.  
TP: You truly take this journey through the eyes and soul of a child.  
TP: However, after seeing the short again after about 25 years, I was amazed at how timeless the film actually is.  
FP:It's a shame to see good actors like Thomerson and James make a living in a mess like this.  
FP:I can't see how this movie can be an inspiration to anyone to come out or overcome fear and rejection.  
TP: It was delicious!!!
TP: One of my favorite purchases ever.
TP: The cow tongue and cheek tacos are amazing.
TP: fast service.
TP: Product is exactly as described.
TP: The best phone in market :).
TP: Ample portions and good prices.
TP: They really want to make your experience a good one.
TP: I am very happy
TP: The story unfolds in 18th century Jutland and the use of period music played on period instruments is just one more fine touch.  
TP: Vivian Schilling did an excel

In [33]:
# List every validation sentence the model predicted as 0, tagged by whether
# the true label agrees (TN) or is 1 (FN).
for sentence, truth, guess in zip(x_valid, y_valid, yhat_valid):
    if guess != 0:
        continue  # only predicted-0 rows are reported in this cell
    if truth == 0:
        print("TN: " + sentence)
    elif truth == 1:
        print("FN:"+ sentence)

TN: My girlfriend's veal was very bad.
TN: I also decided not to send it back because our waitress looked like she was on the verge of having a heart attack.
TN: But the acting--even that of such professionals as Drago and Debbie Rochon--was terrible, the directing worse (perhaps contributory to the former), the dialog chimp-like, and the camera work, barely tolerable.  
TN: Too politically correct.  
TN: Not good for the money.
FN:The acting, as you'd expect from this cast, is top notch.  
TN: The charger arrived within the promised timeframe, but it did not work.
TN: Worst ever.
TN: I found this product to be waaay too big.
TN: There was absolutely no warmth or charm to these scenes or characters.  
TN: I cannot believe that the actors agreed to do this "film".  
TN: My father has the V265, and the battery is dying.
FN:* Comes with a strong light that you can use to light up your camera shots, and even flash SOS signals (seriously!
FN:While you don't yet hear Mickey speak, there are 

In [45]:
# Show the training-split sentences (NumPy abbreviates the array with "..."
# in the captured output below).
print(x_train)

["Save your money.... I've had this item for 11 months now."
 'I especially liked the non-cliche choices with the parents; in other movies, I could predict the dialog verbatim, but the writing in this movie made better selections.  '
 'Worked perfectly!' ...
 "Plus, with the movie's rather modest budget and fast running time, it does an amazing job!  "
 'Excellent product, I am very satisfied with the purchase.'
 'It was clear that she had the range and ability to pull off this part.  ']


In [None]:
# Dump the training sentences to x_train.txt, one per line (newline-separated,
# no trailing newline).
#
# A `with` block replaces the manual try/finally close: previously, if open()
# itself failed, `file` was never bound and the `finally` clause raised a
# NameError that hid the real error. The text is written through the Python
# file object with an explicit UTF-8 encoding instead of ndarray.tofile, which
# (per the NumPy docs) bypasses the file object's write method.
try:
    with open('x_train.txt', 'w', encoding='utf-8') as out_file:
        out_file.write("\n".join(x_train.tolist()))
except Exception as e:
    # Best-effort export: report the problem without stopping the notebook.
    print("error: " + str(e))