### 1. Import the necessary libraries.

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 200)

### 2. Load the Amazon, Yelp, and IMDb datasets.

In [2]:
%cd /Users/kaylanguyen/Documents/TheNLPWorkshop/Chapter8/

/Users/kaylanguyen/Documents/TheNLPWorkshop/Chapter8


In [3]:
IMDB_DATA_FILE = 'imdb_labelled.txt'
YELP_DATA_FILE = 'imdb_labelled.txt'
AMAZON_DATA_FILE = 'amazon_cells_labelled.txt'

In [4]:
COLUMN_NAMES = ['Review', 'Sentiment']

In [5]:
imdb_reviews = pd.read_table(IMDB_DATA_FILE, names=COLUMN_NAMES)
imdb_reviews.shape

(748, 2)

In [6]:
amazon_reviews = pd.read_table(AMAZON_DATA_FILE, names=COLUMN_NAMES)
amazon_reviews.shape

(1000, 2)

In [7]:
yelp_reviews = pd.read_table(YELP_DATA_FILE, names=COLUMN_NAMES)
yelp_reviews.shape

(748, 2)

### 3. Concatenate the datasets and take out a random sample of 10 items 

In [8]:
all_datasets = [imdb_reviews, amazon_reviews, yelp_reviews]
data = pd.concat(all_datasets)
data.shape

(2496, 2)

In [9]:
data.tail(20)

Unnamed: 0,Review,Sentiment
728,Judith Light is one of my favorite actresses and I think she does a superb job in this film!,1
729,I keep watching it over and over.,1
730,"It's a sad movie, but very good.",1
731,"If you have not seen this movie, I definitely recommend it!",1
732,"She is as lovely as usual, this cutie!",1
733,Still it's quite interesting and entertaining to follow.,1
734,;) Recommend with confidence!,1
735,This movie is well-balanced with comedy and drama and I thoroughly enjoyed myself.,1
736,It was a riot to see Hugo Weaving play a sex-obsessed gay real estate salesman who uses his clients' houses for his trysts with the flaming Darren (Tom Hollander).,1
737,":) Anyway, the plot flowed smoothly and the male-bonding scenes were a hoot.",1


In [10]:
data.sample(10)

Unnamed: 0,Review,Sentiment
33,I have two more years left in this contract and I hate this phone.,0
896,Great phone.,1
174,The file browser offers all the options that one needs.Handsfree is great.,1
22,"Overall, the film is interesting and thought-provoking.",1
195,Having to humour him just to get by and get through the day was so apt.,1
666,"I agree with Jessica, this movie is pretty bad.",0
154,It is shameful.,0
62,These headphones were a great find - and I think they are perhaps the best purchase I've made in the last several years - seriously.,1
414,After a year the battery went completely dead on my headset.,0
157,I must say I have taped most of the episodes and i find myself watching them over and over again.,1


In [11]:
data.Sentiment.value_counts()

1    1272
0    1224
Name: Sentiment, dtype: int64

### 4. Create a function for preprocessing the text, that is, convert the words into lowercase and normalize them.

In [12]:
import re

def clean(text):
    text = re.sub(r'[\W]+', ' ', text.lower())
    text = text.replace('hadn t' , 'had not')\
               .replace('wasn t', 'was not')\
               .replace('didn t', 'did not').replace('wouldn t', 'would not')
    return text

### 5. Apply the function created in the previous step on the dataset.

In [13]:
review_model_data = data.copy()
review_model_data.Review = review_model_data.Review.apply(clean)

In [14]:
review_model_data.sample(10)

Unnamed: 0,Review,Sentiment
737,anyway the plot flowed smoothly and the male bonding scenes were a hoot,1
731,if you have not seen this movie i definitely recommend it,1
590,sprint terrible customer service,0
217,i also did not like the on button it felt like it would crack with use,0
348,the bipolarity of the ruthless thug one minute a ruthless killer the next minute a luv s diaper commercial is completely unconvincing,0
452,nothing short of magnificent photography cinematography in this film,1
95,will order from them again,1
506,delete this film from your mind,0
43,too politically correct,0
715,the phone crashed completely and now i have to get it replaced,0


### 6. Use TfidfVectorizer to convert the review text into TFIDF vectors and use the LogisticRegression class to create a model that uses logistic regression for the model. These should be combined into Pipeline object.

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [16]:
tfidf = TfidfVectorizer()
log_reg = LogisticRegression()
log_tfidf = Pipeline([('vect', tfidf), 
                      ('clf', log_reg)])

### 7. Now split the data into train and test sets, using 70% to train the data and 30% to test the data.

In [17]:
X_train, X_test, y_train, y_test = train_test_split(review_model_data.Review,
                                                    review_model_data.Sentiment,
                                                    test_size=0.3,
                                                    random_state=42)

### 8. Use the fit() function to fit the training data on pipeline.

In [18]:
log_tfidf.fit(X_train.values, y_train.values)

Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', LogisticRegression())])

### 9. Print the accuracy score.

In [19]:
test_accuracy = log_tfidf.score(X_test.values, y_test.values)
'The model has a test accuracy of {:.0%}'.format(test_accuracy)

'The model has a test accuracy of 87%'

### 10. Test the model on these sentences: "I loved this place" and "I hated this place".

In [20]:
log_tfidf.predict(['I loved this place', 'I hated this place'])

array([1, 0])