# LAB 9

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

PRACTICE TASK

In [2]:


X = ['offer secret', 'click secret link', 'secret sports link', 'play sports today', 'went play sports', 'secret sports event', 'sports today', 'sports costs money']
Y = [1,1,1,0,0,0,0,0] # review labels. 1 indicate spam, 0 non-spam

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool. 
vectorizer = CountVectorizer(analyzer = "word", \
tokenizer = None, \
preprocessor = None, \
stop_words = None, \
max_features = 100) 
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
X = vectorizer.fit_transform(X)

# Numpy arrays are easy to work with, so convert the result to an 
# array
X = X.toarray()

print(X.shape)
print(vectorizer.vocabulary_)



(8, 11)
{'offer': 5, 'secret': 7, 'click': 0, 'link': 3, 'sports': 8, 'play': 6, 'today': 9, 'went': 10, 'event': 2, 'costs': 1, 'money': 4}


In [3]:
clf = MultinomialNB(alpha=0.000001) # alpha=0 means no laplace smoothing
clf.fit(X, np.array(Y))

test_reviews = ['sports', 'secret secret', 'today secret']
# bag of word representation
tX = vectorizer.transform(test_reviews).toarray()
# prediction
print(clf.predict(tX))

print(clf.predict_proba(tX) )

[0 1 0]
[[8.26446251e-01 1.73553749e-01]
 [5.70208051e-02 9.42979195e-01]
 [9.99997244e-01 2.75623594e-06]]


# LAB TASK

In [11]:
train = pd.read_csv('labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)

def review_to_words(raw_review):
    #1. Remove HTML
    review_bs_obj = BeautifulSoup(raw_review)
    review = review_bs_obj.get_text()
    #2. Remove non letters
    review = re.sub('[^A-Za-z]+',' ', review)
    #3. Convert to lowercase and split it into words
    review_words = review.lower().split()
    #4. Remove stops words
    stops = set(stopwords.words('english'))
    review_words = [w for w in review_words if not w in stops] 
    #5. Joint back and return the joined sentence
    review_sentence = " ".join(review_words)
    return review_sentence

for row in range(len(train)):
    train.at[row, 'review'] = review_to_words(train.at[row, 'review'])
    if(row%1000 == 0):
        print(row)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000


In [16]:
vocabulary_sizes = [3000, 5000]
alpha_values = [0.00001, 5]

train_size = 20000
X_train = train['review'][:train_size]
Y_train = train['sentiment'][:train_size]
X_test = train['review'][train_size:]
Y_test = train['sentiment'][train_size:]

training_X = vectorizer.fit_transform(X_train).toarray()
testing_X = vectorizer.fit_transform(X_test).toarray()



In [18]:
for vocabulary_size in vocabulary_sizes:
    for alpha_value in alpha_values:
        
        print("Vocabulary size: ", vocabulary_size, ", Alpha: ", alpha_value)
        
        vectorizer = CountVectorizer(analyzer = "word", \
        tokenizer = None, \
        preprocessor = None, \
        stop_words = None, \
        max_features = vocabulary_size)

        clf = MultinomialNB(alpha=alpha_value) # alpha=0 means no laplace smoothing
        clf.fit(training_X, Y_train)

        predictions_train_data = clf.predict(training_X)
        compare_train = predictions_train_data == Y_train
        print("Training accuracy: ", compare_train.mean())
        
        predictions_test_data = clf.predict(testing_X)
        compare_test = predictions_test_data == Y_test
        print("Testing accuracy: ", compare_test.mean(), "\n")


Vocabulary size:  3000 , Alpha:  1e-05
Training accuracy:  0.86405
Testing accuracy:  0.5302 

Vocabulary size:  3000 , Alpha:  5
Training accuracy:  0.8636
Testing accuracy:  0.5276 

Vocabulary size:  5000 , Alpha:  1e-05
Training accuracy:  0.86405
Testing accuracy:  0.5302 

Vocabulary size:  5000 , Alpha:  5
Training accuracy:  0.8636
Testing accuracy:  0.5276 

