In [117]:
import pickle
import re
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model  import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import warnings
warnings.filterwarnings("ignore")


# The Task: Beer Sentiment Analysis

Given long and detailed beer reviews, we want to predict if the reviewed ranked it as an bad, okay or good.


## Step 1: Preprocessing the data
To start off, we're going to load the data from some pickle files and do some simple preprocessing. We'll throw away non-alphanumeric characters and lowercase everything.

i.e
```"Best Beer ever!!!" -> "best beer ever"```

The sanity check the data, we'll look at a few examples.


In [118]:
train_path = "data/beer/overall_train.p"
dev_path   = "data/beer/overall_dev.p"
test_path  = "data/beer/overall_test.p"

train_set =  pickle.load(open(train_path, 'rb'))
dev_set =  pickle.load(open(dev_path, 'rb'))
test_set =  pickle.load(open(test_path, 'rb'))



def preprocess_data(data):
    for indx, sample in enumerate(data):
        text, label = sample['text'], sample['y']
        text = re.sub('\W+', ' ', text).lower().strip()
        data[indx] = text, label
    return data

train_set = preprocess_data(train_set)
dev_set = preprocess_data(dev_set)
test_set =  preprocess_data(test_set)

In [172]:
print("Num Train: {}".format(len(train_set)))
print("Num Dev: {}".format(len(dev_set)))
print("Num Test: {}".format(len(test_set)))
print("Example Tweets:")
print(train_set[0])
print()
print(train_set[1])


Num Train: 20000
Num Dev: 5000
Num Test: 5000
Example Tweets:
('a clear and very pale yellow color poured lt 1 finger fizzy white head very little lacing decanted continually s sweet fruity scent mixed with notes of corn and rice there is a light spicy hops finish but it s very light t strong malty start of sweet fruits such as apples there is a nice blend of corn that does n t bring a bad flavor there is a spicy flavor of hops and medicine the medicinal taste is n t bad at all but adds a little depth to this light adjunct beer m smooth and sweet at first then turns a bit spicy and fizzy finishes very clean crisp and quite refreshing d good better than most light lager', 2)

('ruby redbird served in new belgium globe glass a pours clear pale amber with a fizzy head that fizzles out immediately very good effervescence s smells like i cracked open a vernor s beyond the ginger ale i find no additional aromas t no beer in taste either grapefruit peels to ginger ale then medium bitterness a

## Step 2: Feature Engineering 

How do we represent a review? We're going to use a simple bag of words representation. Meaning we'll represent each review as a vector, and the whole set of reviews as a large matrix.

For example, consider our vocabulary is ```[best, ever, beer, cat, good, dog]```.
The bag of words representation for:
```"best beer ever"``` is ```[1, 1, 1, 0, 0, 0]```
Where one indicates that the vocab words did appear and 0 indicates the words that did not. S

In python, we can do this very easily with ```sklearn.feature_extraction.text.CountVectorizer```

<img src="vectorizer.png">


In [173]:
#Extract tweets and labels into 2 lists
trainText = [t[0] for t in train_set]
trainY = [t[1] for t in train_set]

devText = [t[0] for t in dev_set]
devY = [t[1] for t in dev_set]


testText = [t[0] for t in test_set]
testY = [t[1] for t in test_set]

# Set that word has to appear at least 5 times to be in vocab
min_df = 5
max_features = 1000
countVec = CountVectorizer(min_df = min_df, max_features = max_features )
# Learn vocabulary from train set
countVec.fit(trainText)

# Transform list of review to matrix of bag-of-word vectors
trainX = countVec.transform(trainText)
devX = countVec.transform(devText)
testX = countVec.transform(testText)

In [174]:
print("Shape of Train X {}\n".format(trainX.shape))
print("Sample of the vocab:\n {}".format(np.random.choice(countVec.get_feature_names(), 20)))


Shape of Train X (20000, 1000)

Sample of the vocab:
 ['enjoyed' 'good' 'fruit' 'care' 'use' 'feel' 'note' 'cherries' 'pepper'
 'everything' 'high' 'sits' 'enjoyed' 'pils' 'seems' 'beautiful' 'smokey'
 'while' 'do' 'found']


## Step 3: Pick a model and experiment

Here we'll explore various types of linear models, namely Logistic Regression, Passive Aggressive, and Perceptron. It's very straight-forward
to fit a new classifier and get preliminary results


In [145]:
lr = LogisticRegression()
passAgg    = PassiveAggressiveClassifier()
perceptron = Perceptron()

In [175]:
lr.fit(trainX, trainY)


print("Logistic Regression Train:", lr.score(trainX, trainY))
print("Logistic Regression Dev:", lr.score(devX, devY))
print("--")

Logistic Regression Train: 0.7215
Logistic Regression Dev: 0.6866
--


In [176]:
passAgg.fit(trainX, trainY) 
print("Passive Aggressive Train:", passAgg.score(trainX, trainY))
print("Passive Aggressive Dev:", passAgg.score(devX, devY))
print("--")

Passive Aggressive Train: 0.6755
Passive Aggressive Dev: 0.6294
--


In [177]:
perceptron.fit(trainX, trainY) 
print("Perceptron Train:", perceptron.score(trainX, trainY))
print("Perceptron Dev:", perceptron.score(devX, devY))
print("--")

Perceptron Train: 0.6175
Perceptron Dev: 0.5904
--


## Step 4: Analysis, Debugging the Model
To understand how to make the model better, it's important understand what the model is learning, and what it's getting wrong.

To do this, we can inspect the highest weighted features of our best LR model and look at some examples the model got wrong on the development set. 


In [178]:
lr = LogisticRegression()
lr.fit(trainX, trainY)
print("Logistic Regression Train:", lr.score(trainX, trainY))
print("Logistic Regression Dev:", lr.score(devX, devY))
print("--")

Logistic Regression Train: 0.73415
Logistic Regression Dev: 0.6782
--


In [150]:
print("Intepreting LR")
for label in range(3):
    coefs = lr.coef_[label]
    vocab = np.array(countVec.get_feature_names())
    num_features = 10

    top = np.argpartition(coefs, -num_features)[-num_features:]
    # Sort top
    top = top[np.argsort(coefs[top])]
    s_coef = coefs[top]
    scored_vocab = list(zip(vocab[top], s_coef))
    print("Top weighted features for label {}:\n \n {}\n -- \n".format(label, scored_vocab))

Intepreting LR
Top weighted features for label 0:
 
 [('poor', 0.6536027130614585), ('sips', 0.6823759395892621), ('rest', 0.7083852080854629), ('qualities', 0.7920728750592017), ('disappointed', 0.8111794793649244), ('clarity', 0.8535110675245677), ('terrible', 0.8921344193706889), ('unpleasant', 0.9025258573798056), ('awful', 1.4219141470015884), ('drain', 2.6212975191465695)]
 -- 

Top weighted features for label 1:
 
 [('chewy', 0.4059729562340314), ('warming', 0.41885905106964993), ('silky', 0.43718138460235706), ('heavy', 0.4395226317978089), ('viscous', 0.4571915991167313), ('warmth', 0.46488942889181367), ('oil', 0.4842823050826067), ('filling', 0.5002173093436909), ('sipping', 0.6180110468421688), ('sipper', 1.013004502555345)]
 -- 

Top weighted features for label 2:
 
 [('drinks', 0.6248636435439002), ('summer', 0.667920328401007), ('thirst', 0.753415689183526), ('settles', 0.7771403020805547), ('easily', 0.8312511645437842), ('session', 0.8588465533673536), ('refreshing', 0

In [109]:
## Find erronous dev errors
devPred = lr.predict(devX)
errors = []
for indx in range(len(devText)):
    if devPred[indx] != devY[indx]:
        error = "Review: \n {} \n Predicted: {} \n Correct: {} \n ---".format(
            devText[indx],
            devPred[indx],
            devY[indx])
        errors.append(error)

np.random.seed(1)
print("Random dev error: \n {} \n \n {} \n \n{}".format(
        np.random.choice(errors,1),
        np.random.choice(errors,1),
        np.random.choice(errors,1))
     )

Random dev error: 
 ['Review: \n poured from a growler into a tulip pint glass dry hopped with amarillo and centennial hops according to the menu at the stone company store a hazy light amber color with one finger of persistent white foam head moderate lacing remains on the glass s hops dominate but do not smother with floral and juicy citrus qualities malt is soft and toasty subordinate but not left out while carbonic acid enhances the citric side of the hops and also imparts a crisp edge t begins off dry with variable moderate to pungent hop aromatics of pineapple and savory cooking herbs moderate acidity is also present remaining within acceptable bounds and adding to early brightness the middle and finish are more malt focused yet the footprint of the grain is feather light and bready bitterness is subdued for the style lingering at the edges of the tongue and generally staying out of the way m medium viscosity slightly sticky on the palate with moderate carbonation d o freshness i

## Step 5: Play with regularization

We can see that LogisticRegression so far works the best so far, but it is greatly over fitting. Meaning that it does much better on train than development. A common strategy to dealing with this is adding an extra penalty for model complexity, like the square sum of the model weights. We call this idea regularization. 

In sklearn, it is very easy to test out various regularization amounts and tune the model. The smaller the parameter `C`, the stronger the regularization cost.

In [151]:
lr = LogisticRegression(C=.5)
lr.fit(trainX, trainY)


print("Logistic Regression Train:", lr.score(trainX, trainY))
print("Logistic Regression Dev:", lr.score(devX, devY))
print("--")

lr = LogisticRegression(C=.1)
lr.fit(trainX, trainY)


print("Logistic Regression Train:", lr.score(trainX, trainY))
print("Logistic Regression Dev:", lr.score(devX, devY))
print("--")

Logistic Regression Train: 0.73455
Logistic Regression Dev: 0.6784
--
Logistic Regression Train: 0.7331
Logistic Regression Dev: 0.679
--


In [152]:
lr = LogisticRegression(C=.01)
lr.fit(trainX, trainY)


print("Logistic Regression Train:", lr.score(trainX, trainY))
print("Logistic Regression Dev:", lr.score(devX, devY))
print("--")

Logistic Regression Train: 0.7215
Logistic Regression Dev: 0.6866
--


## Step 6: Adding in Ngrams

How does our model distinguish between the sentiment phrase that says:
```"great flavor and too bad there isn't more."```
versus
```"bad flavor and too great there isn't more."```

In our bag of words model, both have the same vector. In order to capture some of these ordering depency, we generalize the bag-of-words model to take "n-grams" of words that occur in the training set. a "bi-gram" is a pair of words, "tri-gram" triple, etc.

Let see how this imporves our model 


In [166]:
# Set that word has to appear at least 5 times to be in vocab
min_df = 5
ngram_range = (1,3)
max_features = 5000
countVecNgram = CountVectorizer(min_df = min_df, ngram_range = ngram_range, max_features=max_features)
# Learn vocabulary from train set
countVecNgram.fit(trainText)

# Transform list of review to matrix of bag-of-word vectors
trainXNgram = countVecNgram.transform(trainText)
devXNgram = countVecNgram.transform(devText)
testXNgram = countVecNgram.transform(testText)

In [167]:
lrNgram = LogisticRegression(C=1)
lrNgram.fit(trainXNgram, trainY)
print("Logistic Regression Train:", lrNgram.score(trainXNgram, trainY))
print("Logistic Regression Dev:", lrNgram.score(devXNgram, devY))
print("--")

lrNgram = LogisticRegression(C=.5)
lrNgram.fit(trainXNgram, trainY)
print("Logistic Regression Train:", lrNgram.score(trainXNgram, trainY))
print("Logistic Regression Dev:", lrNgram.score(devXNgram, devY))
print("--")

lrNgram = LogisticRegression(C=.1)
lrNgram.fit(trainXNgram, trainY)
print("Logistic Regression Train:", lrNgram.score(trainXNgram, trainY))
print("Logistic Regression Dev:", lrNgram.score(devXNgram, devY))
print("--")

lrNgram = LogisticRegression(C=.01)
lrNgram.fit(trainXNgram, trainY)
print("Logistic Regression Train:", lrNgram.score(trainXNgram, trainY))
print("Logistic Regression Dev:", lrNgram.score(devXNgram, devY))
print("--")

Logistic Regression Train: 0.88255
Logistic Regression Dev: 0.6534
--
Logistic Regression Train: 0.87305
Logistic Regression Dev: 0.6634
--
Logistic Regression Train: 0.8428
Logistic Regression Dev: 0.6864
--
Logistic Regression Train: 0.77645
Logistic Regression Dev: 0.7036
--


## Step 7: Take best model, and report results on Test

In [168]:
print("Logistic Regression Test:", lrNgram.score(testXNgram, testY))


Logistic Regression Test: 0.7088


## Next Steps: Movie Reviews