In [5]:
import json

f = open('labeled.json','r')
brv=[]

for line in f:
    af = json.loads(line)
    brv.append(af)

In [6]:
with open('vocab_50.json', 'r') as f:
    vocabulary = json.load(f)
    p = len(vocabulary.keys())

In [7]:
import pandas as pd

df = pd.DataFrame(brv)
indicator = (df['overall'] > 14).astype(int)
by_beer = df[['beer_name', 'overall']].groupby('beer_name')
by_brewer = df[['brewer', 'overall']].groupby('brewer')

## Data Inspection 

In [8]:
beer_mean = by_beer.mean()
beer_median = by_beer.median()
beer_sd = by_beer.std()


brewer_mean = by_brewer.mean()
brewer_median = by_brewer.median()
brewer_std = by_brewer.std()

In [9]:
import numpy as np
beer_mean.columns = ['Beer_mean']
beer_median.columns = ['Beer_median']
beer_sd.columns = ['Beer_sd']

brewer_mean.columns = ['brewer_mean']
brewer_median.columns = ['brewer_median']
brewer_std.columns = ['brewer_std']

beer = beer_mean.join([beer_median, beer_sd],how='outer')
beer = beer[np.isfinite(beer['Beer_sd'])]
brewer = brewer_mean.join([brewer_median, brewer_std], how='outer')
brewer = brewer[np.isfinite(brewer['brewer_std'])]

In [10]:
n = 10
brewer.head(n)
brewer.describe()

Unnamed: 0,brewer_mean,brewer_median,brewer_std
count,6958.0,6958.0,6958.0
mean,12.149545,12.331417,2.346387
std,1.995206,2.086554,0.909007
min,1.0,1.0,0.0
25%,11.333333,12.0,1.844266
50%,12.445528,13.0,2.301872
75%,13.386201,14.0,2.801796
max,19.25,19.5,9.899495


Thus, the standard deviation is generally low with the 50th percentile standard deviation at 2.34. Hence all people have somewhat similar taste when it comes breweries.

In [11]:
n = 10
beer.head(n)
beer.describe()

Unnamed: 0,Beer_mean,Beer_median,Beer_sd
count,72858.0,72858.0,72858.0
mean,12.648817,12.754262,1.9158
std,2.311643,2.387881,1.056971
min,1.0,1.0,0.0
25%,11.545455,12.0,1.258306
50%,13.0,13.0,1.81932
75%,14.2,14.0,2.445599
max,19.5,20.0,12.727922


Thus, the standard deviation is generally low with the 50th percentile standard deviation at 1.92. Hence all people have somewhat similar taste when it comes to beers.

## Sentiment Analysis

### (a) Generating Features

In [12]:
from scipy.sparse import csr_matrix
import string 
import re


def clean_and_split(s):
     # encode to UTF-8, convert to lowercase and translate all hyphens and
     # punctuation to whitespace
    s = s.encode('utf-8').lower().replace('-',' ').translate(None, string.punctuation)
     # replace \r\n
    s = re.sub('(\r\n)+',' ',s)
     # replace whitespace substrings with one whitespace and remove
     # leading/trailing whitespaces
    s = re.sub(' +',' ',s.strip())
    return s.split(' ')

In [13]:
i = 0
for word in vocabulary:
    vocabulary[word] = (vocabulary[word], i)
    i += 1

In [14]:
reviews = df['review'].map(clean_and_split)

In [15]:
n = 5
reviews.head(n)

0    [on, tap, at, the, springfield, pa, location, ...
1    [on, tap, at, the, john, harvards, in, springf...
2    [updated, feb, 19, 2003, springfield, pa, ive,...
3    [on, tap, the, springfield, pa, location, bill...
4    [springfield, pa, location, poured, an, opaque...
Name: review, dtype: object

In [16]:
def features(reviews, vocab):
    indptr = [0]
    indices = [] 
    data = []
    for idx, d in pd.Series.iteritems(reviews):
        for term in d:
            if term in vocab:
                index = vocab[term][1]
                indices.append(index)
                data.append(1)
        indptr.append(len(indices))
    print(len(data), len(indices), len(indptr))
    return csr_matrix((data, indices, indptr), dtype=int)

In [17]:
train_index = int(reviews.shape[0] * 0.7)
val_index = int(reviews.shape[0] * 0.85)

train_reviews = reviews[:train_index]
train_indicators = indicator[:train_index]

val_reviews = reviews[train_index+1:val_index]
val_indicators = indicator[train_index + 1:val_index]

test_reviews = reviews[val_index:]
test_indicators = indicator[val_index:]

In [18]:
val_features = features(val_reviews, vocabulary)

(10698811, 10698811, 372952)


In [19]:
train_features = features(train_reviews, vocabulary)

(49010443, 49010443, 1740444)


### (b) Logistic Regression using Newton's Method

In [None]:
from sklearn.linear_model import LogisticRegression
import time

lm = LogisticRegression(penalty='l2', C=0.001)
t0 = time.time()
lm.fit(train_features, train_indicators)
print("Training time: ", time.time() - t0)

### Linear SVC 

In [28]:
from sklearn.svm import LinearSVC

svc = LinearSVC(loss='hinge', C=0.01)
t0 = time.time()
svc.fit(train_features, train_indicators)
print("Training time: ", time.time() - t0)

('Training time: ', 137.30983901023865)


### Logistic Loss v Hinge Loss

In [29]:
from sklearn.metrics import accuracy_score

y_pred_lm = lm.predict(val_features)
acc_lm = accuracy_score(val_indicators, y_pred_lm, normalize=False) / float(val_indicators.size)
y_pred_svc = svc.predict(val_features)
acc_svc = accuracy_score(val_indicators, y_pred_svc, normalize=False) / float(val_indicators.size)

In [30]:
print("Logistic Regrssion Accuracy: %f" % acc_lm)

Logistic Regrssion Accuracy: 0.780030


In [31]:
print("LinearSVC Accuracy %f" % acc_svc)

LinearSVC Accuracy 0.789492


Varying C changes the accuracy of both models. I find the highest prediction accuracy when C is at 1, however this creates a tradeoff as the time taken increases. A higher C leads to a much higher model.

### (c) Stochastic Gradient Descent 

In [32]:
import numpy as np

def log_likelihood(features, target, weights):
    scores = np.dot(features, weights)
    ll = np.sum( target*scores - np.log(1 + np.exp(scores)) )
    return ll

def logit(scores):
    return 1 / (1 + (1 + np.expm1(-scores)))

def logistic_regression(features, target, num_steps, learning_rate):
        
    weights = np.zeros(features.shape[1])
    
    for step in xrange(num_steps):
        scores = np.dot(features, weights)
        predictions = logit(scores)

        output_error_signal = target - predictions
        gradient = np.dot(features.T, output_error_signal)
        weights += learning_rate * gradient
        
        if step % 10000 == 0:
            print log_likelihood(features, target, weights)
        
    return weights

In [33]:
weights = logistic_regression(val_features[:1000], val_indicators[:1000], num_steps=3000, learning_rate=5e-5)

In [38]:
score_features = df[['appearance', 'aroma', 'palate', 'style', 'taste']]
score_logit = LogisticRegression(penalty='l2', C=0.001)

train_index = int(reviews.shape[0] * 0.7)
val_index = int(reviews.shape[0] * 0.85)

In [40]:
val_scores = score_features[train_index+1:val_index]

score_logit.fit(val_features, val_indicators)
y_pred_all = score_logit.predict(val_scores)
acc_all = accuracy_score(val_indicators, y_pred_all, normalize=False) / float(val_indicators.size)
print("Logistic Regrssion Accuracy: %f" % acc_all)

KeyboardInterrupt: 