In [17]:
import numpy
import urllib
import scipy.optimize
import random
from math import exp
from math import log

def parseData(fname):
  """Lazily parse a file containing one Python-literal record per line.

  Args:
    fname: location passed straight to ``urllib.urlopen``
      (for example ``"file:beer_50000.json"``).

  Yields:
    The object obtained by evaluating each line, in file order.
  """
  stream = urllib.urlopen(fname)
  try:
    for l in stream:
      # SECURITY: eval() runs whatever code the line contains, so this is
      # only acceptable for trusted, local data files.
      # NOTE(review): if every line is a plain literal, ast.literal_eval
      # would be a safer drop-in -- TODO confirm against the data file.
      yield eval(l)
  finally:
    # Release the handle once the generator is exhausted, closed, or fails;
    # the original left it open.
    stream.close()

# Read every record into memory once; the later cells index and slice `data`.
print("Reading data...")
data = list(parseData("file:beer_50000.json"))
print("done")

Reading data...
done


In [44]:
def feature(datum):
  """Build the keyword-count feature vector for one review.

  The review text is lower-cased, a fixed set of punctuation marks is
  replaced by spaces, and the result is split on whitespace.

  Args:
    datum: mapping with a 'review/text' string entry.

  Returns:
    A list of 11 numbers: a constant 1 (offset term) followed by the number
    of exact-token occurrences of lactic, tart, sour, citric, sweet, acid,
    hop, fruit, salt and spicy, in that order.
  """
  separators = (',', '?', '!', ':', '"', '.', '(', ')')
  keywords = ('lactic', 'tart', 'sour', 'citric', 'sweet',
              'acid', 'hop', 'fruit', 'salt', 'spicy')

  cleaned = datum['review/text'].lower()
  for mark in separators:
    cleaned = cleaned.replace(mark, ' ')
  tokens = cleaned.split()

  # list.count compares with ==, so only whole-token matches are counted.
  return [1] + [tokens.count(term) for term in keywords]

# Feature matrix: one keyword-count vector per review.
X = list(feature(review) for review in data)
# Binary label: True when the beer's ABV is at least 6.5.
y = list(review['beer/ABV'] >= 6.5 for review in data)

def inner(x,y):
  """Return the dot product of x and y over the first len(x) positions.

  y is indexed by the positions of x, so a shorter y raises IndexError and
  any extra trailing entries of y are ignored.
  """
  total = 0
  for idx in range(len(x)):
    total += x[idx] * y[idx]
  return total

def sigmoid(x):
  """Return the logistic function 1 / (1 + e^-x) without overflowing.

  Args:
    x: a real number.

  Returns:
    A float in [0, 1].
  """
  if x >= 0:
    # exp(-x) <= 1 here, so it cannot overflow.
    return 1.0 / (1 + exp(-x))
  # For negative x, exp(-x) can exceed the float range (math range error),
  # so use the equivalent form e^x / (1 + e^x), where exp(x) <= 1.
  z = exp(x)
  return z / (1 + z)

In [45]:
# Show the raw text of one review (index 1293) as a spot check.
print(data[1293]['review/text'])

People drink for two main reasons: 1) to enjoy a high quality brew or 2) to get drunk. Which leaves me to question who would drink this crap? It would take a whole lot to get drunk off this super light beer. Also there is no real taste. Their ad says "the beer drinker's light beer." Well maybe that's true, if the beer drinker normally drinks Coors. Very expensive. Ripoff!


In [46]:
# Contiguous three-way split: the first third is for training, the second
# for validation, and everything that remains (including any remainder from
# the integer division) for testing.
length = len(data) // 3

X_train, y_train = X[:length], y[:length]
X_validation, y_validation = X[length:2*length], y[length:2*length]
X_test, y_test = X[2*length:], y[2*length:]

In [47]:
# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  """Return the L2-regularised negative log-likelihood of logistic regression.

  Args:
    theta: parameter vector, one weight per feature.
    X: sequence of feature vectors.
    y: sequence of truthy/falsy labels, aligned with X.
    lam: regularisation strength; lam * theta[k]**2 is added to the
      returned value for every k.

  Returns:
    The negated regularised log-likelihood (the value to minimise).
  """
  loglikelihood = 0
  for i in range(len(X)):
    logit = inner(X[i], theta)
    # log(1 + exp(-logit)) written as max(-logit, 0) + log(1 + exp(-|logit|)).
    # The two are mathematically equal, but exp() never receives a large
    # positive argument, so very negative logits no longer raise
    # OverflowError.
    loglikelihood -= max(-logit, 0) + log(1 + exp(-abs(logit)))
    if not y[i]:
      loglikelihood -= logit
  for k in range(len(theta)):
    loglikelihood -= lam * theta[k]*theta[k]
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
  """Return the gradient of `f` (the negated regularised log-likelihood).

  Args:
    theta: parameter vector, one weight per feature.
    X: sequence of feature vectors.
    y: sequence of truthy/falsy labels, aligned with X.
    lam: regularisation strength.

  Returns:
    A numpy array with one partial derivative per entry of theta.
  """
  dl = [0]*len(theta)
  for i in range(len(X)):
    logit = inner(X[i], theta)
    # The sigmoid depends only on the sample, not on the feature index, so
    # evaluate it once per sample instead of once per (sample, feature).
    residual = 1 - sigmoid(logit)
    for k in range(len(theta)):
      dl[k] += X[i][k] * residual
      if not y[i]:
        dl[k] -= X[i][k]
  for k in range(len(theta)):
    dl[k] -= lam*2*theta[k]
  # dl holds the gradient of the log-likelihood; negate it for the minimiser.
  return numpy.array([-x for x in dl])

def train(lam, pgtol=10):
  """Fit logistic-regression weights on the training split with L-BFGS-B.

  Minimises `f` using the gradient `fprime`, starting from an all-zero
  vector with one entry per feature, on the module-level X_train / y_train.

  Args:
    lam: regularisation strength forwarded to `f` and `fprime`.
    pgtol: projected-gradient stopping tolerance forwarded to
      scipy.optimize.fmin_l_bfgs_b. The default of 10 is the value that was
      previously hard-coded; pass a smaller value for a tighter fit.

  Returns:
    The parameter vector found by the optimiser.
  """
  theta,_,_ = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, pgtol = pgtol, args = (X_train, y_train, lam))
  return theta

# Train once with regularisation strength 1.0 and show the learned weights.
lam = 1.0
theta = train(lam)
print(theta)

[ 0.05661296  0.0126493   0.05153544 -0.09783307 -0.03142404  0.28601947
  0.02807208 -0.01155924  0.40824656 -0.00520071  0.0071504 ]


In [48]:
def TP(theta):
  """Count true positives on the module-level test split (X_test, y_test).

  A sample is predicted positive when inner(theta, x) > 0. The count is
  returned as a floating-point value.
  """
  predictions = [inner(theta, x) > 0 for x in X_test]
  hits = sum([(guess == 1) and (label == 1)
              for guess, label in zip(predictions, y_test)])
  return hits * 1.0

# True-positive count of the current model on the test split.
tp = TP(theta)
print("".join(["lambda = ", str(lam), ":\ttrue positive=", str(tp)]))

lambda = 1.0:	true positive=5831.0


In [49]:
def TN(theta):
  """Count true negatives on the module-level test split (X_test, y_test).

  A sample is predicted negative when inner(theta, x) > 0 is false. The
  count is returned as a floating-point value.
  """
  predictions = [inner(theta, x) > 0 for x in X_test]
  hits = sum([(guess == 0) and (label == 0)
              for guess, label in zip(predictions, y_test)])
  return hits * 1.0

# True-negative count of the current model on the test split.
tn = TN(theta)
print("".join(["lambda = ", str(lam), ":\ttrue negative=", str(tn)]))

lambda = 1.0:	true negative=210.0


In [50]:
def FP(theta):
  """Count false positives on the module-level test split (X_test, y_test).

  These are samples predicted positive (inner(theta, x) > 0) whose label is
  negative. The count is returned as a floating-point value.
  """
  predictions = [inner(theta, x) > 0 for x in X_test]
  hits = sum([(guess == 1) and (label == 0)
              for guess, label in zip(predictions, y_test)])
  return hits * 1.0

# False-positive count of the current model on the test split.
fp = FP(theta)
print("".join(["lambda = ", str(lam), ":\tfalse positive=", str(fp)]))

lambda = 1.0:	false positive=10545.0


In [51]:
def FN(theta):
  """Count false negatives on the module-level test split (X_test, y_test).

  These are samples predicted negative (inner(theta, x) > 0 is false) whose
  label is positive. The count is returned as a floating-point value.
  """
  predictions = [inner(theta, x) > 0 for x in X_test]
  hits = sum([(guess == 0) and (label == 1)
              for guess, label in zip(predictions, y_test)])
  return hits * 1.0

# False-negative count of the current model on the test split.
fn = FN(theta)
print("".join(["lambda = ", str(lam), ":\tfalse negative=", str(fn)]))

lambda = 1.0:	false negative=82.0


In [52]:
# Derive rate metrics from the four confusion counts computed above.
# Balanced error rate = 1 - mean(true-positive rate, true-negative rate).
TPR = tp / (tp + fn)
TNR = tn / (tn + fp)
BER = 1 - 0.5 * (TPR + TNR)

print('Test Data')
print('True Positives = ' + str(tp))
print('True Negatives = ' + str(tn))
print('False Positives = ' + str(fp))
print('False Negatives = ' + str(fn))
print('Accuracy = ' + str((tp+tn)/(tp+tn+fp+fn)))
print('BER = ' + str(BER))

Test Data
True Positives = 5831.0
True Negatives = 210.0
False Positives = 10545.0
False Negatives = 82.0
Accuracy = 0.36243100552
BER = 0.497170973537


In [53]:
X_data = [X_train, X_validation, X_test]
y_data = [y_train, y_validation, y_test]
symbol = ['train', 'valid', 'test']


def _confusion_counts(theta, X_split, y_split):
  """Return (tp, fp, tn, fn) for the linear classifier `theta` on one split.

  A sample is predicted positive when inner(theta, x) > 0. Each count is
  returned as a floating-point value. The split is passed in explicitly, so
  the result does not depend on any loop variable at call time, and the
  scores are computed once rather than once per metric.
  """
  predictions = [inner(theta, x) > 0 for x in X_split]
  pairs = list(zip(predictions, y_split))
  tp = sum([((a==1) and (b==1)) for (a,b) in pairs]) * 1.0
  fp = sum([((a==1) and (b==0)) for (a,b) in pairs]) * 1.0
  tn = sum([((a==0) and (b==0)) for (a,b) in pairs]) * 1.0
  fn = sum([((a==0) and (b==1)) for (a,b) in pairs]) * 1.0
  return tp, fp, tn, fn


print('λ\tDataset\t\tTruePositive\tFalsePositive\tTrueNegative\tFalseNegative\tAccuracy\tBER')

for lam in [1]:
    theta = train(lam)
    for i in range(3):
        # Only the held-out test split (index 2) is reported; the train and
        # validation entries stay in the tables above so they can be enabled
        # by relaxing this condition.
        if i == 2:
            tp, fp, tn, fn = _confusion_counts(theta, X_data[i], y_data[i])
            TPR = tp / (tp + fn)
            TNR = tn / (tn + fp)
            # Balanced error rate: one minus the mean of the two class-wise rates.
            BER = 1 - 0.5 * (TPR + TNR)
            accuracy = (tp+tn)/(tp+tn+fp+fn)
            print(str(lam)+'\t'+symbol[i]+'\t\t'+str(tp)+'\t\t'+str(fp)+'\t\t'+str(tn)+'\t\t'+str(fn)+'\t\t'+str(accuracy)+'\t'+str(BER))

λ	Dataset		TruePositive	FalsePositive	TrueNegative	FalseNegative	Accuracy	BER
1	test		5831.0		10545.0		210.0		82.0		0.36243100552	0.497170973537
