In [22]:
import numpy
import urllib
import scipy.optimize
import random
from math import *

def parseData(fname):
  for l in urllib.urlopen(fname):
    yield eval(l)

print "Reading data..."
data = list(parseData("file:beer_50000.json"))
print "done"

Reading data...
done


In [35]:
def feature(datum):
  text = datum['review/text'].lower().replace(',',' ').replace('?',' ')\
          .replace('!',' ').replace(':',' ').replace('"',' ').replace('.',' ')\
          .replace('(',' ').replace(')',' ').split()
  num_lactic = 0
  num_tart = 0
  num_sour = 0
  num_citric = 0
  num_sweet = 0
  num_acid = 0
  num_hop = 0
  num_fruit = 0
  num_salt = 0
  num_spicy = 0
  for word in text:
    if word == 'lactic':  num_lactic += 1
    if word == 'tart':  num_tart += 1
    if word == 'sour':  num_sour += 1
    if word == 'citric':  num_citric += 1
    if word == 'sweet':  num_sweet += 1
    if word == 'acid':  num_acid += 1
    if word == 'hop':  num_hop += 1
    if word == 'fruit':  num_fruit += 1
    if word == 'salt':  num_salt += 1
    if word == 'spicy':  num_spicy += 1
        
  feat = [1, num_lactic, num_tart, num_sour, \
         num_citric, num_sweet, num_acid, num_hop, \
         num_fruit, num_salt, num_spicy] 
  return feat

X = [feature(d) for d in data]
y = [d['beer/ABV'] >= 6.5 for d in data]

def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  res = 1.0 / (1 + exp(-x))
  return res

In [36]:
length = int(len(data)/3)

X_train = X[:length]
y_train = y[:length]

X_validation = X[length:2*length]
y_validation = y[length:2*length]

X_test = X[2*length:]
y_test = y[2*length:]

In [37]:
# Count for number of total data, y=0 and y=1
num_total = len(y_train)
num_y0 = y_train.count(0)
num_y1 = y_train.count(1)

In [38]:
# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  loglikelihood = 0
  for i in range(len(X)):
    logit = inner(X[i], theta)
    if y[i]:
      loglikelihood -= log(1 + exp(-logit)) * num_total / (2 * num_y1)
    if not y[i]:
      loglikelihood -= (log(1 + exp(-logit)) + logit ) * num_total / (2 * num_y0)
  for k in range(len(theta)):
    loglikelihood -= lam * theta[k]*theta[k]
  # for debugging
  # print("ll =" + str(loglikelihood))
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
  dl = [0]*len(theta)
  for i in range(len(X)):
    logit = inner(X[i], theta)
    for k in range(len(theta)):
      if y[i]:
        dl[k] += X[i][k] * (1 - sigmoid(logit)) * num_total / (2 * num_y1)
      if not y[i]:
        dl[k] -= X[i][k] * (1 - sigmoid(logit)) * num_total / (2 * num_y0)
  for k in range(len(theta)):
    dl[k] -= lam*2*theta[k]
  return numpy.array([-x for x in dl])

def train(lam):
  theta,_,_ = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, pgtol = 10, args = (X_train, y_train, lam))
  return theta

lam = 1.0
theta = train(lam)
print theta

[-0.11710314  0.00228235  0.01154569 -0.02494431 -0.00546974  0.3692712
  0.00291828 -0.04271636  0.17149857 -0.0011052   0.00371526]


In [39]:
X_data = [X_train, X_validation, X_test]
y_data = [y_train, y_validation, y_test]
symbol = ['train', 'valid', 'test']
print '位\tDataset\t\tTruePositive\tFalsePositive\tTrueNegative\tFalseNegative\tAccuracy\tBER'
for i in range(3):
    def TP(theta):
      scores = [inner(theta,x) for x in X_data[i]]
      predictions = [s > 0 for s in scores]
      correct = [((a==1) and (b==1)) for (a,b) in zip(predictions,y_data[i])]
      tp = sum(correct) * 1.0
      return tp

    def TN(theta):
      scores = [inner(theta,x) for x in X_data[i]]
      predictions = [s > 0 for s in scores]
      correct = [((a==0) and (b==0)) for (a,b) in zip(predictions,y_data[i])]
      tn = sum(correct) * 1.0
      return tn

    def FP(theta):
      scores = [inner(theta,x) for x in X_data[i]]
      predictions = [s > 0 for s in scores]
      correct = [((a==1) and (b==0)) for (a,b) in zip(predictions,y_data[i])]
      fp = sum(correct) * 1.0
      return fp

    def FN(theta):
      scores = [inner(theta,x) for x in X_data[i]]
      predictions = [s > 0 for s in scores]
      correct = [((a==0) and (b==1)) for (a,b) in zip(predictions,y_data[i])]
      fn = sum(correct) * 1.0
      return fn

    tp = TP(theta)
    fp = FP(theta)
    tn = TN(theta)
    fn = FN(theta)
    TPR = tp / (tp + fn)
    TNR = tn / (tn + fp)
    BER = 1 - 0.5 * (TPR + TNR)
    accuracy = (tp+tn)/(tp+tn+fp+fn)
    print str(lam)+'\t'+symbol[i]+'\t\t'+str(tp)+'\t\t'+str(fp)+'\t\t'+str(tn)+'\t\t'+str(fn)+'\t\t'+str(accuracy)+'\t'+str(BER)

位	Dataset		TruePositive	FalsePositive	TrueNegative	FalseNegative	Accuracy	BER
1.0	train		4380.0		2482.0		4836.0		4968.0		0.552982119285	0.435307141797
1.0	valid		7290.0		210.0		548.0		8618.0		0.470298811952	0.409392429955
1.0	test		2755.0		3817.0		6938.0		3158.0		0.581533477322	0.444491075971


In [40]:
# Original Algorithm
# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  loglikelihood = 0
  for i in range(len(X)):
    logit = inner(X[i], theta)
    loglikelihood -= log(1 + exp(-logit))
    if not y[i]:
      loglikelihood -= logit
  for k in range(len(theta)):
    loglikelihood -= lam * theta[k]*theta[k]
  # for debugging
  # print("ll =" + str(loglikelihood))
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
  dl = [0]*len(theta)
  for i in range(len(X)):
    logit = inner(X[i], theta)
    for k in range(len(theta)):
      dl[k] += X[i][k] * (1 - sigmoid(logit))
      if not y[i]:
        dl[k] -= X[i][k]
  for k in range(len(theta)):
    dl[k] -= lam*2*theta[k]
  return numpy.array([-x for x in dl])

def train(lam):
  theta,_,_ = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, pgtol = 10, args = (X_train, y_train, lam))
  return theta

lam = 1.0
theta = train(lam)

X_data = [X_train, X_validation, X_test]
y_data = [y_train, y_validation, y_test]
symbol = ['train', 'valid', 'test']
print '位\tDataset\t\tTruePositive\tFalsePositive\tTrueNegative\tFalseNegative\tAccuracy\tBER'
for i in range(3):
    def TP(theta):
      scores = [inner(theta,x) for x in X_data[i]]
      predictions = [s > 0 for s in scores]
      correct = [((a==1) and (b==1)) for (a,b) in zip(predictions,y_data[i])]
      tp = sum(correct) * 1.0
      return tp

    def TN(theta):
      scores = [inner(theta,x) for x in X_data[i]]
      predictions = [s > 0 for s in scores]
      correct = [((a==0) and (b==0)) for (a,b) in zip(predictions,y_data[i])]
      tn = sum(correct) * 1.0
      return tn

    def FP(theta):
      scores = [inner(theta,x) for x in X_data[i]]
      predictions = [s > 0 for s in scores]
      correct = [((a==1) and (b==0)) for (a,b) in zip(predictions,y_data[i])]
      fp = sum(correct) * 1.0
      return fp

    def FN(theta):
      scores = [inner(theta,x) for x in X_data[i]]
      predictions = [s > 0 for s in scores]
      correct = [((a==0) and (b==1)) for (a,b) in zip(predictions,y_data[i])]
      fn = sum(correct) * 1.0
      return fn

    tp = TP(theta)
    fp = FP(theta)
    tn = TN(theta)
    fn = FN(theta)
    TPR = tp / (tp + fn)
    TNR = tn / (tn + fp)
    BER = 1 - 0.5 * (TPR + TNR)
    accuracy = (tp+tn)/(tp+tn+fp+fn)
    print str(lam)+'\t'+symbol[i]+'\t\t'+str(tp)+'\t\t'+str(fp)+'\t\t'+str(tn)+'\t\t'+str(fn)+'\t\t'+str(accuracy)+'\t'+str(BER)

位	Dataset		TruePositive	FalsePositive	TrueNegative	FalseNegative	Accuracy	BER
1.0	train		9177.0		7128.0		190.0		171.0		0.562042481699	0.496164652477
1.0	valid		15746.0		744.0		14.0		162.0		0.945637825513	0.495856949226
1.0	test		5831.0		10545.0		210.0		82.0		0.36243100552	0.497170973537
