In [8]:
import numpy as np
import math
from urllib.request import urlopen
import scipy.optimize
from scipy.spatial import distance
import random
from collections import defaultdict
import nltk
import string
from sklearn import linear_model
from sklearn import metrics
from nltk.stem.porter import *

In [9]:
def parseData(fname):
  for l in open(fname):
    yield eval(l)

In [10]:
def parseDataFromURL(fname):
  for l in urlopen(fname):
    yield eval(l)

In [11]:
### Just the first 10000 reviews
print("Reading data...")
# http://cseweb.ucsd.edu/classes/fa19/cse258-a/data/beer_50000.json
data = list(parseData("/Users/t.z.cheng/Google_Drive/Coursework/CSE258/assignment/assignment1/train_Category.json"))[:10000]
print("done")

Reading data...
done


### Question 1

In [12]:
### How many unique unigrams and bigrams are there?
## Lower case and without punctuation
uniCount = defaultdict(int)
biCount = defaultdict(int)
punctuation = set(string.punctuation)

for d in data:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        uniCount[w] += 1
    for bi in list(nltk.bigrams(r.split())):
        biCount[bi] += 1

In [13]:
### Alternative way to do it 
totalWords = 0
biCount = defaultdict(int)

for d in data:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    ws = r.split()
#    ws = ws + [' '.join(x) for x in zip(ws[:-1],ws[1:])]
    ws = [' '.join(x) for x in zip(ws[:-1],ws[1:])]
    for w in ws:
        totalWords += 1
        biCount[w] += 1

In [14]:
bicounts = [(biCount[w], w) for w in biCount]
bicounts.sort()
bicounts.reverse()

In [15]:
print(len(bicounts),'bigrams')
print('Top five frequently-occuring bigrams',bicounts[:10])

256618 bigrams
Top five frequently-occuring bigrams [(4441, 'this game'), (4249, 'the game'), (3359, 'of the'), (2020, 'if you'), (2017, 'in the'), (1935, 'game is'), (1907, 'is a'), (1425, 'you can'), (1323, 'and the'), (1303, 'to the')]


### Question 2

In [16]:
nbigrams = 1000
bigrams = [x[1] for x in bicounts[:nbigrams]]

In [19]:
type(bigrams[1])

str

In [20]:
### Sentiment analysis
bigramId = dict(zip(bigrams, range(len(bigrams)))) # what is the rank of this word in the top 1000 words
bigramSet = set(bigrams)

def feature(datum):
    feat = [0]*len(bigrams) # create a one hot encoding for whether this word present or not
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    for bi in list(nltk.bigrams(r.split())):
        if bi in bigrams:
            feat[bigramId[bi]] += 1
    feat.append(1) #offset
    return feat

feat_bi = [feature(d) for d in data]
y = [math.log2(d['hours']+1) for d in data] ### Transform hours 

In [21]:
# Regression
# theta,residuals,rank,s = numpy.linalg.lstsq(X, y)

# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(feat_bi, y)
theta = clf.coef_
predictions = clf.predict(feat_bi)
MSE = metrics.mean_squared_error(y, predictions)
print('MSE:', MSE)

MSE: 5.2424790309481235


In [22]:
weights = list(zip(theta, bigrams + ['offset_feature']))
weights.sort()
print(weights[-10:])
print(weights[:10])

[(0.0, 'you were'), (0.0, 'you will'), (0.0, 'you wont'), (0.0, 'you would'), (0.0, 'your character'), (0.0, 'your own'), (0.0, 'your time'), (0.0, 'your way'), (0.0, 'youre a'), (3.5988594999820545, 'offset_feature')]
[(0.0, '1010 would'), (0.0, '2 is'), (0.0, 'a bad'), (0.0, 'a better'), (0.0, 'a big'), (0.0, 'a bit'), (0.0, 'a blast'), (0.0, 'a bunch'), (0.0, 'a classic'), (0.0, 'a couple')]


In [26]:
np.shape(theta)

(1001,)

In [27]:
np.shape(feat_bi)

(10000, 1001)

In [25]:
bigrams[:5] +['offset_feature']

['this game', 'the game', 'of the', 'if you', 'in the', 'offset_feature']

### Question 3

In [None]:
unicounts = [(uniCount[w], w) for w in uniCount]
unicounts.sort()
unicounts.reverse()

In [None]:
nWords = 1000
words = [x[1] for x in unicounts[:nWords]]

In [None]:
### Sentiment analysis
wordId = dict(zip(words, range(len(words)))) # what is the rank of this word in the top 1000 words
wordSet = set(words)

def feature(datum):
  feat = [0]*len(words)
  r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
  for w in r.split():
    if w in words:
      feat[wordId[w]] += 1
  feat.append(1) #offset
  return feat

In [None]:
feat_uni = [feature(w) for w in data]

In [None]:
### Combine unigram and bigram features by sum
feat_unibi = np.array(feat_uni)+np.array(feat_bi) 

In [None]:
# Regression
# theta,residuals,rank,s = numpy.linalg.lstsq(X, y)

# Regularized regression
clf.fit(feat_unibi, y)
theta = clf.coef_
predictions = clf.predict(feat_unibi)
MSE = metrics.mean_squared_error(y, predictions)
print('MSE:', MSE)

In [None]:
weights = list(zip(theta, words + ['offset_feature']))
weights.sort()
print(weights[-10:])
print(weights[:10])

### Question 4

In [None]:
word_list = ['destiny', 'annoying', 'likeable', 'chapter', 'interesting']
word_list.sort()
review_ID = 'r75487422'

In [None]:
### Find the index of the review_ID
idx = []
for d in np.arange(0,len(data)):
    if data[d]['reviewID'] == review_ID:
        idx = d

In [None]:
### Calculate IDF
docCount = defaultdict(int)
for d in data:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    tokens = nltk.word_tokenize(r)
    for w in word_list:
        if w in tokens:
            docCount[w] += 1

In [None]:
N = len(data)
IDF = [(w,math.log10(N / docCount[w])) for w in docCount]
print('IDF:', IDF)

In [None]:
### Calculate TF
wordCount = defaultdict(int)
r = ''.join([c for c in data[idx]['text'].lower() if not c in punctuation])
tokens = nltk.word_tokenize(r)
for w in tokens:
    if w in word_list:
        wordCount[w] += 1
TF = [(w,wordCount[w]) for w in wordCount]

In [None]:
TF.sort()
print(TF)

In [None]:
IDF.sort()
print(IDF)

In [None]:
TFIDF = []
for w in np.arange(0,len(word_list)):
    TFIDF.append(TF[w][1]*IDF[w][1])

In [None]:
zip_iterator = zip(word_list, TFIDF)
print('TFDTF:',dict(zip_iterator))

### Question 5

In [None]:
### Calculate IDF and TF
word_idx_map = {w:i for i,w in enumerate(words)} 
docCount = defaultdict(int)
TF = []
for d in data:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    tokens = nltk.word_tokenize(r)
    new_array = [0]*len(words)
    for w in words: ## IDF
        if w in tokens:
            docCount[w] += 1
        else: docCount[w] += 0
    for w1 in tokens: ## TF
        if w1 in words:
            new_array[word_idx_map[w1]] += 1 
    TF.append(new_array)

In [None]:
N = len(data)
IDF = [math.log10(N / docCount[w]) if docCount[w] != 0 else 0 for w in docCount ]
print('IDF:', IDF[:10])

In [None]:
TFIDF = []
for d in np.arange(0,N):
    tmpTF = TF[d]
    tmpTFIDF = []
    for w in np.arange(0,len(IDF)):
        tmpTFIDF.append(tmpTF[w] * IDF[w])
    tmpTFIDF.append(1)
    TFIDF.append(tmpTFIDF)

In [None]:
# Regression
# Regularized regression
clf.fit(TFIDF, y)
theta = clf.coef_
predictions = clf.predict(TFIDF)
MSE = metrics.mean_squared_error(y, predictions)
print('MSE:', MSE)

### Question 6

In [None]:
TFIDF = []
for d in np.arange(0,N):
    tmpTF = TF[d]
    tmpTFIDF = []
    for w in np.arange(0,len(IDF)):
        tmpTFIDF.append(tmpTF[w] * IDF[w])
    TFIDF.append(tmpTFIDF)

In [None]:
r,c = np.shape(TFIDF)

In [None]:
### Calculate cosine similarity for each observation  
cos_sim = []
for nr in np.arange(0,r):
    cos = 1 - distance.cosine(TFIDF[idx],TFIDF[nr])
    if np.isnan(cos): 
        cos = np.nan_to_num(cos)
    cos_sim.append(cos)

In [None]:
data[0]

In [None]:
### Find the top similarity
top_cos_sim = np.argsort(cos_sim)[-2] ## -1 is ID r75487422 itself
print('Index', top_cos_sim)
print('cosine similarity', cos_sim[top_cos_sim])
print('ReviewID:', data[top_cos_sim]['reviewID'])
print('Text:', data[top_cos_sim]['text'])

### Question 7

In [None]:
### Shuffle the data to create train, test, validation sets
train = data.copy()
test = data.copy()
validation = data.copy()
np.random.shuffle(train)
np.random.shuffle(test)
np.random.shuffle(validation)

In [None]:
### Regularization terms
l = [0.01, 0.1, 1, 10, 100]

In [None]:
### Unigrams vs. bigrams: unigram
wordId = dict(zip(words, range(len(words)))) # what is the rank of this word in the top 1000 words
wordSet = set(words)

def feature(datum):
  feat = [0]*len(words)
  r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
  for w in r.split():
    if w in words:
      feat[wordId[w]] += 1
  feat.append(1) #offset
  return feat
train_uni = [feature(w) for w in train]
test_uni = [feature(w) for w in test]
valid_uni = [feature(w) for w in validation]

In [None]:
### Unigrams vs. bigrams: bigram
bigramId = dict(zip(bigrams, range(len(bigrams)))) # what is the rank of this word in the top 1000 words
bigramSet = set(bigrams)

def feature(datum):
    feat = [0]*len(bigrams) # create a one hot encoding for whether this word present or not
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    for bi in list(nltk.bigrams(r.split())):
        if bi in bigrams:
            feat[bigramId[bi]] += 1
    feat.append(1) #offset
    return feat
train_bi = [feature(w) for w in train]
test_bi = [feature(w) for w in test]
valid_bi = [feature(w) for w in validation]

In [None]:
train_y = [math.log2(d['hours']+1) for d in train] ### Transform hours 
test_y = [math.log2(d['hours']+1) for d in test] ### Transform hours 
valid_y = [math.log2(d['hours']+1) for d in validation]

In [None]:
### Regularized regression of unigram
for nl in l:
    clf = linear_model.Ridge(nl, fit_intercept=False) # MSE + 1.0 l2
    clf.fit(train_uni, train_y)
    theta = clf.coef_
    predictions = clf.predict(valid_uni)
    MSE = metrics.mean_squared_error(valid_y, predictions)
    print('Unigram','Regularization term:', nl, 'MSE:', MSE)

In [None]:
### MSE of the testset
clf = linear_model.Ridge(0.01, fit_intercept=False) # MSE + 1.0 l2
clf.fit(train_uni, train_y)
theta = clf.coef_
predictions = clf.predict(test_uni)
MSE = metrics.mean_squared_error(test_y, predictions)
print('testMSE:', MSE)

In [None]:
### Regularized regression of bigram
for nl in l:
    clf = linear_model.Ridge(nl, fit_intercept=False) # MSE + 1.0 l2
    clf.fit(train_bi, train_y)
    theta = clf.coef_
    predictions = clf.predict(valid_bi)
    MSE = metrics.mean_squared_error(valid_y, predictions)
    print('Bigram','Regularization term:', nl, 'MSE:', MSE)

In [None]:
### MSE of the testset
clf = linear_model.Ridge(0.01, fit_intercept=False) # MSE + 1.0 l2
clf.fit(train_bi, train_y)
theta = clf.coef_
predictions = clf.predict(test_bi)
MSE = metrics.mean_squared_error(test_y, predictions)
print('testMSE:', MSE)

In [None]:
### Removing punctuation vs. preserving it: removing it
wordId = dict(zip(words, range(len(words)))) # what is the rank of this word in the top 1000 words
wordSet = set(words)

def feature(datum):
  feat = [0]*len(words)
  r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
  for w in r.split():
    if w in words:
      feat[wordId[w]] += 1
  feat.append(1) #offset
  return feat
train_nopunc = [feature(w) for w in train]
test_nopunc = [feature(w) for w in test]
valid_nopunc = [feature(w) for w in validation]

In [None]:
### Removing punctuation vs. preserving it: preserving it
wordId = dict(zip(words, range(len(words)))) # what is the rank of this word in the top 1000 words
wordSet = set(words)

def feature(datum):
  feat = [0]*len(words)
  r = ''.join([c for c in datum['text'].lower()])
  for w in r.split():
    if w in words:
      feat[wordId[w]] += 1
  feat.append(1) #offset
  return feat
train_punc = [feature(w) for w in train]
test_punc = [feature(w) for w in test]
valid_punc = [feature(w) for w in validation]

In [None]:
### Regularized regression of removing punctuation
for nl in l:
    clf = linear_model.Ridge(nl, fit_intercept=False) # MSE + 1.0 l2
    clf.fit(train_nopunc, train_y)
    theta = clf.coef_
    predictions = clf.predict(valid_nopunc)
    MSE = metrics.mean_squared_error(valid_y, predictions)
    print('Removing punctuation,','Regularization term:', nl, 'MSE:', MSE)

In [None]:
### MSE of the testset
clf = linear_model.Ridge(0.01, fit_intercept=False) # MSE + 1.0 l2
clf.fit(train_nopunc, train_y)
theta = clf.coef_
predictions = clf.predict(test_nopunc)
MSE = metrics.mean_squared_error(test_y, predictions)
print('testMSE:', MSE)

In [None]:
### Regularized regression of preserving punctuation
for nl in l:
    clf = linear_model.Ridge(nl, fit_intercept=False) # MSE + 1.0 l2
    clf.fit(train_punc, train_y)
    theta = clf.coef_
    predictions = clf.predict(valid_punc)
    MSE = metrics.mean_squared_error(valid_y, predictions)
    print('Preserving punctuation,','Regularization term:', nl, 'MSE:', MSE)

In [None]:
### MSE of the testset
clf = linear_model.Ridge(0.01, fit_intercept=False) # MSE + 1.0 l2
clf.fit(train_punc, train_y)
theta = clf.coef_
predictions = clf.predict(test_punc)
MSE = metrics.mean_squared_error(test_y, predictions)
print('testMSE:', MSE)

In [None]:
### tfidf scores vs. word counts: word counts 
wordId = dict(zip(words, range(len(words)))) # what is the rank of this word in the top 1000 words
wordSet = set(words)

def feature(datum):
  feat = [0]*len(words)
  r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
  for w in r.split():
    if w in words:
      feat[wordId[w]] += 1
  feat.append(1) #offset
  return feat
train_tf = [feature(w) for w in train]
test_tf = [feature(w) for w in test]
valid_tf = [feature(w) for w in validation]

In [None]:
### tfidf scores vs. word counts: tfidf scores
### Calculate IDF and TF

def feature(datum):
    word_idx_map = {w:i for i,w in enumerate(words)} 
    docCount = defaultdict(int)
    TF = []
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    tokens = nltk.word_tokenize(r)
    new_array = [0]*len(words)
    for w in words: ## IDF
        if w in tokens:
            docCount[w] += 1
        else: docCount[w] += 0
    for w1 in tokens: ## TF
        if w1 in words:
            new_array[word_idx_map[w1]] += 1 
    TF.append(new_array)
    IDF = [math.log10(N / docCount[w]) if docCount[w] != 0 else 0 for w in docCount]
    for n in np.arange(0,len(IDF)):
        TFIDF.append(TF[0][n]*IDF[n])
    return TFIDF

train_tfidf = [feature(w) for w in train]
test_tfidf = [feature(w) for w in test]
valid_tfidf = [feature(w) for w in validation]

In [None]:
### Regularized regression of word count
for nl in l:
    clf = linear_model.Ridge(nl, fit_intercept=False) # MSE + 1.0 l2
    clf.fit(train_tf, train_y)
    theta = clf.coef_
    predictions = clf.predict(valid_tf)
    MSE = metrics.mean_squared_error(valid_y, predictions)
    print('Word count,','Regularization term:', nl, 'MSE:', MSE)

In [None]:
### MSE of the testset
clf = linear_model.Ridge(0.01, fit_intercept=False) # MSE + 1.0 l2
clf.fit(train_tf, train_y)
theta = clf.coef_
predictions = clf.predict(test_tf)
MSE = metrics.mean_squared_error(test_y, predictions)
print('testMSE:', MSE)

In [None]:
### Regularized regression of TFIDF
for nl in l:
    clf = linear_model.Ridge(nl, fit_intercept=False) # MSE + 1.0 l2
    clf.fit(train_tfidf, train_y)
    theta = clf.coef_
    predictions = clf.predict(valid_tfidf)
    MSE = metrics.mean_squared_error(valid_y, predictions)
    print('TFIDF,','Regularization term:', nl, 'MSE:', MSE)

In [None]:
### MSE of the testset
clf = linear_model.Ridge(0.01, fit_intercept=False) # MSE + 1.0 l2
clf.fit(train_tfidf, train_y)
theta = clf.coef_
predictions = clf.predict(test_tfidf)
MSE = metrics.mean_squared_error(test_y, predictions)
print('testMSE:', MSE)