In [14]:
from collections import defaultdict
from __future__ import division
import sys
import time
import numpy as np
import pandas as pd

class svector(defaultdict):

    def __init__(self, old=None):
        if old is not None:
            defaultdict.__init__(self, float, old)
        else:
            defaultdict.__init__(self, float)

    def __iadd__(self, other): # a += b
        for k, v in other.items():
            self[k] += v
        return self

    def __add__(self, other): # a + b
        new = svector()
        for k, v in self.items():
            new[k] = v
        for k, v in other.items():
            new[k] += v
        return new

    def __sub__(self, other): # a - b
        return self + (-1) * other

    def __isub__(self, other): # a -= b
        self += (-1) * other

    def __mul__(self, c): # v * c where c is scalar
        new = svector()
        for k, v in self.items():
            new[k] = v * c
        return new

    __rmul__ = __mul__ # c * v where c is scalar

    def dot(self, other): # dot product
        a, b = (self, other) if len(self) < len(other) else (other, self) # fast
        return sum(v * b[k] for k, v in a.items())

    def __neg__(self): # -a
        new = svector()
        for k, v in self.items():
            new[k] = -v
        return new

    def copy(self):
        return svector(self)
        



def read_from(textfile):
    for line in open(textfile):
        label, words = line.strip().split("\t")
        yield (1 if label=="+" else -1, words.split())

def make_vector(words, prune = []):
    v = svector()
    for word in words:
        if word not in prune:
            v[word] += 1
    v['bias'] = 1
    return v
    
def test(devfile, model):
    tot, err = 0, 0
    for i, (label, words) in enumerate(read_from(devfile), 1): # note 1...|D|
        err += label * (model.dot(make_vector(words))) <= 0
    return err/i  # i is |D| now
            
def train(trainfile, devfile, epochs=5):
    t = time.time()
    best_err = 1.
    model = svector()
    model_a = svector()
    c = 0
    for it in range(1, epochs+1):
        updates = 0
        for i, (label, words) in enumerate(read_from(trainfile), 1): # label is +1 or -1
            sent = make_vector(words)
            
            c += 1
            if label * (model.dot(sent)) <= 0:
                updates += 1
                model += label * sent
                model_a += c * label * sent
        dev_err = test(devfile, c * model - model_a)
        #dev_err = test(devfile, model)
        best_err = min(best_err, dev_err)
        print("epoch %d, update %.1f%%, dev %.1f%%" % (it, updates / i * 100, dev_err * 100))
    print("best dev err %.1f%%, |w|=%d, time: %.1f secs" % (best_err * 100, len(model), time.time() - t))
    return model_a

trainpath = 'hw4-data/train.txt'
devpath = 'hw4-data/dev.txt'
testpath = 'hw4-data/test.txt'


weight_avg = train(trainpath, devpath, 14)

epoch 1, update 39.0%, dev 31.4%
epoch 2, update 25.5%, dev 27.7%
epoch 3, update 20.8%, dev 27.2%
epoch 4, update 17.2%, dev 27.6%
epoch 5, update 14.1%, dev 27.2%
epoch 6, update 12.2%, dev 26.7%
epoch 7, update 10.5%, dev 26.3%
epoch 8, update 9.6%, dev 26.5%
epoch 9, update 8.5%, dev 26.2%
epoch 10, update 7.6%, dev 26.4%
epoch 11, update 6.2%, dev 26.8%
epoch 12, update 5.8%, dev 26.6%
epoch 13, update 6.0%, dev 26.4%
epoch 14, update 4.6%, dev 26.1%
best dev err 26.1%, |w|=15805, time: 2.2 secs


In [2]:
#top and bottom weights
weight_dict = dict(weight_avg)
weight_dict_sort = {k: v for k, v in sorted(weight_dict.items(), key=lambda item: item[1])}
text = open('weight_txt.csv', 'w')
text.write(str(weight_dict_sort))
text.close()



PermissionError: [Errno 13] Permission denied: 'weight_txt.csv'

In [3]:
from collections import Counter
def pruned(fileLoc, prune_num = 2):
    prune_list = []
    words_complete = []
    for label, words in read_from(fileLoc):
        words_complete.extend(words)
    word_counts = Counter(words_complete)
    
    for i in word_counts:
        if word_counts[i] <= prune_num:
            prune_list.append(i)
    #print(words)
    return set(prune_list)

prune_set = pruned(trainpath)

#weight_avg


def prune_train(trainfile, devfile, epochs=5):
    t = time.time()
    best_err = 1.
    model = svector()
    model_a = svector()
    c = 0
    for it in range(1, epochs+1):
        updates = 0
        for i, (label, words) in enumerate(read_from(trainfile), 1): # label is +1 or -1
            sent = make_vector(words, prune_set)
            c += 1
            if label * (model.dot(sent)) <= 0:
                updates += 1
                model += label * sent
                model_a += c * label * sent
        dev_err = test(devfile, c * model - model_a)
        #dev_err = test(devfile, model)
        best_err = min(best_err, dev_err)
        print("epoch %d, update %.1f%%, dev %.1f%%" % (it, updates / i * 100, dev_err * 100))
    print("best dev err %.1f%%, |w|=%d, time: %.1f secs" % (best_err * 100, len(model), time.time() - t))
    return model_a

weight_avg_prune = prune_train(trainpath, devpath, 10)

epoch 1, update 38.9%, dev 31.1%
epoch 2, update 28.2%, dev 29.2%
epoch 3, update 23.7%, dev 28.7%
epoch 4, update 21.7%, dev 28.0%
epoch 5, update 18.5%, dev 27.9%
epoch 6, update 17.7%, dev 26.6%
epoch 7, update 16.2%, dev 26.8%
epoch 8, update 15.2%, dev 26.6%
epoch 9, update 14.1%, dev 26.6%
epoch 10, update 12.6%, dev 26.6%
best dev err 26.6%, |w|=5934, time: 1.5 secs


In [4]:
print('pruned word count:', len(weight_avg_prune))
print('Non-pruned word count:', len(weight_avg))

pruned word count: 5865
Non-pruned word count: 13428


In [5]:
# Cashing the read_from line - 2.5
def cashed_train(trainpath, devpath, epochs=5):
    t = time.time()
    train = cashed_read_from(trainpath)
    dev = cashed_read_from(devpath)
    best_err = 1.
    model = svector()
    model_a = svector()
    c = 0
    for it in range(1, epochs+1):
        updates = 0
        for i, (label, words) in enumerate(train, 1): # label is +1 or -1
            sent = make_vector(words)
            
            c += 1
            if label * (model.dot(sent)) <= 0:
                updates += 1
                model += label * sent
                model_a += c * label * sent
        dev_err = cashed_test(dev, c * model - model_a)
        best_err = min(best_err, dev_err)
        print("epoch %d, update %.1f%%, dev %.1f%%" % (it, updates / i * 100, dev_err * 100))
    print("best dev err %.1f%%, |w|=%d, time: %.1f secs" % (best_err * 100, len(model), time.time() - t))
    return model_a

def cashed_test(dev, model):
    tot, err = 0, 0
    for k, (label, words) in enumerate(dev, 1): # note 1...|D|
        #print(k)
        err += label * (model.dot(make_vector(words))) <= 0
        f = k
    print(f) # NOT ACCESSIBLE OUTSIDE FOR LOOP??
    return err/k  # i is |D| now

def cashed_read_from(textfile):
    for line in open(textfile):
        label, words = line.strip().split("\t")
        yield (1 if label=="+" else -1, words.split())


weight_avg = cashed_train(trainpath, devpath, 5)

1000
epoch 1, update 39.0%, dev 31.4%


UnboundLocalError: local variable 'f' referenced before assignment

# 4: Try some other learning algs with sklearn

In [33]:
# TRAIN MODEL

def binarize(path, mapping, first = False):
            
    df = open(path).readlines()
    lines = [line.strip().split('\t') for line in df]
    for i in lines:
        if i[0] == '-':
            i[0] = -1
        else: 
            i[0] = 1

    target = []
    words = []


    for i in lines:
        target.append(i[0])
        words.append(i[1].split(' '))
    

    new_data = []

    for row in words:
        new_row = []
        for x in row:
            feature = x
            if feature not in mapping and first == False:
                continue
            elif feature not in mapping: 
                mapping[feature] = len(mapping)
            new_row.append(mapping[feature])
        new_data.append(new_row)

    num_features = len(mapping)
    num_data = len(new_data)

    bin_model = np.zeros((num_data, num_features))

    for idx, row in enumerate(new_data):
        for jdx in row:
            bin_model[idx][jdx] += 1

    return bin_model, target, mapping





            


In [34]:
# TRAIN
from sklearn.linear_model import LogisticRegression 



time1 = time.time()

mapping = {}
x_train, y_train, mapping = binarize(trainpath, mapping, first = True)
x_dev, y_dev, mapping = binarize(devpath, mapping)



lr = LogisticRegression(max_iter = 9000)
lr.fit(x_train, y_train)

y_predict = list(lr.predict(x_dev))

err,pos = 0,0
for i, prediction in enumerate(y_predict):
    if (prediction < 0) and (y_dev[i] < 0): # correct negative
        pass
    elif (prediction > 0) and (y_dev[i] > 0): # correct positive
        pass
    else: # incorrect
        err += 1
    if prediction > 0:
        pos += 1
        
score = err /1000 


time2 = time.time()

print("best dev err %.1f%%, |w|=%d, time: %.1f secs, with %.0f positives" % (score * 100, len(mapping), time2-time1, pos))






best dev err 25.6%, |w|=15805, time: 18.6 secs, with 500 positives


In [36]:
# TEST ON BLIND DATA AND OUTPUT DATA FILE
x_test, y_test, mapping = binarize(testpath, mapping)

y_predict = list(lr.predict(x_test))


with open('test.txt.predicted.txt', 'w') as file:
    for i,row in enumerate(y_predict):
        if i + 1 < len(y_predict):
            if row == 1:
                file.write('+' + '\n')
            else:
                file.write('-' + '\n')
        else:
            if row == 1:
                file.write('+' + '\n')
            else:
                file.write('-' + '\n')
file.close()



In [21]:
row

1

# 5 Deployment

In [7]:
# Remove stop words
import nltk
nltk.download('punkt')
from nltk.corpus import *
nltk.download('stopwords')
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tanku\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tanku\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:


def train_drop_stop(trainfile, devfile, epochs=5):
    t = time.time()
    best_err = 1.
    model = svector()
    model_a = svector()
    c = 0
    for it in range(1, epochs+1):
        updates = 0
        for i, (label, words) in enumerate(read_from(trainfile), 1): # label is +1 or -1
            text_tokens = word_tokenize(" ".join(words))

            clean_words = [word for word in text_tokens if not word in stopwords.words()]            
            
            sent = make_vector(clean_words)
            
            c += 1
            if label * (model.dot(sent)) <= 0:
                updates += 1
                model += label * sent
                model_a += c * label * sent
        dev_err = test(devfile, c * model - model_a)
        #dev_err = test(devfile, model)
        best_err = min(best_err, dev_err)
        print("epoch %d, update %.1f%%, dev %.1f%%" % (it, updates / i * 100, dev_err * 100))
    print("best dev err %.1f%%, |w|=%d, time: %.1f secs" % (best_err * 100, len(model), time.time() - t))
    return model_a

print('test')
weight_avg = train_drop_stop(trainpath, devpath, 5)



#epoch 1, update 39.3%, dev 31.0%
#epoch 2, update 23.0%, dev 27.7%
#epoch 3, update 16.8%, dev 27.2%
#epoch 4, update 12.9%, dev 26.9%
#epoch 5, update 10.2%, dev 27.1%
# drop stop length 12,304

test
epoch 1, update 39.3%, dev 31.0%
epoch 2, update 23.0%, dev 27.7%
epoch 3, update 16.8%, dev 27.2%
epoch 4, update 12.9%, dev 26.9%
epoch 5, update 10.2%, dev 27.1%
best dev err 26.9%, |w|=15487, time: 91017.2 secs


In [30]:
weight_avg

svector(float,
        {"'s": 241946.0,
         'tour': 43467.0,
         'force': -1869.0,
         ',': 27157.0,
         'written': -9042.0,
         'directed': 56931.0,
         'quietly': 15094.0,
         'implosion': 1.0,
         'rather': 218.0,
         'explosion': -8781.0,
         'fear': -3639.0,
         'bias': -58063.0,
         'places': 74622.0,
         'slightly': 25584.0,
         'believable': 53120.0,
         'love': 2514.0,
         'triangle': -15119.0,
         'difficult': -2247.0,
         'swallow': -30300.0,
         'setting': 16289.0,
         'disappointingly': -28615.0,
         'moves': 20070.0,
         'story': -81548.0,
         'realm': -41421.0,
         'improbable': 32881.0,
         'thriller': -51192.0,
         'indulgent': 14081.0,
         'sunbaked': 24006.0,
         'summery': 24006.0,
         'mind': -22449.0,
         'sex': 23991.0,
         'lucia': 24006.0,
         'may': -70792.0,
         'well': 72800.0,
         'prove': 