In [1]:
import re
import random
from math import exp, log
from datetime import datetime
from operator import itemgetter

In [2]:
def clean(s):
    """Normalise text: keep only runs of word characters, single-space them, lower-case.

    Everything that is not matched by ``\\w+`` (punctuation, whitespace runs)
    acts as a separator and is dropped.
    """
    tokens = re.findall(r'\w+', s, flags=re.UNICODE)
    normalised = ' '.join(tokens)
    return normalised.lower()

In [3]:
def get_data_tsv(dataset, opts):
    """Stream ``(label, id, features)`` tuples from a tab-separated file.

    The first line of the file is treated as a header and skipped; blank
    lines are ignored. Rows with exactly three fields are read as
    ``id, label, text`` (training layout). Any other row is read as
    ``id, text`` and given the placeholder label ``1`` (test layout).

    Parameters
    ----------
    dataset : str
        Path of the TSV file; it is read as bytes and decoded as UTF-8.
    opts : dict
        Uses ``opts['D']`` (size of the hashed feature space),
        ``opts['clean']`` (run ``clean`` on the text first) and
        ``opts['2grams']`` (append hashed bigram features).

    Yields
    ------
    tuple
        ``(label, id, features)`` where ``features`` is a list of
        ``(index, 1)`` pairs with ``0 <= index < opts['D']``.

    Raises
    ------
    IndexError
        If a non-blank row has no text column.
    ValueError
        If the label of a three-field row is not an integer.

    Notes
    -----
    Feature indices come from the builtin ``hash``, which is salted per
    interpreter process for strings unless ``PYTHONHASHSEED`` is fixed, so
    weights learned in one kernel session are only meaningful in that session.
    """
    hash_space = opts['D']
    # The context manager guarantees the handle is released when the generator
    # is exhausted or closed, instead of relying on garbage collection.
    with open(dataset, 'rb') as infile:
        for line_nr, line in enumerate(infile):
            if line_nr == 0:
                continue  # header row
            stripped = line.decode('utf-8').strip()
            if not stripped:
                continue  # tolerate blank lines (e.g. a trailing newline)
            r = stripped.split('\t')
            row_id = r[0]

            # Pick the text column from the row layout explicitly rather than
            # through a bare try/except.
            if len(r) == 3:  # train set
                text = r[2]
                label = int(r[1])
            else:  # test set
                text = r[1]
                label = 1

            if opts['clean']:
                text = clean(text)

            features = [(hash(f) % hash_space, 1) for f in text.split()]

            if opts['2grams']:
                # Bigrams are built from adjacent *unigram* indices only; the
                # list is materialised before extending so the new entries are
                # not themselves paired up.
                features.extend([
                    (hash(str(features[i][0]) + str(features[i + 1][0])) % hash_space, 1)
                    for i in range(len(features) - 1)
                ])
            yield label, row_id, features

In [4]:
def dot_product(features, weights):
    """Return the sparse dot product of ``features`` with ``weights``.

    ``features`` is an iterable of ``(index, value)`` pairs; each contributes
    ``weights[index] * value``. An empty feature list gives ``0``.
    """
    return sum(weights[pair[0]] * pair[1] for pair in features)

In [5]:
def train_tron(dataset, opts):
    """Train a perceptron over hashed features and return its weight list.

    The dataset is streamed once per pass through ``get_data_tsv``. A sample
    is predicted positive when its dot product with the weights exceeds 0.5;
    on a misprediction every active feature weight is moved by
    ``learning_rate * error * log(1 + value)``. A progress line is printed
    after each pass.

    Parameters
    ----------
    dataset : str
        Path handed to ``get_data_tsv``.
    opts : dict
        Uses ``D`` (number of weights), ``random_init`` (random vs. all-zero
        start), ``n_passes`` (maximum passes), ``learning_rate`` and
        ``errors_satisfied`` (halt when a pass makes fewer errors than this,
        or none at all), plus whatever ``get_data_tsv`` reads.

    Returns
    -------
    list of float
        The learned weights, of length ``opts['D']``.
    """
    start = datetime.now()
    print("\nPass\t\tError\t\tAverage\t\tNr.Samples\tSince Start")

    if opts["random_init"]:
        random.seed(3003)
        # Draw an independent value per weight. ``[random.random()] * D`` would
        # repeat one single draw D times, which is just a constant init.
        weights = [random.random() for _ in range(opts['D'])]
    else:
        weights = [0.] * opts['D']

    learning_rate = opts['learning_rate']
    for pass_nr in range(opts['n_passes']):
        error_counter = 0
        n_samples = 0  # counted explicitly so an empty dataset cannot leave it unbound
        for label, _row_id, features in get_data_tsv(dataset, opts):
            n_samples += 1
            dp = dot_product(features, weights) > 0.5
            error = label - dp  # bool acts as 0/1, so error is -1, 0 or +1 for 0/1 labels
            if error != 0:
                error_counter += 1
                for index, value in features:
                    weights[index] += learning_rate * error * log(1. + value)

        # Guard the division for an empty dataset.
        if n_samples:
            accuracy = round(1 - error_counter / float(n_samples), 5)
        else:
            accuracy = 0.0
        print("%s\t\t%s\t\t%s\t\t%s\t\t%s" % (
            pass_nr + 1,
            error_counter,
            accuracy,
            n_samples,
            datetime.now() - start))
        if error_counter == 0 or error_counter < opts['errors_satisfied']:
            print('%s errors found during training, halting' % error_counter)
            break
    return weights

In [6]:
def test_tron(dataset, weifgrs, opts):
    """Score every row of ``dataset`` with a trained weight list.

    Parameters
    ----------
    dataset : str
        Path handed to ``get_data_tsv``.
    weifgrs : list of float
        The trained weights. The misspelled name is kept so existing keyword
        callers keep working; the function now uses this argument instead of
        reading a notebook-global ``weights``.
    opts : dict
        Options forwarded to ``get_data_tsv``.

    Returns
    -------
    list of list
        One ``[id, predicted_label, raw_score, scaled_score]`` entry per row,
        where ``predicted_label`` is 1 when ``raw_score > 0.5`` and
        ``scaled_score`` is the raw score min-max scaled to ``[0, 1]`` over
        all rows. When every raw score is equal, ``scaled_score`` is 0.5.
        An empty dataset returns ``[]``.

    A summary line is printed with the number of rows whose predicted label
    differs from the label yielded by ``get_data_tsv``.
    """
    weights = weifgrs  # use the argument, not whatever `weights` is in the kernel
    start = datetime.now()
    print("\nTesting online\nErrors\tAverage\t\tNr. Samples\tSince Start")
    preds = []
    error_counter = 0
    for label, row_id, features in get_data_tsv(dataset, opts):
        dotp = dot_product(features, weights)
        predicted = 1 if dotp > 0.5 else 0
        preds.append([row_id, predicted, dotp])
        if label != predicted:
            error_counter += 1

    n_samples = len(preds)
    # Guard the division for an empty dataset.
    if n_samples:
        accuracy = round(1 - error_counter / float(n_samples), 5)
    else:
        accuracy = 0.0
    print('%s\t\t%s\t\t%s\t\t%s' % (
        error_counter,
        accuracy,
        n_samples,
        datetime.now() - start))

    if preds:
        max_dotp = max(preds, key=itemgetter(2))[2]
        min_dotp = min(preds, key=itemgetter(2))[2]
        spread = float(max_dotp - min_dotp)
        for p in preds:
            # A zero spread (single row, or all scores equal) would divide by
            # zero; fall back to the midpoint of the [0, 1] range.
            p.append((p[2] - min_dotp) / spread if spread else 0.5)
    print("Done testing in %s" % str(datetime.now() - start))
    return preds

In [7]:
opts = {}
opts["D"] = 2 ** 25
opts["learning_rate"] = 0.1
opts["n_passes"] = 80 # Maximum number of passes to run before halting
opts["errors_satisfied"] = 0 # Halt when training errors < errors_satisfied
opts["random_init"] = False # set random weights, else set all 0
opts["clean"] = True # clean the text a little
opts["2grams"] = True # add 2grams

#training and saving model into weights
%time weights = train_tron("./data/labeledTrainData.tsv",opts)


Pass		Error		Average		Nr.Samples	Since Start
1		5676		0.77296		25000		0:00:13.409250
2		3132		0.87472		25000		0:00:25.795160
3		2195		0.9122		25000		0:00:38.096866
4		1716		0.93136		25000		0:00:50.441805
5		1199		0.95204		25000		0:01:02.441543
6		916		0.96336		25000		0:01:14.485466
7		856		0.96576		25000		0:01:26.523127
8		748		0.97008		25000		0:01:38.494036
9		415		0.9834		25000		0:01:50.359182
10		495		0.9802		25000		0:02:02.202769
11		361		0.98556		25000		0:02:14.181504
12		359		0.98564		25000		0:02:25.988393
13		322		0.98712		25000		0:02:37.798253
14		246		0.99016		25000		0:02:49.602687
15		199		0.99204		25000		0:03:01.381594
16		137		0.99452		25000		0:03:13.172701
17		184		0.99264		25000		0:03:25.029706
18		126		0.99496		25000		0:03:36.806029
19		79		0.99684		25000		0:03:48.639504
20		75		0.997		25000		0:04:00.593168
21		73		0.99708		25000		0:04:12.846513
22		48		0.99808		25000		0:04:24.978029
23		83		0.99668		25000		0:04:36.800725
24		71		0.99716		25000		0:04:48.905287
25		59		0

In [8]:
# Score the test file with the trained weights.
# NOTE: get_data_tsv gives every row that does not have exactly three fields
# the placeholder label 1. If testData.tsv carries no label column, the
# "Errors"/"Average" figures printed here only measure agreement with that
# placeholder, not real accuracy.
%time preds = test_tron("./data/testData.tsv",weights,opts)


Testing online
Errors	Average		Nr. Samples	Since Start
12859		0.48564		25000		0:00:12.385107
Done testing in 0:00:12.423152
Wall time: 12.4 s


In [9]:
# Write the submission file: a header line, then one "id,scaled_score" row per
# prediction in sorted order. The file is opened in binary mode, so each line
# is encoded to UTF-8 by hand and newlines are written exactly as "\n".
with open("./data/submit_perceptron.csv", "wb") as outfile:
    header = '"id","sentiment"\n'
    outfile.write(header.encode('utf-8'))
    for p in sorted(preds):
        row = "{},{}\n".format(p[0], p[3])  # p[3] is the min-max scaled score
        outfile.write(row.encode('utf-8'))