In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.utils.data as utils
import getDataMedian

In [13]:
senLen =7
indexSize = None
embeddingSize = 128
kernelSizes = [3,4,5]
numFilters = 3
embeddings = None
XY = None
model = None
lossFn = nn.CrossEntropyLoss().type(torch.FloatTensor)
opt = None

In [14]:
def makeEmbeddings(numWords,index,X,senWords,embed):
    inputData = torch.zeros([len(X),1,senWords,embeddingSize],dtype=torch.float64)
    for i,sen in enumerate(X):
        senIndex = torch.zeros([1,senWords,embeddingSize],dtype=torch.float64)
        for j,word in enumerate(sen):
            senIndex[0,j]= embed[j-1]
        inputData[i][0] = Variable(torch.DoubleTensor(senIndex[0]))
    return inputData

def makeModel(lr):
    model = nn.ModuleList([nn.Conv2d(1,200,(3,embeddingSize)),nn.Conv2d(200,300,(4,embeddingSize),nn.Conv2d(300,500),(5,1))])
    '''model = nn.Sequential(
            nn.Conv2d(1,200,kernel_size=(3,embeddingSize),stride=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(200),
            nn.Dropout(p=0.2),
            nn.Conv2d(200,300,(4,1),stride=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(300),
            nn.Dropout(p=0.2),
            nn.Conv2d(300,500,(5,1),stride=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(500),
            nn.Dropout(p=0.2),
            nn.MaxPool2d((50,1)),
            Flatten(), 
            nn.Linear(500,2),
    )'''
    opt = optim.Adam(model.parameters(),lr=lr)
    return model,opt



def train(numEpochs=1):
    print("Model",model)
    bestAcc = 0
    for epoch in range(numEpochs):
        print("Epoch #",epoch)
        total_loss = 0
        model.train()
        for i,(x,y) in enumerate(XY):
            xVar = Variable(x).float()
            yVar = Variable(y).type(torch.LongTensor)
            scores = model(xVar)
            loss = lossFn(scores,yVar)
            if i == 0: 
                print("i = %d, loss = %.4f" % (i + 1, loss.data[0]))
            if (i + 1) % 100 == 0:
                print("i = %d, loss = %.4f " % (i + 1, loss.data[0]))
            opt.zero_grad()
            loss.backward()
            total_loss += loss.data[0] 
            opt.step()
        currAcc = checkAccuracy(XY)
        if currAcc > bestAcc: 
            bestAcc = currAcc
            save_checkpoint(model,'model.pt')
        
        print("total_loss %.4f"% total_loss/float(i))
def checkAccuracy(dataSet):
    #if(self.XY.dataset.train):
    #    print('Checking accuracy on validation set')
    #else:
    #    print('Checking accuracy on test set')
    numCorrect = 0
    numSamples = 0
    model.eval()
    for i,(x,y) in enumerate(dataSet):
        xVar = Variable(x).float()
        yVar = Variable(y).type(torch.LongTensor)
        scores = model(xVar)
        _,preds = scores.data.cpu().max(1)
        numCorrect += (preds == yVar).sum()
        numSamples += preds.size(0)
    acc = float(numCorrect)/numSamples
    return acc
    print('Got %d / %d correct (%.2f)' % (numCorrect, numSamples, 100 * acc))

class Flatten(nn.Module):
    def forward(self,x):
        N,C,H,W = x.size()
        return x.view(N,-1)

In [9]:
print('calling getData...')
X,XTe,word_to_idx,XY,XYTe,Y,Yte,med = getDataMedian.runNew()

calling getData...
7.0 med
16623


In [11]:
print('making embeddings...')
from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)


making embeddings...


In [None]:
embeddings = torch.randn(len(index)+1,128)
for k,v in index.items():
    try: 
        embeddings[v-1].copy_(torch.from_numpy(word_vectors.get_vector(k)))
    except KeyError:
        continue 
embeddings[0] = 0

In [17]:
inputData = makeEmbeddings(len(index),index,X,senLen,embeddings)
inputDataTe = makeEmbeddings(len(index),index,Xte,senLen,embeddings)

In [None]:
print(list(index.keys())[0])
print(list(indexTe.keys())[0])

In [18]:
print('making model...')
model,opt=makeModel(lr=1e-4)
print(type(inputData), type(Y))
my_datasetTr = utils.TensorDataset(inputData, Y)
my_datasetTe = utils.TensorDataset(inputDataTe, Yte)
my_dataloaderTr = utils.DataLoader(my_datasetTr,batch_size=64)
my_dataloaderTe = utils.DataLoader(my_datasetTe,batch_size=64)
XY = my_dataloaderTr
XYte = my_dataloaderTe

making model...
<class 'torch.Tensor'> <class 'torch.Tensor'>


In [25]:
def save_checkpoint(state, filename='checkpoint.pth.tar'):
    torch.save(state, filename)


In [27]:
print('calling train...')
train(5)

calling train...
Model Sequential(
  (0): Conv2d(1, 200, kernel_size=(3, 300), stride=(1, 1))
  (1): ReLU(inplace)
  (2): BatchNorm2d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (3): Dropout(p=0.2)
  (4): Conv2d(200, 300, kernel_size=(4, 1), stride=(1, 1))
  (5): ReLU(inplace)
  (6): BatchNorm2d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (7): Dropout(p=0.2)
  (8): Conv2d(300, 500, kernel_size=(5, 1), stride=(1, 1))
  (9): ReLU(inplace)
  (10): BatchNorm2d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (11): Dropout(p=0.2)
  (12): MaxPool2d(kernel_size=(50, 1), stride=(50, 1), padding=0, dilation=1, ceil_mode=False)
  (13): Flatten()
  (14): Linear(in_features=500, out_features=2, bias=True)
)
Epoch # 0




i = 1, loss = 1.2123




i = 100, loss = 0.5450 
i = 200, loss = 0.2993 
i = 300, loss = 0.6867 
i = 400, loss = 0.5023 
i = 500, loss = 0.3910 
i = 600, loss = 0.3225 
i = 700, loss = 0.5441 
i = 800, loss = 0.4703 
i = 900, loss = 0.7310 
i = 1000, loss = 0.7975 
i = 1100, loss = 0.5794 
i = 1200, loss = 1.6852 


  "type " + obj.__name__ + ". It won't be checked "


PicklingError: Can't pickle <class '__main__.Flatten'>: it's not the same object as __main__.Flatten

In [28]:
print('checking training accuracy...')
checkAccuracy(XY)


checking training accuracy...


0.4509037044736945

In [None]:
save_checkpoint(model,  'model.pt')

In [15]:
print('checking validation accuracy...')
checkAccuracy(my_dataloaderTe)

checking validation accuracy...
Got 912 / 1821 correct (50.08)


In [14]:
train(5)

Epoch # 0




i = 100, loss = 9.7361
i = 200, loss = 9.6903
i = 300, loss = 9.7456
i = 400, loss = 9.6905
i = 500, loss = 9.6392
i = 600, loss = 9.6411
i = 700, loss = 9.6473
i = 800, loss = 9.6686
i = 900, loss = 9.6868
i = 1000, loss = 9.7353
i = 1100, loss = 9.6816
i = 1200, loss = 9.5365
Epoch # 1
i = 100, loss = 9.6092
i = 200, loss = 9.5382
i = 300, loss = 9.6364
i = 400, loss = 9.6277
i = 500, loss = 9.5093
i = 600, loss = 9.5198
i = 700, loss = 9.6371
i = 800, loss = 9.4575
i = 900, loss = 9.5999
i = 1000, loss = 9.6197
i = 1100, loss = 9.5490
i = 1200, loss = 9.4070
Epoch # 2
i = 100, loss = 9.5561
i = 200, loss = 9.3913
i = 300, loss = 9.5359
i = 400, loss = 9.4710
i = 500, loss = 9.4327
i = 600, loss = 9.3857
i = 700, loss = 9.4832
i = 800, loss = 9.3978
i = 900, loss = 9.5204
i = 1000, loss = 9.5192
i = 1100, loss = 9.4498
i = 1200, loss = 9.2692
Epoch # 3
i = 100, loss = 9.3439
i = 200, loss = 9.2677
i = 300, loss = 9.4322
i = 400, loss = 9.3178
i = 500, loss = 9.2859
i = 600, loss = 9.

In [16]:
print('checking training accuracy...')
checkAccuracy(XY)
print('checking validation accuracy...')
checkAccuracy(my_dataloaderTe)

checking training accuracy...
Got 34702 / 76961 correct (45.09)
checking validation accuracy...
Got 912 / 1821 correct (50.08)


In [40]:
train(50)



i = 100, loss = 2.4784
starting new epoch
i = 100, loss = 1.9646
starting new epoch
i = 100, loss = 1.4863
starting new epoch
i = 100, loss = 2.7162
starting new epoch
i = 100, loss = 2.3571
starting new epoch
i = 100, loss = 2.9301
starting new epoch
i = 100, loss = 2.1042
starting new epoch
i = 100, loss = 1.2779
starting new epoch
i = 100, loss = 1.7552
starting new epoch
i = 100, loss = 0.8972
starting new epoch
i = 100, loss = 1.5916
starting new epoch
i = 100, loss = 1.3580
starting new epoch
i = 100, loss = 1.7536
starting new epoch
i = 100, loss = 1.9626
starting new epoch
i = 100, loss = 1.1654
starting new epoch
i = 100, loss = 2.0342
starting new epoch
i = 100, loss = 2.9617
starting new epoch
i = 100, loss = 1.3506
starting new epoch
i = 100, loss = 2.2686
starting new epoch
i = 100, loss = 2.3137
starting new epoch
i = 100, loss = 1.3706
starting new epoch
i = 100, loss = 1.8966
starting new epoch
i = 100, loss = 1.7058
starting new epoch
i = 100, loss = 1.8096
starting ne

In [41]:
print('checking training accuracy...')
checkAccuracy(XY)
print('checking validation accuracy...')
checkAccuracy(my_dataloaderTe)

checking training accuracy...
Got 4954 / 8000 correct (61.92)
checking validation accuracy...
Got 142 / 2662 correct (5.33)


In [32]:
checkAccuracy(XYte)

Got 0 / 1821 correct (0.00)


In [33]:
checkAccuracy(my_dataloaderTe)

Got 0 / 1821 correct (0.00)


In [None]:
checkAccuracy(XY)

Got 0 / 76961 correct (0.00)


In [None]:
checkAccuracy(my_dataloaderTr)