In [38]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.utils.data as utils
import getData
torch.manual_seed(1)

class TextCNN():
    """Sentence classifier: a stack of 2-D convolutions over embedded sentences.

    The object is a plain container (not an ``nn.Module``) bundling:

    * ``makeEmbeddings`` - turns raw sentences into a dense input tensor,
    * ``makeModel``      - builds the ``nn.Sequential`` network and optimizer,
    * ``train``          - runs the optimisation loop over ``self.XY``,
    * ``checkAccuracy``  - reports classification accuracy over ``self.XY``.

    Args:
        senLen: number of word slots per sentence (height of each input
            matrix); used to size the first fully connected layer.
        numClasses: number of output classes (width of the final layer).
        indexSize: vocabulary size; stored but not read by this class.
        embeddingSize: dimensionality of each word vector (width of each
            input matrix and of the first convolution kernel).
        kernelSizes: stored but not read by ``makeModel``, which uses fixed
            kernels of height 3.
        numFilters: stored but not read by ``makeModel``, which uses fixed
            channel counts of 64 and 128.
        XY: iterable of ``(x, y)`` batches that also supports ``len()``,
            e.g. a ``DataLoader``; can be replaced later with ``setXY``.
    """

    def __init__(self, senLen, numClasses, indexSize,
                embeddingSize, kernelSizes, numFilters,XY):
        self.senLen = senLen
        self.numClasses = numClasses
        self.indexSize = indexSize
        self.embeddingSize = embeddingSize
        self.kernelSizes = kernelSizes
        self.numFilters = numFilters
        # Embedding table created by the most recent makeEmbeddings call.
        self.embeddings = None
        self.XY = XY
        # Network and optimizer are created by makeModel.
        self.model = None
        self.lossFn = nn.CrossEntropyLoss().type(torch.FloatTensor)
        self.opt = None

    def makeEmbeddings(self,numWords,index,X,senWords):
        """Embed every sentence of ``X`` into a fixed-size tensor.

        A fresh, randomly initialised ``nn.Embedding`` table is drawn on each
        call and kept in ``self.embeddings``. The table is not registered with
        the optimizer, so the lookups run under ``torch.no_grad()``: the
        returned tensor is plain data and carries no autograd history.

        Args:
            numWords: number of rows in the embedding table.
            index: mapping from word (str) to integer row id.
            X: sequence of whitespace-separated sentences.
            senWords: number of word slots per sentence; shorter sentences
                are padded with row id 0.

        Returns:
            ``torch.float64`` tensor of shape
            ``(len(X), 1, senWords, self.embeddingSize)``.

        Raises:
            KeyError: a word is missing from ``index``.
            IndexError: a sentence has more than ``senWords`` words.
        """
        table = nn.Embedding(numWords,self.embeddingSize,sparse=True)
        self.embeddings = table
        inputData = torch.zeros([len(X),1,senWords,self.embeddingSize],dtype=torch.float64)
        with torch.no_grad():
            for row,sentence in enumerate(X):
                wordIds = [index[word] for word in sentence.split()]
                if len(wordIds) > senWords:
                    raise IndexError(
                        "sentence %d has %d words but only %d slots are available"
                        % (row, len(wordIds), senWords))
                # Unused trailing slots keep id 0, which acts as the padding row.
                senIndex = torch.zeros(senWords,dtype=torch.long)
                senIndex[:len(wordIds)] = torch.tensor(wordIds,dtype=torch.long)
                inputData[row,0] = table(senIndex)
        return inputData

    @staticmethod
    def _convOutputHeight(senLen):
        """Return the feature-map height left after the convolutional stack."""
        height = senLen
        # Two identical stages: conv(3) -> conv(3) -> max-pool(2), no padding.
        for _ in range(2):
            height = (height - 2 - 2) // 2
        if height < 1:
            raise ValueError(
                "senLen=%r is too short for the convolutional stack (minimum is 16)"
                % (senLen,))
        return height

    def makeModel(self,lr):
        """Build the network in ``self.model`` and an RMSprop optimizer in ``self.opt``.

        The width of the first fully connected layer is derived from
        ``self.senLen`` and the output width from ``self.numClasses``
        (5248 and 2 respectively for ``senLen=177, numClasses=2``).

        Args:
            lr: learning rate passed to ``optim.RMSprop``.

        Raises:
            ValueError: ``self.senLen`` is too small for the conv/pool stack.
        """
        # The first kernel spans the full embedding width, so the feature map
        # is one column wide from then on; 128 is the final channel count.
        flatFeatures = 128 * self._convOutputHeight(self.senLen)
        self.model = nn.Sequential(
            nn.Conv2d(1,64,kernel_size=(3,self.embeddingSize),stride=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(64),
            nn.Dropout(p=0.2),
            nn.Conv2d(64,64,(3,1),stride=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(64),
            nn.Dropout(p=0.2),
            nn.MaxPool2d(kernel_size=(2,1)),
            nn.Conv2d(64,128,(3,1),stride=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(128),
            nn.Dropout(p=0.2),
            nn.Conv2d(128,128,(3,1),stride=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(128),
            nn.Dropout(p=0.2),
            nn.MaxPool2d(kernel_size=(2,1)),
            Flatten(),
            nn.Linear(flatFeatures,1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024,self.numClasses),
        )
        self.opt = optim.RMSprop(self.model.parameters(),lr=lr)

    def train(self,numEpochs=1):
        """Optimise ``self.model`` over the batches in ``self.XY``.

        Prints the batch count at the start of every epoch and the current
        loss every 100 batches.

        Args:
            numEpochs: number of full passes over ``self.XY``.
        """
        for _ in range(numEpochs):
            self.model.train()
            print("LEN XY",len(self.XY))
            for i,(x,y) in enumerate(self.XY):
                scores = self.model(x.float())
                loss = self.lossFn(scores,y.long())
                if (i + 1) % 100 == 0:
                    # .item() reads the scalar; indexing a 0-dim tensor is an error.
                    print("i = %d, loss = %.4f" % (i + 1, loss.item()))
                self.opt.zero_grad()
                loss.backward()
                self.opt.step()
            print('starting new epoch')

    def setXY(self,XY):
        """Replace the batch iterable used by ``train`` and ``checkAccuracy``."""
        self.XY=XY

    def checkAccuracy(self):
        """Print and return the accuracy of ``self.model`` over ``self.XY``.

        The model is switched to evaluation mode and gradients are disabled
        for the forward passes.

        Returns:
            Fraction of correctly classified samples as a float in [0, 1];
            0.0 when ``self.XY`` yields no samples.
        """
        numCorrect = 0
        numSamples = 0
        self.model.eval()
        with torch.no_grad():
            for x,y in self.XY:
                scores = self.model(x.float())
                _,preds = scores.cpu().max(1)
                # Accumulate Python ints rather than tensors.
                numCorrect += (preds == y.long().cpu()).sum().item()
                numSamples += preds.size(0)
        acc = float(numCorrect)/numSamples if numSamples else 0.0
        print('Got %d / %d correct (%.2f)' % (numCorrect, numSamples, 100 * acc))
        return acc

class Flatten(nn.Module):
    """Collapse each sample of a 4-D batch into a single feature vector."""

    def forward(self, x):
        """Return ``x`` viewed as ``(batch, -1)``.

        ``x`` must have exactly four dimensions; unpacking its size into four
        names raises ``ValueError`` for any other rank.
        """
        batch, _channels, _height, _width = x.size()
        flattened = x.view(batch, -1)
        return flattened


In [39]:
# Dimensions shared by the TextCNN constructor and makeEmbeddings. Each value
# is passed to both calls, so it is named once here to keep them in sync.
SEN_LEN = 177        # word slots per sentence
VOCAB_SIZE = 40690   # rows in the embedding table
EMBEDDING_SIZE = 10  # length of each word vector

print('calling getData...')
X,index,XY,Y = getData.main()
print('making TextCNN...')
tNN = TextCNN(
    senLen=SEN_LEN,
    numClasses=2,
    indexSize=VOCAB_SIZE,
    embeddingSize=EMBEDDING_SIZE,
    kernelSizes=[3,4,5],
    numFilters=3,
    XY=XY,
)
print('making embeddings...')
inputData = tNN.makeEmbeddings(VOCAB_SIZE,index,X,SEN_LEN)


calling getData...
making TextCNN...
making embeddings...


In [40]:
# Build the network and its optimizer before wiring up the data.
print('making model...')
tNN.makeModel(lr=1e-4)
print(type(inputData), type(Y))

# Pair each embedded sentence with its label and serve them in batches of 64.
my_dataset = utils.TensorDataset(inputData, Y)
print("Y LEN", len(Y))
# NOTE(review): no shuffle=True is passed, so batches follow dataset order on
# every epoch - confirm that is intended for training.
my_dataloader = utils.DataLoader(dataset=my_dataset, batch_size=64)
print("dataloader len:", len(my_dataloader))

# From here on train() and checkAccuracy() iterate over this loader.
tNN.setXY(XY=my_dataloader)

making model...
<class 'torch.Tensor'> <class 'torch.Tensor'>
Y LEN 64720
dataloader len: 1012


In [41]:
# One full pass over the batches currently attached to tNN.
print('calling train...')
tNN.train(numEpochs=1)

calling train...
LEN XY 1012




i = 100, loss = 0.7191
i = 200, loss = 0.7447
i = 300, loss = 0.7010
i = 400, loss = 0.6755
i = 500, loss = 0.6962
i = 600, loss = 0.6630
i = 700, loss = 0.6967
i = 800, loss = 0.6790
i = 900, loss = 0.6872
i = 1000, loss = 0.6841
starting new epoch


In [42]:
# checkAccuracy iterates over tNN.XY. When the cells are run in order, that is
# the same DataLoader train() just consumed, so the figure printed below is
# training-set accuracy rather than a held-out score.
print('checking accuracy...')
tNN.checkAccuracy()

checking accuracy...
Got 34223 / 64720 correct (52.88)
