In [32]:
import torch
import re
import csv
import pandas as pd
import nltk
import random
import numpy as np

In [33]:
def clean(text):
    text = text.lower()
    text = re.sub(r"[^a-zäöüÄÖÜß ]", "", text)
    return text

In [34]:
tokenize = lambda x: x.split()

In [35]:
test_df = pd.read_csv('gluten_test_clean.csv', header=0, names=["raw", "label", "glutenfrei", "glutenhaltig", "clean"])

In [36]:
train_df = pd.read_csv('gluten_training_clean.csv', header=0, names=["raw", "label", "clean"])

In [37]:
len(train_df[train_df["label"] == "glutenfrei"])

5699

In [38]:
len(train_df[train_df["label"] == "glutenhaltig"])

3484

### Build dataset

In [39]:
from torchtext.data import Field, TabularDataset
from torchtext.vocab import Vectors

In [40]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [41]:
CLEAN = Field(sequential=True, use_vocab=True, tokenize=tokenize, batch_first = True)
LABEL = Field(sequential=False, use_vocab=True)

In [42]:
fields = {'clean': ('c', CLEAN), 'label': ('l', LABEL)}

In [43]:
train_data, test_data = TabularDataset.splits(
    path='./', 
    train='gluten_training_clean.csv',
    test='gluten_test_clean.csv',
    format='csv',
    fields=fields
)

In [124]:
train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [45]:
vec = Vectors('GloVe_ge.txt', cache='./GloVe_vec/')

In [46]:
EMB_SIZE = len(vec['gluten'])

In [47]:
vec.dim

300

In [48]:
MAX_VOCAB_SIZE = 25000

CLEAN.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = vec, 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

In [50]:
CLEAN.vocab.__dict__.values

<function dict.values>

In [51]:
import torchtext

In [120]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

In [160]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.conv_0 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[0], embedding_dim))
        
        self.conv_1 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[1], embedding_dim))
        
        self.conv_2 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[2], embedding_dim))
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3))
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3))
        conved_2 = F.relu(self.conv_2(embedded).squeeze(3))
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [107]:
# class SimpleLSTMBaseline(nn.Module):
#     def __init__(self, hidden_dim, emb_dim=300, num_linear=1):
#         super().__init__() # don't forget to call this!
#         self.embedding = nn.Embedding(len(CLEAN.vocab), emb_dim)
#         self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1)
#         self.linear_layers = []
#         for _ in range(num_linear - 1):
#             self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
#             self.linear_layers = nn.ModuleList(self.linear_layers)
#         self.predictor = nn.Linear(hidden_dim, 6)

#     def forward(self, seq):
#         hdn, _ = self.encoder(self.embedding(seq))
#         feature = hdn[-1, :, :]
#         for layer in self.linear_layers:
#             feature = layer(feature)
#             preds = self.predictor(feature)
#             return preds



In [108]:
em_sz = 100
nh = 500
nl = 3
model = SimpleLSTMBaseline(nh, emb_dim=em_sz)

In [161]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars

    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var)
#             print('xxx ----> ', x.dim())
            if self.y_vars is not None:
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
                print('yyy ----> ', y.dim())
            else:
                y = torch.zeros((1))

            yield (x, y)

    def __len__(self):
        return len(self.dl)

train_dl = BatchWrapper(train_iter, "c", "l")
# valid_dl = BatchWrapper(val_iter, "c", "l")
# test_dl = BatchWrapper(test_iter, "c", None)

In [162]:
BATCH_SIZE = 64
VOCAB = len(CLEAN.vocab)

In [167]:
model = CNN(VOCAB, EMB_SIZE, 128, [1, 2, 3], 1, 0.5)

In [168]:
import tqdm
import torch.optim as optim

In [169]:
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCELoss()
epochs = 6

In [170]:
for epoch in range(1, epochs):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x, y in tqdm.tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!
        opt.zero_grad()
        print('Y', y.shape)
        preds = model(x)
#         print(preds)
        print('Dim', preds.shape)

        loss = loss_func(y, preds)
        loss.backward()
        opt.step()

        running_loss += loss.data[0] * x.size(0)

    epoch_loss = running_loss / len(trn)

    # calculate the validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_func(y, preds)
        val_loss += loss.data[0] * x.size(0)

    val_loss /= len(vld)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

  0%|          | 0/101 [00:00<?, ?it/s]

yyy ---->  2
Y torch.Size([64, 1])
Dim torch.Size([64, 1])





RuntimeError: the derivative for 'target' is not implemented