## GloVe Baseline

In [96]:
# Count how often each token occurs across the training posts.
# A token is a run of word characters/apostrophes or a single punctuation mark
# from ".,!?;", taken from the stripped, lower-cased post text.
word_counts = defaultdict(int)
# Iterate the Series values directly: Series.iteritems() was removed in
# pandas 2.0, and the index it returned was never used here anyway.
for post_text in train_dataset_df.selftext:
    for word in re.findall(r"[\w']+|[.,!?;]", post_text.strip().lower()):
        word_counts[word] += 1
len(word_counts)

10223

In [97]:
# Build the vocabulary lookups in both directions. Id 0 is reserved for
# '<UNK>'; every word counted at least 7 times receives the next free id,
# following the iteration order of word_counts.
frequent_words = [word for word, count in word_counts.items() if count >= 7]
token_to_word = dict(enumerate(['<UNK>'] + frequent_words))
word_to_token = {word: token for token, word in token_to_word.items()}
i = len(token_to_word)  # i ends up as the next unused token id
len(word_to_token)

2349

In [166]:
# Tokenize a post into a bag-of-words count vector
def tokenize_post(post, vocab=None):
    """Convert a post into a bag-of-words count vector.

    The text is stripped, lower-cased and split into tokens (runs of word
    characters/apostrophes, or one of the punctuation marks ``.,!?;``). Each
    token is looked up in the vocabulary; tokens that are not in it are
    counted under id 0.

    Args:
        post: Raw post text.
        vocab: Optional mapping from token string to integer id. When omitted,
            the module-level ``word_to_token`` mapping is used.

    Returns:
        A 1-D float32 tensor of length ``len(vocab)`` whose i-th entry is the
        number of tokens in the post that mapped to id ``i``. A post without
        any tokens yields an all-zero vector.
    """
    if vocab is None:
        vocab = word_to_token
    tokens = re.findall(r"[\w']+|[.,!?;]", post.strip().lower())
    token_ids = torch.tensor([vocab.get(token, 0) for token in tokens], dtype=torch.long)
    # Tally the ids directly instead of summing rows of a len(vocab) x len(vocab)
    # identity matrix; minlength keeps the vector size fixed and also makes the
    # empty-post case return zeros rather than fail on indexing.
    return torch.bincount(token_ids, minlength=len(vocab)).float()

In [208]:
# Add a bag-of-words column for every post in both the train and test splits.
for dataset_df in (train_dataset_df, test_dataset_df):
    dataset_df['tokenized_selftext'] = dataset_df['selftext'].apply(tokenize_post)

## Simple Feed Forward No Embeddings

In [200]:
class Feedforward(torch.nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Width of the hidden layer.
        output_size: Number of values produced per input.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.input_size, self.hidden_size, self.output_size = (
            input_size,
            hidden_size,
            output_size,
        )
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the raw output-layer values for ``x`` (no softmax applied)."""
        return self.fc2(self.relu(self.fc1(x)))

In [238]:
def build_features_and_ys(df, features, label_index=None):
    """Turn feature columns and labels of a dataframe into model tensors.

    Args:
        df: Dataframe holding the ``features`` columns and a ``label`` column.
        features: List of column names. For every row the values of these
            columns are flattened and concatenated into one feature vector.
        label_index: Optional ordered sequence of labels; a label's class id
            is its first position in this sequence. When omitted, the sorted
            unique labels of the module-level ``train_dataset_df`` are used.

    Returns:
        A tuple ``(xs, ys)``: ``xs`` is a float32 tensor with one row per
        dataframe row, ``ys`` is an int64 tensor of class ids. An empty
        dataframe yields an empty ``xs`` of shape (0, 0).

    Raises:
        ValueError: If a label in ``df`` is not present in ``label_index``.
    """
    if label_index is None:
        label_index = sorted(train_dataset_df.label.unique())
    # Dict lookup replaces a linear list.index scan per row; setdefault keeps
    # the first position if a label is listed more than once.
    label_to_id = {}
    for position, label in enumerate(label_index):
        label_to_id.setdefault(label, position)

    rows = [
        np.hstack([np.asarray(value) for value in row])
        for row in df[features].itertuples(index=False, name=None)
    ]
    # Stack into one ndarray first: building a tensor from a list of separate
    # ndarrays is the slow path that PyTorch warns about.
    feature_matrix = np.stack(rows) if rows else np.empty((0, 0))
    xs = torch.as_tensor(feature_matrix, dtype=torch.float32)

    try:
        ys = [label_to_id[label] for label in df.label.to_list()]
    except KeyError as missing:
        # Keep the exception type that list.index raised for unknown labels.
        raise ValueError(f"label {missing.args[0]!r} is not in label_index") from None
    return xs, torch.tensor(ys, dtype=torch.long)

def train_model(model, xs, ys, epochs=10, batch_size=10):
    """Train ``model`` on ``(xs, ys)`` with Adam and cross-entropy loss.

    Training runs on the CPU over shuffled mini-batches. After every epoch the
    sum of the batch losses is printed.

    Args:
        model: The ``torch.nn.Module`` to optimise; it is updated in place.
        xs: Tensor of inputs, one row per example.
        ys: Tensor of integer class ids aligned with ``xs``.
        epochs: Number of passes over the data.
        batch_size: Number of examples per optimisation step.

    Returns:
        The same ``model`` object, after training.
    """
    device = torch.device('cpu')
    model.to(device)
    model.train()  # make sure layers such as dropout behave in training mode
    opt = optim.Adam(model.parameters(), lr=0.001)
    loader = torch_data.DataLoader(
        torch_data.TensorDataset(xs, ys),
        batch_size=batch_size,
        shuffle=True,
    )
    loss_fn = torch.nn.CrossEntropyLoss()

    for epoch in range(epochs):
        epoch_loss = 0
        for context, label in loader:
            # Keep the batch on the same device as the model.
            context, label = context.to(device), label.to(device)
            opt.zero_grad()
            outputs = model(context)
            loss = loss_fn(outputs, label)
            loss.backward()
            opt.step()
            # Accumulate a plain float: summing the loss tensor itself would
            # keep every batch's autograd graph referenced until epoch end.
            epoch_loss += loss.item()
        print(f"EPOCH {epoch} LOSS = {epoch_loss}")
    return model

In [239]:
# Bag-of-words classifier: one input per vocabulary entry, a 512-unit hidden
# layer and one output per distinct training label.
model = Feedforward(
    input_size=len(word_to_token),
    hidden_size=512,
    output_size=len(train_dataset_df.label.unique()),
)
xs, ys = build_features_and_ys(df=train_dataset_df, features=['tokenized_selftext'])
trained_model = train_model(model=model, xs=xs, ys=ys)

EPOCH 0 LOSS = 94.05351257324219
EPOCH 1 LOSS = 62.314884185791016
EPOCH 2 LOSS = 46.70051574707031
EPOCH 3 LOSS = 34.05880355834961
EPOCH 4 LOSS = 25.592744827270508
EPOCH 5 LOSS = 26.413055419921875
EPOCH 6 LOSS = 19.499250411987305
EPOCH 7 LOSS = 18.350688934326172
EPOCH 8 LOSS = 17.992080688476562
EPOCH 9 LOSS = 17.615934371948242


## Validate

In [240]:
def get_model_accuracy(model, xs, ys):
    """Print the fraction of rows in ``xs`` whose predicted class equals ``ys``.

    The prediction for a row is the index of the largest model output. The
    forward pass runs in evaluation mode with gradient tracking disabled, and
    the model's previous train/eval mode is restored afterwards.

    Args:
        model: A ``torch.nn.Module`` returning one score per class for each row.
        xs: Tensor of inputs, one row per example.
        ys: Tensor of integer class ids aligned with ``xs``.

    Returns:
        None. The accuracy is printed.
    """
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for scoring, so skip building the graph.
        with torch.no_grad():
            predictions = model(xs).argmax(dim=1)
    finally:
        model.train(was_training)
    print(np.mean((predictions == ys).numpy()))

In [241]:
# Score the trained model on the held-out test posts.
test_xs, test_ys = build_features_and_ys(
    df=test_dataset_df,
    features=['tokenized_selftext'],
)
get_model_accuracy(model=trained_model, xs=test_xs, ys=test_ys)

0.5660377358490566


In [None]:
## Add tokenized text to df

# Count how often each token occurs across the training posts.
# A token is a run of word characters/apostrophes or a single punctuation mark
# from ".,!?;", taken from the stripped, lower-cased post text.
word_counts = defaultdict(int)
# Iterate the Series values directly: Series.iteritems() was removed in
# pandas 2.0, and the index it returned was never used here anyway.
for post_text in train_dataset_df.selftext:
    for word in re.findall(r"[\w']+|[.,!?;]", post_text.strip().lower()):
        word_counts[word] += 1
len(word_counts)

# Build the vocabulary lookups in both directions. Id 0 is reserved for
# '<UNK>'; every word counted at least 7 times receives the next free id,
# following the iteration order of word_counts.
frequent_words = [word for word, count in word_counts.items() if count >= 7]
token_to_word = dict(enumerate(['<UNK>'] + frequent_words))
word_to_token = {word: token for token, word in token_to_word.items()}
i = len(token_to_word)  # i ends up as the next unused token id
len(word_to_token)

# Tokenize a post into a bag-of-words count vector
def tokenize_post(post, vocab=None):
    """Convert a post into a bag-of-words count vector.

    The text is stripped, lower-cased and split into tokens (runs of word
    characters/apostrophes, or one of the punctuation marks ``.,!?;``). Each
    token is looked up in the vocabulary; tokens that are not in it are
    counted under id 0.

    Args:
        post: Raw post text.
        vocab: Optional mapping from token string to integer id. When omitted,
            the module-level ``word_to_token`` mapping is used.

    Returns:
        A 1-D float32 tensor of length ``len(vocab)`` whose i-th entry is the
        number of tokens in the post that mapped to id ``i``. A post without
        any tokens yields an all-zero vector.
    """
    if vocab is None:
        vocab = word_to_token
    tokens = re.findall(r"[\w']+|[.,!?;]", post.strip().lower())
    token_ids = torch.tensor([vocab.get(token, 0) for token in tokens], dtype=torch.long)
    # Tally the ids directly instead of summing rows of a len(vocab) x len(vocab)
    # identity matrix; minlength keeps the vector size fixed and also makes the
    # empty-post case return zeros rather than fail on indexing.
    return torch.bincount(token_ids, minlength=len(vocab)).float()

# Add a bag-of-words column for every post in both the train and test splits.
for dataset_df in (train_dataset_df, test_dataset_df):
    dataset_df['tokenized_selftext'] = dataset_df['selftext'].apply(tokenize_post)

## Simple Feed Forward No Embeddings

class Feedforward(torch.nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Width of the hidden layer.
        output_size: Number of values produced per input.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.input_size, self.hidden_size, self.output_size = (
            input_size,
            hidden_size,
            output_size,
        )
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the raw output-layer values for ``x`` (no softmax applied)."""
        return self.fc2(self.relu(self.fc1(x)))

def build_features_and_ys(df, features, label_index=None):
    """Turn feature columns and labels of a dataframe into model tensors.

    Args:
        df: Dataframe holding the ``features`` columns and a ``label`` column.
        features: List of column names. For every row the values of these
            columns are flattened and concatenated into one feature vector.
        label_index: Optional ordered sequence of labels; a label's class id
            is its first position in this sequence. When omitted, the sorted
            unique labels of the module-level ``train_dataset_df`` are used.

    Returns:
        A tuple ``(xs, ys)``: ``xs`` is a float32 tensor with one row per
        dataframe row, ``ys`` is an int64 tensor of class ids. An empty
        dataframe yields an empty ``xs`` of shape (0, 0).

    Raises:
        ValueError: If a label in ``df`` is not present in ``label_index``.
    """
    if label_index is None:
        label_index = sorted(train_dataset_df.label.unique())
    # Dict lookup replaces a linear list.index scan per row; setdefault keeps
    # the first position if a label is listed more than once.
    label_to_id = {}
    for position, label in enumerate(label_index):
        label_to_id.setdefault(label, position)

    rows = [
        np.hstack([np.asarray(value) for value in row])
        for row in df[features].itertuples(index=False, name=None)
    ]
    # Stack into one ndarray first: building a tensor from a list of separate
    # ndarrays is the slow path that PyTorch warns about.
    feature_matrix = np.stack(rows) if rows else np.empty((0, 0))
    xs = torch.as_tensor(feature_matrix, dtype=torch.float32)

    try:
        ys = [label_to_id[label] for label in df.label.to_list()]
    except KeyError as missing:
        # Keep the exception type that list.index raised for unknown labels.
        raise ValueError(f"label {missing.args[0]!r} is not in label_index") from None
    return xs, torch.tensor(ys, dtype=torch.long)

def train_model(model, xs, ys, epochs=10, batch_size=10):
    """Train ``model`` on ``(xs, ys)`` with Adam and cross-entropy loss.

    Training runs on the CPU over shuffled mini-batches. After every epoch the
    sum of the batch losses is printed.

    Args:
        model: The ``torch.nn.Module`` to optimise; it is updated in place.
        xs: Tensor of inputs, one row per example.
        ys: Tensor of integer class ids aligned with ``xs``.
        epochs: Number of passes over the data.
        batch_size: Number of examples per optimisation step.

    Returns:
        The same ``model`` object, after training.
    """
    device = torch.device('cpu')
    model.to(device)
    model.train()  # make sure layers such as dropout behave in training mode
    opt = optim.Adam(model.parameters(), lr=0.001)
    loader = torch_data.DataLoader(
        torch_data.TensorDataset(xs, ys),
        batch_size=batch_size,
        shuffle=True,
    )
    loss_fn = torch.nn.CrossEntropyLoss()

    for epoch in range(epochs):
        epoch_loss = 0
        for context, label in loader:
            # Keep the batch on the same device as the model.
            context, label = context.to(device), label.to(device)
            opt.zero_grad()
            outputs = model(context)
            loss = loss_fn(outputs, label)
            loss.backward()
            opt.step()
            # Accumulate a plain float: summing the loss tensor itself would
            # keep every batch's autograd graph referenced until epoch end.
            epoch_loss += loss.item()
        print(f"EPOCH {epoch} LOSS = {epoch_loss}")
    return model

# Bag-of-words classifier: one input per vocabulary entry, a 512-unit hidden
# layer and one output per distinct training label.
model = Feedforward(
    input_size=len(word_to_token),
    hidden_size=512,
    output_size=len(train_dataset_df.label.unique()),
)
xs, ys = build_features_and_ys(df=train_dataset_df, features=['tokenized_selftext'])
trained_model = train_model(model=model, xs=xs, ys=ys)

## Validate

def get_model_accuracy(model, xs, ys):
    """Print the fraction of rows in ``xs`` whose predicted class equals ``ys``.

    The prediction for a row is the index of the largest model output. The
    forward pass runs in evaluation mode with gradient tracking disabled, and
    the model's previous train/eval mode is restored afterwards.

    Args:
        model: A ``torch.nn.Module`` returning one score per class for each row.
        xs: Tensor of inputs, one row per example.
        ys: Tensor of integer class ids aligned with ``xs``.

    Returns:
        None. The accuracy is printed.
    """
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for scoring, so skip building the graph.
        with torch.no_grad():
            predictions = model(xs).argmax(dim=1)
    finally:
        model.train(was_training)
    print(np.mean((predictions == ys).numpy()))

# Score the trained model on the held-out test posts.
test_xs, test_ys = build_features_and_ys(
    df=test_dataset_df,
    features=['tokenized_selftext'],
)
get_model_accuracy(model=trained_model, xs=test_xs, ys=test_ys)