# Homework 5

In [None]:
!pip install nbconvert

## Problem 1

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### 1.1

In [None]:
# connect to google drive
from google.colab import drive
drive.mount('/content/drive')


df = pd.read_csv('/content/IMDB Dataset.csv', usecols=["review", "sentiment"], encoding='latin-1')


df.sentiment = (df.sentiment == "positive").astype("int")
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
val_size = int(df.shape[0] * 0.15)
test_size = int(df.shape[0] * 0.15)


def train_val_test_split(df=None, train_percent=0.7, test_percent=0.15, val_percent=0.15):
  df = df.sample(frac=1)
  train_df = df[: int(len(df)*train_percent)]
  test_df = df[int(len(df)*train_percent)+1 : int(len(df)*(train_percent+test_percent))]
  val_df = df[int(len(df)*(train_percent + test_percent))+1 : ]
  return train_df, test_df, val_df

train_df, test_df, val_df = train_val_test_split(df, 0.7, 0.15, 0.15)
train_labels, train_texts = train_df.values[:,1], train_df.values[:,0]
val_labels, val_texts = val_df.values[:,1], val_df.values[:,0]
test_labels, test_texts = test_df.values[:,1], test_df.values[:,0]
print(len(train_df), len(test_df), len(val_df))
print(len(train_texts), len(train_labels), len(val_df))

35000 7499 7499
35000 35000 7499


In [None]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

def process_tokens(text):
    """
    function to process tokens, replace any unwanted chars
    """
    preprocessed_text = text.lower().replace(",", "").replace(".", "").replace(":", "").replace(")", "").replace("-", "").replace("(", "")
    preprocessed_text = ''.join([i for i in preprocessed_text if not preprocessed_text.isdigit()])
    return preprocessed_text

def preprocessing(data):
    """
    preprocessing data to list of tokens
    """
    nlp = English()
    tokenizer = Tokenizer(nlp.vocab)
    preprocessed_data = []
    for sentence in data:
        sentence = process_tokens(sentence)
        tokens = tokenizer(sentence)
        tlist = []
        for token in tokens:
            tlist.append(str(token))
        preprocessed_data.append(tlist)
    return preprocessed_data

train_data = preprocessing(train_texts)
val_data = preprocessing(val_texts)
test_data = preprocessing(test_texts)

In [None]:

import itertools

## Creating a vectorizer to vectorize text and create matrix of features
## Bag of words technique
class Vectorizer():
    def __init__(self, max_features):
        self.max_features = max_features
        self.vocab_list = None
        self.token_to_index = None

    def fit(self, dataset):
        word_dict = {}
        for sentence in dataset:
            for token in sentence:
                if token not in word_dict:
                    word_dict[token] = 1
                else:
                    word_dict[token] += 1
        word_dict = dict(sorted(word_dict.items(), key=lambda item: item[1], reverse=True))
        end_to_slice = min(len(word_dict), self.max_features)
        word_dict = dict(itertools.islice(word_dict.items(), end_to_slice))
        self.vocab_list = list(word_dict.keys())
        self.token_to_index = {}
        counter = 0
        for token in self.vocab_list:
            self.token_to_index[token] = counter
            counter += 1


    def transform(self, dataset):
        data_matrix = np.zeros((len(dataset), len(self.vocab_list)))
        for i, sentence in enumerate(dataset):
            for token in sentence:
                if token in self.token_to_index:
                    data_matrix[i, self.token_to_index[token]] += 1
        return data_matrix

## max features - top k words to consider only
max_features = 2000

vectorizer = Vectorizer(max_features=max_features)
vectorizer.fit(train_data)

## Checking if the len of vocab = k
X_train = vectorizer.transform(train_data)
X_val = vectorizer.transform(val_data)
X_test = vectorizer.transform(test_data)

y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)

vocab = vectorizer.vocab_list

In [None]:
## each sequence of token is a vector of
## token indices (with the count of those words)
X_train[:5]

array([[18.,  6., 12., ...,  0.,  0.,  0.],
       [11.,  3.,  1., ...,  0.,  0.,  0.],
       [16.,  6.,  4., ...,  0.,  0.,  0.],
       [38., 15., 15., ...,  0.,  0.,  0.],
       [ 7.,  4.,  5., ...,  0.,  0.,  0.]])

In [None]:
y_train = y_train.astype('int')
y_val = y_val.astype('int')
y_test = y_test.astype('int')



In [None]:
from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train, 2)
y_test = to_categorical(y_test, 2)
y_val = to_categorical(y_val, 2)

In [None]:
X_train = X_train.reshape(-1, 1, X_train.shape[1])
X_val = X_val.reshape(-1, 1, X_val.shape[1])
X_test = X_test.reshape(-1, 1, X_test.shape[1])

y_train = y_train.reshape(-1, 2)
y_val = y_val.reshape(-1, 2)
y_test = y_test.reshape(-1, 2)

print(f'X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}')

X_train.shape: (35000, 1, 2000), y_train.shape: (35000, 2)


### 1.2

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import SimpleRNN, Dropout
from tensorflow.keras.optimizers import Adam

model = None
model = Sequential()
model.add(SimpleRNN(256, input_shape=(1, max_features)))
model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(model.summary())
history = model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history.history.keys())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_1 (SimpleRNN)    (None, 256)               577792    
                                                                 
 dense_1 (Dense)             (None, 2)                 514       
                                                                 
Total params: 578306 (2.21 MB)
Trainable params: 578306 (2.21 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])


In [None]:
score, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score)
print('Test accuracy:', acc)

Test loss: 0.5368654727935791
Test accuracy: 0.8617148995399475


### 1.3

In [None]:
from tensorflow.keras.layers import LSTM

model = None
model = Sequential()
model.add(LSTM(256, input_shape=(1, max_features)))
model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(model.summary())
history = model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history.history.keys())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 256)               2311168   
                                                                 
 dense_3 (Dense)             (None, 2)                 514       
                                                                 
Total params: 2311682 (8.82 MB)
Trainable params: 2311682 (8.82 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])


In [None]:
score, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score)
print('Test accuracy:', acc)

Test loss: 0.6114386916160583
Test accuracy: 0.8590478897094727


### 1.4

In [None]:
# getting data ready
X_train = vectorizer.transform(train_data)
X_val = vectorizer.transform(val_data)
X_test = vectorizer.transform(test_data)

y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)

In [None]:
y_train = y_train.astype('int')
y_val = y_val.astype('int')
y_test = y_test.astype('int')

In [None]:
y_train = to_categorical(y_train, 2)
y_test = to_categorical(y_test, 2)
y_val = to_categorical(y_val, 2)

In [None]:
X_train = X_train.reshape(-1, 1, X_train.shape[1])
X_val = X_val.reshape(-1, 1, X_val.shape[1])
X_test = X_test.reshape(-1, 1, X_test.shape[1])

y_train = y_train.reshape(-1, 2)
y_val = y_val.reshape(-1, 2)
y_test = y_test.reshape(-1, 2)

print(f'X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}')

X_train.shape: (35000, 1, 2000), y_train.shape: (35000, 2)


In [None]:
from tensorflow.keras.layers import GRU

model = None
model = Sequential()
model.add(GRU(256, input_shape=(1, max_features)))
model.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(model.summary())
history = model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history.history.keys())

score, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score)
print('Test accuracy:', acc)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 256)               1734144   
                                                                 
 dense_4 (Dense)             (None, 2)                 514       
                                                                 
Total params: 1734658 (6.62 MB)
Trainable params: 1734658 (6.62 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
Test loss: 0.6524178385734558
Test accuracy: 0.8597146272659302


In [None]:
# check predictions
from tensorflow.keras.backend import argmax

y_pred = model.predict(X_test)
for i in range(5):
  print(f'Label predicted: {argmax(y_pred[i]).numpy()}, Actual label: {argmax(y_test[i]).numpy()}')
  print(f'text: {test_texts[i]}')

Label predicted: 1, Actual label: 1
text: Holden and Jones SIZZLE in this movie, but not in the way we think of sizzling today -- it's very subtle and under the surface -- yet palpable. Jennifer Jones, in particular, is SO SEXUALLY HOT in this film (much more than a caricature like Monroe EVER was) because she creates a real woman -- with ALL facets of womanhood: She's intelligent, intuitive, graceful. She's desiring AND desirable. <br /><br />There's a scene on that famous hill, where she's lying down in the grass, looking up at Holden, and the expression in her eyes is X-rated, yet in the context of the scene and character, in makes complete sense. You don't need to have it all said in the dialogue -- spelled-out like the crude obviousness in most modern films. It's all there in her eyes -- sexy yet elegant. What a stunning, under-rated actress she was. (I saw her MADAME BOVARY for the first time recently and was equally blown away.) I'll take her over Bergman, Davis, or the two Hepb

### 1.5

In [None]:
from tensorflow.keras.layers import LSTM, Bidirectional

model5 = None
model5 = Sequential()
model5.add(Bidirectional(LSTM(256, input_shape=(1, max_features))))
model5.add(Dense(2, activation='softmax'))

optimizer = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,
              metrics=['accuracy'])
print(model.summary())
history = model.fit(X_train, y_train,
          batch_size=256,
          validation_data=(X_val, y_val),
          epochs=10)
print(history.history.keys())

score, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score)
print('Test accuracy:', acc)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 256)               1734144   
                                                                 
 dense_4 (Dense)             (None, 2)                 514       
                                                                 
Total params: 1734658 (6.62 MB)
Trainable params: 1734658 (6.62 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
Test loss: 1.01004159450531
Test accuracy: 0.8645152449607849


### 1.6

**Answer:**

RNN
- Test loss: 0.5368654727935791
- Test accuracy: 0.8617148995399475

LSTM
- Test loss: 0.6114386916160583
- Test accuracy: 0.8590478897094727

GRU
- Test loss: 0.6524178385734558
- Test accuracy: 0.8597146272659302

BiLSTM
- Test loss: 1.01004159450531
- Test accuracy: 0.8645152449607849


 RNN model showed the lowest test loss at 0.537 with an accuracy of 86.17%. The LSTM model had slightly higher loss and comparable accuracy, while the GRU's performance was marginally lower in both aspects. The BiLSTM, despite having the highest test loss of 1.01, achieved the best accuracy at 86.45%. This suggests that while BiLSTM models may be less efficient in minimizing loss, they could be slightly more accurate in predictions. So best performace with best accuracy is bidirectional LSTM

## Problem 2

### 2.1

In [1]:
# and put in a ``data/`` directory under the current directory.
#
# After that, let’s import some necessities.
#

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import json


USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [2]:
from google.colab import drive
drive.mount('/content/drive')

corpus_name = "movie-corpus"
corpus = os.path.join("/content/drive/MyDrive/data", corpus_name)

def printLines(file, n=10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)

printLines(os.path.join(corpus, "utterances.jsonl"))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
b'{"id": "L1045", "conversation_id": "L1044", "text": "They do not!", "speaker": "u0", "meta": {"movie_id": "m0", "parsed": [{"rt": 1, "toks": [{"tok": "They", "tag": "PRP", "dep": "nsubj", "up": 1, "dn": []}, {"tok": "do", "tag": "VBP", "dep": "ROOT", "dn": [0, 2, 3]}, {"tok": "not", "tag": "RB", "dep": "neg", "up": 1, "dn": []}, {"tok": "!", "tag": ".", "dep": "punct", "up": 1, "dn": []}]}]}, "reply-to": "L1044", "timestamp": null, "vectors": []}\n'
b'{"id": "L1044", "conversation_id": "L1044", "text": "They do to!", "speaker": "u2", "meta": {"movie_id": "m0", "parsed": [{"rt": 1, "toks": [{"tok": "They", "tag": "PRP", "dep": "nsubj", "up": 1, "dn": []}, {"tok": "do", "tag": "VBP", "dep": "ROOT", "dn": [0, 2, 3]}, {"tok": "to", "tag": "TO", "dep": "dobj", "up": 1, "dn": []}, {"tok": "!", "tag": ".", "dep": "punct", "up": 1, "dn": []}]}]}, "reply-to": null, 

In [3]:
# Splits each line of the file to create lines and conversations
def loadLinesAndConversations(fileName):
    lines = {}
    conversations = {}
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            lineJson = json.loads(line)
            # Extract fields for line object
            lineObj = {}
            lineObj["lineID"] = lineJson["id"]
            lineObj["characterID"] = lineJson["speaker"]
            lineObj["text"] = lineJson["text"]
            lines[lineObj['lineID']] = lineObj

            # Extract fields for conversation object
            if lineJson["conversation_id"] not in conversations:
                convObj = {}
                convObj["conversationID"] = lineJson["conversation_id"]
                convObj["movieID"] = lineJson["meta"]["movie_id"]
                convObj["lines"] = [lineObj]
            else:
                convObj = conversations[lineJson["conversation_id"]]
                convObj["lines"].insert(0, lineObj)
            conversations[convObj["conversationID"]] = convObj

    return lines, conversations


# Extracts pairs of sentences from conversations
def extractSentencePairs(conversations):
    qa_pairs = []
    for conversation in conversations.values():
        # Iterate over all the lines of the conversation
        for i in range(len(conversation["lines"]) - 1):  # We ignore the last line (no answer for it)
            inputLine = conversation["lines"][i]["text"].strip()
            targetLine = conversation["lines"][i+1]["text"].strip()
            # Filter wrong samples (if one of the lists is empty)
            if inputLine and targetLine:
                qa_pairs.append([inputLine, targetLine])
    return qa_pairs

In [4]:
# Define path to new file
datafile = os.path.join(corpus, "formatted_movie_lines.txt")

delimiter = '\t'
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Initialize lines dict and conversations dict
lines = {}
conversations = {}
# Load lines and conversations
print("\nProcessing corpus into lines and conversations...")
lines, conversations = loadLinesAndConversations(os.path.join(corpus, "utterances.jsonl"))

# Write new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)

# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)


Processing corpus into lines and conversations...

Writing newly formatted file...

Sample lines from file:
b'They do to!\tThey do not!\n'
b'She okay?\tI hope so.\n'
b"Wow\tLet's go.\n"
b'"I\'m kidding.  You know how sometimes you just become this ""persona""?  And you don\'t know how to quit?"\tNo\n'
b"No\tOkay -- you're gonna need to learn how to lie.\n"
b"I figured you'd get to the good stuff eventually.\tWhat good stuff?\n"
b'What good stuff?\t"The ""real you""."\n'
b'"The ""real you""."\tLike my fear of wearing pastels?\n'
b'do you listen to this crap?\tWhat crap?\n'
b"What crap?\tMe.  This endless ...blonde babble. I'm like, boring myself.\n"


In [5]:
# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens

        for word in keep_words:
            self.addWord(word)

In [6]:
MAX_LENGTH = 10  # Maximum sentence length to consider

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    print("Reading lines...")
    # Read the file and split into lines
    lines = open(datafile, encoding='utf-8').\
        read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs

# Returns True if both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

# Filter pairs using the ``filterPair`` condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs


# Load/Assemble voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

Start preparing training data ...
Reading lines...
Read 221282 sentence pairs
Trimmed to 64313 sentence pairs
Counting words...
Counted words: 18082

pairs:
['they do to !', 'they do not !']
['she okay ?', 'i hope so .']
['wow', 'let s go .']
['what good stuff ?', 'the real you .']
['the real you .', 'like my fear of wearing pastels ?']
['do you listen to this crap ?', 'what crap ?']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['have fun tonight ?', 'tons']


In [7]:
MIN_COUNT = 3    # Minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)
    # Filter out pairs with trimmed words
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        # Check input sentence
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input = False
                break
        # Check output sentence
        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break

        # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
        if keep_input and keep_output:
            keep_pairs.append(pair)

    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs


# Trim voc and pairs
pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 7833 / 18079 = 0.4333
Trimmed from 64313 pairs to 53131, 0.8261 of total


Prepare Data for Models

In [8]:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]


def zeroPadding(l, fillvalue=PAD_token):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

def binaryMatrix(l, value=PAD_token):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len


# Example for validation
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

input_variable: tensor([[ 128,  595,  109,    8,   19],
        [  14,  210,   24,   48,   10],
        [6710,    6, 1347,   10,    2],
        [  14,   36,   10,    2,    0],
        [ 254,   17,    2,    0,    0],
        [2428, 1319,    0,    0,    0],
        [  10,    6,    0,    0,    0],
        [   2,    2,    0,    0,    0]])
lengths: tensor([8, 8, 5, 4, 3])
target_variable: tensor([[ 104,  162,  280,  317,  829],
        [ 246,   85,   40,    8,  112],
        [ 135, 4592,    2,  531,   14],
        [  44,   14,    0, 1629,    2],
        [ 140,    2,    0,    8,    0],
        [  14,    0,    0,    4,    0],
        [   2,    0,    0,   14,    0],
        [   0,    0,    0,    2,    0]])
mask: tensor([[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True, False,  True,  True],
        [ True,  True, False,  True, False],
        [ True, False, False,  True, False],
        [ True, 

Define Models

In [9]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size parameters are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden

In [10]:
# Luong attention layer
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [11]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

Define Training Procedure

In [12]:
def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [13]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Lengths for RNN packing should always be on the CPU
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [14]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):

    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

Define Evaluation

In [15]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [16]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to("cpu")
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

Run the model

In [17]:
# Configure models
model_name = 'cb_model'
attn_model = 'dot'
#``attn_model = 'general'``
#``attn_model = 'concat'``
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000

In [18]:
# loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                     '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                     '{}_checkpoint.tar'.format(checkpoint_iter))
loadFilename = None

In [19]:
# Load model if a ``loadFilename`` is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


Run the training

Building optimizers ...


In [None]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# If you have CUDA, configure CUDA to call
for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1; Percent complete: 0.0%; Average loss: 3.6882
Iteration: 2; Percent complete: 0.1%; Average loss: 3.7704
Iteration: 3; Percent complete: 0.1%; Average loss: 3.9666
Iteration: 4; Percent complete: 0.1%; Average loss: 3.7935
Iteration: 5; Percent complete: 0.1%; Average loss: 3.8803
Iteration: 6; Percent complete: 0.1%; Average loss: 4.0122
Iteration: 7; Percent complete: 0.2%; Average loss: 3.9117
Iteration: 8; Percent complete: 0.2%; Average loss: 3.7307
Iteration: 9; Percent complete: 0.2%; Average loss: 3.7325
Iteration: 10; Percent complete: 0.2%; Average loss: 3.5104
Iteration: 11; Percent complete: 0.3%; Average loss: 3.5379
Iteration: 12; Percent complete: 0.3%; Average loss: 3.9373
Iteration: 13; Percent complete: 0.3%; Average loss: 3.9910
Iteration: 14; Percent complete: 0.4%; Average loss: 3.9903
Iteration: 15; Percent complete: 0.4%; Average loss: 3.7628
Iteration: 16; Percent complete: 0.4%

In [None]:
# Set dropout layers to ``eval`` mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting (uncomment and run the following line to begin)
evaluateInput(encoder, decoder, searcher, voc)

> hi
Bot: hi . . . . .
> hello
Bot: hello . . . hello .
> hey
Bot: hey . ? ? ? ?
> are you ok
Bot: i m sorry . ? ? ?
> how are you
Bot: i ll be back . you .
> i love movie
Bot: i love you . you know . .
> good move
Bot: i ll be right back . . .
> bye
Bot: bye . . . . .
> okay
Bot: okay . ? ? ? ?
> quit


### 2.2 & 2.3

In [None]:
!pip install wandb==0.16.5

In [22]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [23]:
sweep_config = {
    'method': 'random'
    }

In [24]:
metric = {
    'name': 'loss',
    'goal': 'minimize'
    }

sweep_config['metric'] = metric

In [25]:
parameters_dict = {
    'optimizer': {
        'values': ['adam', 'sgd']
        },
    'learning_rate':{
        'values':[0.0001, 0.00025, 0.0005, 0.001]
    },
    'clip':{
        'values':[0, 25, 50, 100]

    },
     'teacher_forcing_ratio':{
       'values':[0, 0.5, 1.0]
    },
    'decoder_learning_ratio':{
        'values':[1.0, 3.0, 5.0, 10.0]
    },

    }

sweep_config['parameters'] = parameters_dict

In [26]:
import pprint

pprint.pprint(sweep_config)

{'method': 'random',
 'metric': {'goal': 'minimize', 'name': 'loss'},
 'parameters': {'clip': {'values': [0, 25, 50, 100]},
                'decoder_learning_ratio': {'values': [1.0, 3.0, 5.0, 10.0]},
                'learning_rate': {'values': [0.0001, 0.00025, 0.0005, 0.001]},
                'optimizer': {'values': ['adam', 'sgd']},
                'teacher_forcing_ratio': {'values': [0, 0.5, 1.0]}}}


### 2.4

In [27]:
import torch
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
from torchvision import datasets, transforms

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_new(config=None):
    # Initialize a new wandb run
    with wandb.init(config=config):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        # loader = build_dataset(config.batch_size)
        # network = build_network(config.fc_layer_size, config.dropout)
        # optimizer = build_optimizer(network, config.optimizer, config.learning_rate)

        optimizer = config.optimizer
        learning_rate = config.learning_rate
        clip = config.clip
        decoder_learning_ratio = config.decoder_learning_ratio
        teacher_forcing_ratio = config.teacher_forcing_ratio

        # for epoch in range(config.epochs):
        #     avg_loss = train_epoch(network, loader, optimizer)
        #     wandb.log({"loss": avg_loss, "epoch": epoch})

        # Configure models
        model_name = 'cb_model'
        attn_model = 'dot'
        #``attn_model = 'general'``
        #``attn_model = 'concat'``
        hidden_size = 500
        encoder_n_layers = 2
        decoder_n_layers = 2
        dropout = 0.1
        batch_size = 64

        # Set checkpoint to load from; set to None if starting from scratch
        loadFilename = None
        checkpoint_iter = 4000

        if loadFilename:
            # If loading on same machine the model was trained on
            checkpoint = torch.load(loadFilename)
            # If loading a model trained on GPU to CPU
            #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
            encoder_sd = checkpoint['en']
            decoder_sd = checkpoint['de']
            encoder_optimizer_sd = checkpoint['en_opt']
            decoder_optimizer_sd = checkpoint['de_opt']
            embedding_sd = checkpoint['embedding']
            voc.__dict__ = checkpoint['voc_dict']


        print('Building encoder and decoder ...')
        # Initialize word embeddings
        embedding = nn.Embedding(voc.num_words, hidden_size)
        if loadFilename:
            embedding.load_state_dict(embedding_sd)
        # Initialize encoder & decoder models
        encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
        decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
        if loadFilename:
            encoder.load_state_dict(encoder_sd)
            decoder.load_state_dict(decoder_sd)
        # Use appropriate device
        encoder = encoder.to(device)
        decoder = decoder.to(device)
        print('Models built and ready to go!')

        # Configure training/optimization
        # clip = 50.0
        # teacher_forcing_ratio = 1.0
        # learning_rate = 0.0001
        # decoder_learning_ratio = 5.0
        n_iteration = 4000
        print_every = 1
        save_every = 500

        # Ensure dropout layers are in train mode
        encoder.train()
        decoder.train()

        # Initialize optimizers
        print('Building optimizers ...')
        encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
        decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
        if loadFilename:
            encoder_optimizer.load_state_dict(encoder_optimizer_sd)
            decoder_optimizer.load_state_dict(decoder_optimizer_sd)

        # If you have CUDA, configure CUDA to call
        for state in encoder_optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.cuda()

        for state in decoder_optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.cuda()

        # Run training iterations
        print("Starting Training!")
        # trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
        #           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
        #           print_every, save_every, clip, corpus_name, loadFilename)
        # use customized loop
        # Training loop
        print("Training...")
        for iteration in range(1, n_iteration + 1):
            training_batch = batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
            input_variable, lengths, target_variable, mask, max_target_len = training_batch

            loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                         decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
            print_loss = loss

            wandb.log({"loss": print_loss, "iteration": iteration})

            if iteration % print_every == 0:
                print_loss_avg = print_loss / print_every
                print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(
                    iteration, iteration / n_iteration * 100, print_loss_avg))

            if iteration % save_every == 0:
                directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(
                    encoder_n_layers, decoder_n_layers, hidden_size))
                if not os.path.exists(directory):
                    os.makedirs(directory)
                torch.save({
                    'iteration': iteration,
                    'en': encoder.state_dict(),
                    'de': decoder.state_dict(),
                    'en_opt': encoder_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                    'loss': loss,
                    'voc_dict': voc.__dict__,
                    'embedding': embedding.state_dict()
                }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))
    wandb.finish()

In [28]:
sweep_id = wandb.sweep(sweep_config, project="yzhong5")

Create sweep with ID: triyrjye
Sweep URL: https://wandb.ai/yzhong/yzhong5/sweeps/triyrjye


In [30]:
wandb.agent(sweep_id, train_new, count=5)

[34m[1mwandb[0m: Agent Starting Run: pnsyt7o5 with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_learning_ratio: 1
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Training...
Iteration: 1; Percent complete: 0.0%; Average loss: 8.9875
Iteration: 2; Percent complete: 0.1%; Average loss: 8.6982
Iteration: 3; Percent complete: 0.1%; Average loss: 7.7635
Iteration: 4; Percent complete: 0.1%; Average loss: 6.7982
Iteration: 5; Percent complete: 0.1%; Average loss: 6.7889
Iteration: 6; Percent complete: 0.1%; Average loss: 6.0773
Iteration: 7; Percent complete: 0.2%; Average loss: 5.7678
Iteration: 8; Percent complete: 0.2%; Average loss: 5.5397
Iteration: 9; Percent complete: 0.2%; Average loss: 5.4165
Iteration: 10; Percent complete: 0.2%; Average loss: 4.9601
Iteration: 11; Percent complete: 0.3%; Average loss: 5.1818
Iteration: 12; Percent complete: 0.3%; Average loss: 5.2148
Iteration: 13; Percent complete: 0.3%; Average loss: 4.8280
Iteration: 14; Percent complete: 0.4%; Average loss: 5.0350
Iteration: 15; Percent complete: 0.4%; Average loss

VBox(children=(Label(value='0.250 MB of 0.250 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,██▇▆▅▆▄▅▄▄▄▅▄▄▄▃▄▄▃▃▄▂▃▃▃▃▃▂▃▂▂▂▂▂▁▂▂▁▁▁

0,1
iteration,4000.0
loss,2.43595


[34m[1mwandb[0m: Agent Starting Run: q2mb1m8a with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_learning_ratio: 1
[34m[1mwandb[0m: 	learning_rate: 0.00025
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Training...
Iteration: 1; Percent complete: 0.0%; Average loss: 8.9581
Iteration: 2; Percent complete: 0.1%; Average loss: 8.8688
Iteration: 3; Percent complete: 0.1%; Average loss: 8.6987
Iteration: 4; Percent complete: 0.1%; Average loss: 8.4805
Iteration: 5; Percent complete: 0.1%; Average loss: 8.1884
Iteration: 6; Percent complete: 0.1%; Average loss: 7.8015
Iteration: 7; Percent complete: 0.2%; Average loss: 7.5835
Iteration: 8; Percent complete: 0.2%; Average loss: 6.9703
Iteration: 9; Percent complete: 0.2%; Average loss: 7.0004
Iteration: 10; Percent complete: 0.2%; Average loss: 6.9972
Iteration: 11; Percent complete: 0.3%; Average loss: 6.6916
Iteration: 12; Percent complete: 0.3%; Average loss: 6.6059
Iteration: 13; Percent complete: 0.3%; Average loss: 6.3023
Iteration: 14; Percent complete: 0.4%; Average loss: 6.2426
Iteration: 15; Percent complete: 0.4%; Average loss

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▆▆▆▅▅▄▅▄▄▃▄▃▃▃▄▃▄▂▃▃▃▃▂▂▃▃▃▂▂▂▂▁▁▂▂▂▂▁▁

0,1
iteration,4000.0
loss,2.8614


[34m[1mwandb[0m: Agent Starting Run: wl7lk8xw with config:
[34m[1mwandb[0m: 	clip: 25
[34m[1mwandb[0m: 	decoder_learning_ratio: 10
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Training...
Iteration: 1; Percent complete: 0.0%; Average loss: 8.9628
Iteration: 2; Percent complete: 0.1%; Average loss: 8.7404
Iteration: 3; Percent complete: 0.1%; Average loss: 8.3520
Iteration: 4; Percent complete: 0.1%; Average loss: 7.5106
Iteration: 5; Percent complete: 0.1%; Average loss: 6.7698
Iteration: 6; Percent complete: 0.1%; Average loss: 6.8245
Iteration: 7; Percent complete: 0.2%; Average loss: 6.3361
Iteration: 8; Percent complete: 0.2%; Average loss: 6.1571
Iteration: 9; Percent complete: 0.2%; Average loss: 5.9400
Iteration: 10; Percent complete: 0.2%; Average loss: 5.8725
Iteration: 11; Percent complete: 0.3%; Average loss: 5.8098
Iteration: 12; Percent complete: 0.3%; Average loss: 5.3917
Iteration: 13; Percent complete: 0.3%; Average loss: 5.0218
Iteration: 14; Percent complete: 0.4%; Average loss: 4.8639
Iteration: 15; Percent complete: 0.4%; Average loss

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▆▆▅▅▆▅▄▄▄▄▄▄▅▃▅▃▄▃▄▃▄▃▃▃▃▃▂▃▂▂▃▂▂▂▁▂▂▂▂

0,1
iteration,4000.0
loss,2.2918


[34m[1mwandb[0m: Agent Starting Run: 89cogy5z with config:
[34m[1mwandb[0m: 	clip: 0
[34m[1mwandb[0m: 	decoder_learning_ratio: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Training...
Iteration: 1; Percent complete: 0.0%; Average loss: 8.9705
Iteration: 2; Percent complete: 0.1%; Average loss: 8.9749
Iteration: 3; Percent complete: 0.1%; Average loss: 8.9710
Iteration: 4; Percent complete: 0.1%; Average loss: 8.9681
Iteration: 5; Percent complete: 0.1%; Average loss: 8.9699
Iteration: 6; Percent complete: 0.1%; Average loss: 8.9733
Iteration: 7; Percent complete: 0.2%; Average loss: 8.9706
Iteration: 8; Percent complete: 0.2%; Average loss: 8.9741
Iteration: 9; Percent complete: 0.2%; Average loss: 8.9687
Iteration: 10; Percent complete: 0.2%; Average loss: 8.9673
Iteration: 11; Percent complete: 0.3%; Average loss: 8.9663
Iteration: 12; Percent complete: 0.3%; Average loss: 8.9738
Iteration: 13; Percent complete: 0.3%; Average loss: 8.9715
Iteration: 14; Percent complete: 0.4%; Average loss: 8.9700
Iteration: 15; Percent complete: 0.4%; Average loss

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,▃▄▅▇▁▅▄▄▇▂█▄█▅▅▃▆▆▅▆▄▄▄▆▄▄▂▅▅▁▅▅▄▄▅▆▅▄▃▅

0,1
iteration,4000.0
loss,8.97204


[34m[1mwandb[0m: Agent Starting Run: gpa8w8jz with config:
[34m[1mwandb[0m: 	clip: 0
[34m[1mwandb[0m: 	decoder_learning_ratio: 1
[34m[1mwandb[0m: 	learning_rate: 0.00025
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Training...
Iteration: 1; Percent complete: 0.0%; Average loss: 8.9876
Iteration: 2; Percent complete: 0.1%; Average loss: 8.9863
Iteration: 3; Percent complete: 0.1%; Average loss: 8.9885
Iteration: 4; Percent complete: 0.1%; Average loss: 8.9785
Iteration: 5; Percent complete: 0.1%; Average loss: 8.9815
Iteration: 6; Percent complete: 0.1%; Average loss: 8.9835
Iteration: 7; Percent complete: 0.2%; Average loss: 8.9827
Iteration: 8; Percent complete: 0.2%; Average loss: 8.9876
Iteration: 9; Percent complete: 0.2%; Average loss: 8.9871
Iteration: 10; Percent complete: 0.2%; Average loss: 8.9839
Iteration: 11; Percent complete: 0.3%; Average loss: 8.9843
Iteration: 12; Percent complete: 0.3%; Average loss: 8.9865
Iteration: 13; Percent complete: 0.3%; Average loss: 8.9829
Iteration: 14; Percent complete: 0.4%; Average loss: 8.9838
Iteration: 15; Percent complete: 0.4%; Average loss

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,▃▁▅▆▃▅▄▃▆▅█▂▄▅▅▄▄▅▃▃▅▂▄▃▄▄▅▅▃▃▄▃▃▃▅▃▃▃▂▅

0,1
iteration,4000.0
loss,8.98338


### 2.5

In [36]:
print(sweep_id)

triyrjye


In [37]:
api = wandb.Api()

project = 'yzhong5/triyrjye'
sweep = api.sweep(f"{project}")
runs = sorted(sweep.runs, key=lambda r: r.summary.get("loss", float("inf")))
best_run = runs[0]
best_hyperparameters = best_run.config
print(best_hyperparameters)
print("Best job id is", best_run.id)

{'clip': 50, 'optimizer': 'sgd', 'learning_rate': 0.0005, 'teacher_forcing_ratio': 0.5, 'decoder_learning_ratio': 3}
Best job id is lt0u1bab


In [38]:
wandb.init()
wandb.save(f'yzhong/lt0u1bab"')

[]

In [None]:
# show img in drive
from IPython.display import Image
Image(filename='/content/W&B Chart 4_7_2024, 11_20_44 PM.png')

In [None]:
Image(filename='/content/W&B Chart 4_7_2024, 11_23_14 PM.png')

The model with lowest loss has been saved as indicated above. 'clip': 50, 'optimizer': 'sgd', 'learning_rate': 0.0005, 'teacher_forcing_ratio': 0.5, 'decoder_learning_ratio': 3

Gradient clipping prevents gradient explosion, aiding in stability, while Adam optimizes the learning process with its adaptive learning rate adjustments. The absence of teacher forcing suggests a model that learns without relying too heavily on previous true outputs, aiming for better generalization. Lastly, the high decoder learning ratio points towards aggressive updates for the decoder, potentially to address more complex decoding tasks.



## Problem 3

### 3.1

Three in total.

Query vector (Q): query vector compute the similarity or attention score with respect to all other input vectors. It represents the relevance of the current input vector to all other input vectors.

Key vector (K): used to compute the similarity or attention score with the query vector. It represents the importance or relevance of each input vector to the current query vector.

Value vector (V): represents the actual content or meaning of each input vector. It compute the weighted sum of the input vectors based on the attention scores obtained from the query and key vectors.

(Q, K, and V) are obtained by multiplying the input vector with three learnable weight matrices (WQ, WK, and WV) respectively

### 3.2

$$\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

where:

$Q$ is the query matrix of shape $(n_q, d_k)$,
$K$ is the key matrix of shape $(n_k, d_k)$,
$V$ is the value matrix of shape $(n_k, d_v)$,
$n_q$ is the number of query vectors,
$n_k$ is the number of key/value vectors,
$d_k$ is the dimension of the key vectors,
$d_v$ is the dimension of the value vectors,

The attention scores are computed by taking the dot product between the query matrix (Q) and the transpose of the key matrix (K), scaled by a factor of $\frac{1}{\sqrt{d_k}}$. The softmax function is then applied to the scaled attention scores to obtain the attention weights. Finally, the attention weights are multiplied with the value matrix (V) to generate the self-attention output.



### 3.3


the total number of weight matrices needed:

Each head requires 3 weight matrices (WQ, WK, WV)

There are 8 heads in total.

Therefore, the total number of weight matrices across all heads is: 3 * 8 = 24

In total, there are 24 weight matrices (3 per head * 8 heads) to learn in the multi-head attention mechanism.

If an output matrix is used, the total number of weight matrices would be 32 =24 + 8.

The size of the input (e.g., 4 word embeddings) does not change the dimensions of the weight matrices, which remain 512 * 512.

### 3.4

Here's the process:

Concatenate the outputs from all attention heads:
Each head produces an output matrix of size (sequence_length, head_size)

Concatenate these matrices along the depth dimension, resulting in a matrix of size (sequence_length, num_heads * head_size)


Apply a linear transformation:
Multiply the concatenated matrix by a learned weight matrix W_O of size (num_heads * head_size, model_size)

This linear transformation maps the concatenated output to the desired input size for the feed-forward layer
Mathematically, it can be represented as:

$$
\text{FF input} = \text{Concat}(\text{head}_1, \text{head}_2, ..., \text{head}_n) \times W_O
$$

where:

FF_input is the input matrix for the feed-forward layer
${head}_i$ is the output matrix from the i-th attention head
$Concat$ is the concatenation operation along the depth dimension
$W_O$ is the learned weight matrix for the linear transformation
By concatenating the outputs from all heads and applying a linear transformation, we effectively combine the information from different attention heads and project it to the desired input size for the feed-forward layer.


## Problem 4

### 4.1

In [None]:
# Install the transformers library that will be used for BERT models.
!pip install transformers



In [None]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

#Get the pretrained 'bert-large-uncased-whole-word-masking-finetuned-squad' model from the BertForQuestionAnswering library
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')



tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
question = "What was BERT trained on?"

paragraph = "BERT stands for Bidirectional Encoder Representation of Transformer. I feel that its name itself is descriptive enough to get the gist. Still, to understand it better, it’s encoder part of the encoder-decoder transformer model, it’s also bidirectional in nature, which means that for any input it’s able to learn dependencies from both left and right of any word. It was trained on Wikipedia text and BooksCorpus and open-sourced back in 2018 by Google. You can find the official repository and paper at Github: BERT and BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. There are two models introduced in the paper. BERT base — 12 layers (transformer blocks), 110 million parameters. BERT Large — 24 layers, 340 million parameters. Later google also released Multi-lingual BERT to accelerate the research"

### 4.2

Use the encode_plus function. Define the text parameter as the question, and the text_pair as the paragraph.

You can refer to: https://huggingface.co/docs/transformers/v4.19.0/en/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__

In [None]:
encoding = tokenizer.encode_plus(text= question , text_pair= paragraph, add_special_tokens=True)

### 4.3

The encoding is a dictionary with multiple keys. Your task is to identify which keys will be used for the inputs and which will be used for the segment embeddings.

In [None]:
print(encoding.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [None]:


inputs = encoding['input_ids']  #Token embeddings

sentence_embedding = encoding['token_type_ids'] #Segment embeddings


# we convert the input ids to tokens
tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens

In [None]:
scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))
print(scores)

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-5.4397, -5.1747, -8.2072, -8.1577, -7.4659, -6.3724, -9.4946, -5.4397,
         -1.1487, -5.9532, -7.5790, -2.0921, -7.5579, -7.1692, -7.1343, -4.7308,
         -6.7858, -7.5462, -5.3032, -7.8723, -5.3115, -7.4651, -7.9509, -6.8903,
         -7.9474, -8.6392, -6.2064, -7.5566, -8.5925, -8.2583, -6.7350, -8.4192,
         -8.4794, -7.7465, -8.4259, -7.5299, -8.5404, -9.1108, -7.8087, -8.6896,
         -7.1670, -7.7759, -8.2495, -8.5528, -8.7607, -5.9098, -8.4287, -8.4879,
         -6.4561, -7.5364, -8.4136, -6.9562, -8.3993, -6.9945, -4.6695, -6.9653,
         -7.7577, -7.9943, -5.2502, -7.7105, -5.6726, -8.0013, -5.9587, -8.3135,
         -6.2524, -8.2741, -8.4364, -8.1030, -3.8497, -8.1456, -8.0798, -8.2123,
         -8.8444, -7.8705, -8.4785, -7.6194, -7.5587, -8.2160, -6.8581, -6.2672,
         -5.9076, -6.9228, -8.1743, -8.5033, -6.8941, -7.6130, -6.1818, -6.1809,
         -7.9184, -8.4508, -7.5274, -7.6268, -8.8922, -7

### 4.4

Now we have start scores and end scores we can get both the start index and the end index and use both the indices for span prediction.

In [None]:
start_index = torch.argmax(scores.start_logits)

end_index = torch.argmax(scores.end_logits)


if end_index >= start_index:
    get = " ".join(tokens[start_index:end_index+1])
else:
    print("I am unable to find the answer to this question. Can you please ask another question?")

### 4.5

In [None]:
print(get)

wikipedia text and books ##corp ##us


### 4.6

Yes, we can see some unusual tokens in the answer, like a tag in some online posts. This is because of the bert spliting the word in a way that being mark by #. After some research, BERT uses a method called WordPiece tokenization. Less common words or parts of words are broken down into smaller pieces, with subsequent pieces in a word prefixed by "##" to indicate that they are not standalone tokens but parts of a larger word

## Problem 5

### 5.1

In [None]:
!pip install h2o

Collecting h2o
  Downloading h2o-3.46.0.1-py2.py3-none-any.whl (265.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.6/265.6 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h2o
Successfully installed h2o-3.46.0.1


#### 5.1.a


In [None]:
import h2o
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators import H2ORandomForestEstimator
from h2o.automl import H2OAutoML

In [None]:
# Initialize H2O
h2o.init()

# Import the dataset
data_path = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip"
airlines = h2o.import_file(data_path)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.22" 2024-01-16; OpenJDK Runtime Environment (build 11.0.22+7-post-Ubuntu-0ubuntu222.04.1); OpenJDK 64-Bit Server VM (build 11.0.22+7-post-Ubuntu-0ubuntu222.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpj1bbl43o
  JVM stdout: /tmp/tmpj1bbl43o/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpj1bbl43o/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,25 days
H2O_cluster_name:,H2O_from_python_unknownUser_gz5bam
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [None]:
# Split the dataset
train, valid, test = airlines.split_frame(ratios=[0.7, 0.15], seed=123)


features = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]


# Identify predictor and response columns
predictors = features
response = "IsDepDelayed"


# if it's categorical
train[response] = train[response].asfactor()
valid[response] = valid[response].asfactor()
test[response] = test[response].asfactor()


hyper_params = {'ntrees': [10, 30, 50, 100], 'max_depth': [1, 2, 4, 6]}

rf = H2ORandomForestEstimator()

# Perform grid search
grid = H2OGridSearch(model=rf, hyper_params=hyper_params)

grid.train(x=predictors, y=response, training_frame=train, validation_frame=valid)



drf Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,max_depth,ntrees,model_ids,logloss
,6.0,30.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_8,0.6120569
,6.0,50.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_12,0.6122059
,6.0,100.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_16,0.6132804
,6.0,10.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_4,0.6169152
,4.0,10.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_3,0.629298
,4.0,30.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_7,0.6298912
,4.0,50.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_11,0.6311316
,4.0,100.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_15,0.6323539
,2.0,10.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_2,0.655363
,2.0,30.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_6,0.6555319


#### 5.1.b


In [None]:
results = grid.get_grid(sort_by='accuracy', decreasing=True)
print(results)


Hyper-Parameter Search Summary: ordered by decreasing accuracy
    max_depth    ntrees    model_ids                                                      accuracy
--  -----------  --------  -------------------------------------------------------------  ----------
    6            100       Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_16  0.675042
    6            30        Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_8   0.673655
    6            50        Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_12  0.672268
    6            10        Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_4   0.668568
    4            50        Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_11  0.661631
    4            30        Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_7   0.65824
    4            10        Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_3   0.658085
    4            100       Grid_DRF_py_14_sid

#### 5.1.c


In [None]:
print("Best model \n", results[0])

Best model 
 Model Details
H2ORandomForestEstimator : Distributed Random Forest
Model Key: Grid_DRF_py_14_sid_a062_model_python_1712528756749_1_model_16


Model Summary: 
    number_of_trees    number_of_internal_trees    model_size_in_bytes    min_depth    max_depth    mean_depth    min_leaves    max_leaves    mean_leaves
--  -----------------  --------------------------  ---------------------  -----------  -----------  ------------  ------------  ------------  -------------
    100                100                         121804                 6            6            6             33            64            57.27

ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.21549765419129988
RMSE: 0.4642172489161727
LogLoss: 0.6205170159253806
Mean Per-Class Error: 0.3966984387177883
AUC: 0.7177818232532792
AUCPR: 0.731945991053329
Gini: 0.43556364650655843

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3884473291111462
       NO    YES    Error    Rate
-----  ---- 

In [None]:
best_perf = results.models[0].model_performance(test)
print(f"Best Model is : {best_perf}")


Best Model is : ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.21549982034319615
RMSE: 0.4642195820333263
LogLoss: 0.620668590616193
Mean Per-Class Error: 0.40745803165412287
AUC: 0.7180331867526882
AUCPR: 0.7274701142205402
Gini: 0.43606637350537647

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3681556656118483
       NO    YES    Error    Rate
-----  ----  -----  -------  ---------------
NO     809   2334   0.7426   (2334.0/3143.0)
YES    251   3220   0.0723   (251.0/3471.0)
Total  1060  5554   0.3908   (2585.0/6614.0)

Maximum Metrics: Maximum metrics at their respective thresholds
metric                       threshold    value     idx
---------------------------  -----------  --------  -----
max f1                       0.368156     0.713573  318
max f2                       0.256804     0.847636  388
max f0point5                 0.505467     0.683879  221
max accuracy                 0.505467     0.66964   221
max precision                0.944199     1 

In [None]:
print(f"AUC score is : {best_perf.auc()}")

AUC score is : 0.7180331867526882


The evaluation and auc score is displayed above, the auc score is 0.718

### 5.2

#### 5.2.a

In [None]:
search = {"strategy": "RandomDiscrete", "max_models":10, "seed":123}

random_grid = H2OGridSearch(model=H2ORandomForestEstimator(seed = 123), hyper_params=hyper_params, search_criteria=search)
random_grid.train(x=predictors, y=response, training_frame=train, validation_frame=valid)

drf Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,max_depth,ntrees,model_ids,logloss
,6.0,30.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_2,0.6123707
,6.0,50.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_5,0.6124692
,4.0,30.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_4,0.6301777
,4.0,10.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_1,0.6336924
,2.0,30.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_10,0.6550326
,2.0,100.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_9,0.6551204
,2.0,50.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_3,0.6553098
,1.0,10.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_7,0.6707944
,1.0,50.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_6,0.6723072
,1.0,30.0,Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_8,0.6726622


#### 5.2.b

In [None]:
results_2 = random_grid.get_grid(sort_by='accuracy', decreasing=True)
print(results_2)

Hyper-Parameter Search Summary: ordered by decreasing accuracy
    max_depth    ntrees    model_ids                                                         accuracy
--  -----------  --------  ----------------------------------------------------------------  ----------
    6            50        Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_5   0.674117
    6            30        Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_2   0.672268
    4            30        Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_4   0.660398
    4            10        Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_1   0.651765
    2            100       Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_9   0.643441
    2            30        Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_10  0.639741
    2            50        Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_3   0.63712
    1            3

#### 5.2.c

In [None]:
print("Best model \n", results_2[0])

Best model 
 Model Details
H2ORandomForestEstimator : Distributed Random Forest
Model Key: Grid_DRF_py_14_sid_a062_model_python_1712528756749_1534_model_5


Model Summary: 
    number_of_trees    number_of_internal_trees    model_size_in_bytes    min_depth    max_depth    mean_depth    min_leaves    max_leaves    mean_leaves
--  -----------------  --------------------------  ---------------------  -----------  -----------  ------------  ------------  ------------  -------------
    50                 50                          62638                  6            6            6             44            64            58.18

ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.2156547988348092
RMSE: 0.4643864757234099
LogLoss: 0.6205834848641933
Mean Per-Class Error: 0.40152953865489516
AUC: 0.7154114936655792
AUCPR: 0.7305922654796403
Gini: 0.4308229873311584

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3809492685379116
       NO    YES    Error    Rate
-----  ---

In [None]:
best_perf_2 = results_2.models[0].model_performance(test)
print(f"Best Model is : {best_perf_2}")


Best Model is : ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.21492272591836079
RMSE: 0.4635975905010301
LogLoss: 0.6192905629017454
Mean Per-Class Error: 0.3810010547829922
AUC: 0.7192865149748111
AUCPR: 0.7295473056366483
Gini: 0.4385730299496222

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.397837008368224
       NO    YES    Error    Rate
-----  ----  -----  -------  ---------------
NO     1122  2021   0.643    (2021.0/3143.0)
YES    413   3058   0.119    (413.0/3471.0)
Total  1535  5079   0.368    (2434.0/6614.0)

Maximum Metrics: Maximum metrics at their respective thresholds
metric                       threshold    value     idx
---------------------------  -----------  --------  -----
max f1                       0.397837     0.715322  297
max f2                       0.23517      0.847507  388
max f0point5                 0.513821     0.682656  218
max accuracy                 0.502835     0.666767  223
max precision                0.944532     1   

In [None]:
print(f"AUC score is : {best_perf_2.auc()}")

AUC score is : 0.7192865149748111


The auc score is 0.719

### 5.3

#### 5.3.a

In [None]:
from h2o.automl import H2OAutoML

automl = H2OAutoML(max_models=20, seed=123)
automl.train(x=features, y=response, training_frame=train, validation_frame=valid)

AutoML progress: |
22:42:18.916: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.

███████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),7/20
# GBM base models (used / total),1/7
# XGBoost base models (used / total),5/6
# DRF base models (used / total),1/2
# DeepLearning base models (used / total),0/4
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Unnamed: 0,NO,YES,Error,Rate
NO,2784.0,1995.0,0.4175,(1995.0/4779.0)
YES,816.0,4476.0,0.1542,(816.0/5292.0)
Total,3600.0,6471.0,0.2791,(2811.0/10071.0)

metric,threshold,value,idx
max f1,0.420343,0.7610303,255.0
max f2,0.2527009,0.8590939,335.0
max f0point5,0.5788678,0.7561141,174.0
max accuracy,0.5127852,0.7349816,210.0
max precision,0.9704279,1.0,0.0
max recall,0.1092777,1.0,393.0
max specificity,0.9704279,1.0,0.0
max absolute_mcc,0.5127852,0.469793,210.0
max min_per_class_accuracy,0.5111018,0.733938,211.0
max mean_per_class_accuracy,0.5127852,0.7351695,210.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100288,0.9304943,1.9030612,1.9030612,1.0,0.94916,1.0,0.94916,0.0190854,0.0190854,90.3061224,90.3061224,0.0190854
2,0.0200576,0.9143429,1.884219,1.8936401,0.990099,0.9214788,0.9950495,0.9353194,0.0188964,0.0379819,88.4219034,89.3640129,0.0377726
3,0.030285,0.8973655,1.8291559,1.8718635,0.961165,0.904715,0.9836066,0.9249842,0.0187075,0.0566893,82.9155934,87.1863499,0.0556431
4,0.0400159,0.8862236,1.8836422,1.8747278,0.9897959,0.8917741,0.9851117,0.9169083,0.0183296,0.0750189,88.3642232,87.4727807,0.0737634
5,0.0500447,0.8756077,1.7711659,1.8539743,0.9306931,0.8807348,0.9742063,0.9096592,0.0177627,0.0927816,77.1165892,85.3974328,0.0900613
6,0.1000894,0.823546,1.7558005,1.8048874,0.922619,0.849734,0.9484127,0.8796966,0.0878685,0.18065,75.5800534,80.4887431,0.1697691
7,0.1500348,0.7826962,1.6193046,1.7431084,0.8508946,0.8014003,0.9159497,0.8536324,0.0808768,0.2615268,61.9304581,74.3108362,0.2349522
8,0.2000794,0.7436489,1.5858844,1.7037829,0.8333333,0.7628517,0.8952854,0.830926,0.0793651,0.3408919,58.5884354,70.3782853,0.2967404
9,0.3000695,0.6655885,1.4381625,1.615272,0.75571,0.7034058,0.8487756,0.7884333,0.143802,0.4846939,43.8162455,61.5272019,0.3890672
10,0.4002582,0.5894965,1.2485892,1.5234876,0.6560951,0.6274783,0.8005458,0.7481446,0.1250945,0.6097884,24.8589228,52.3487614,0.4415523

Unnamed: 0,NO,YES,Error,Rate
NO,1538.0,1567.0,0.5047,(1567.0/3105.0)
YES,536.0,2846.0,0.1585,(536.0/3382.0)
Total,2074.0,4413.0,0.3242,(2103.0/6487.0)

metric,threshold,value,idx
max f1,0.3977623,0.7302117,269.0
max f2,0.2027684,0.8495964,358.0
max f0point5,0.5761381,0.7194245,173.0
max accuracy,0.5334838,0.6955449,196.0
max precision,0.974855,1.0,0.0
max recall,0.1113941,1.0,392.0
max specificity,0.974855,1.0,0.0
max absolute_mcc,0.5727741,0.3965379,175.0
max min_per_class_accuracy,0.5124321,0.6936724,208.0
max mean_per_class_accuracy,0.5334838,0.6969342,196.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.01002,0.9295804,1.8590775,1.8590775,0.9692308,0.9465412,0.9692308,0.9465412,0.018628,0.018628,85.9077469,85.9077469,0.0179839
2,0.0201942,0.9130892,1.7727855,1.8156021,0.9242424,0.9198943,0.9465649,0.933116,0.0180367,0.0366647,77.2785513,81.5602133,0.0344103
3,0.0300601,0.8958571,1.7682446,1.8000591,0.921875,0.9038263,0.9384615,0.923503,0.0174453,0.05411,76.8244567,80.0059137,0.0502453
4,0.0400802,0.8837359,1.6525133,1.7631727,0.8615385,0.889643,0.9192308,0.915038,0.0165582,0.0706682,65.2513306,76.3172679,0.063905
5,0.0501002,0.8735067,1.8295683,1.7764518,0.9538462,0.878851,0.9261538,0.9078006,0.0183323,0.0890006,82.9568303,77.6451804,0.0812711
6,0.1000462,0.8220761,1.6457736,1.7112134,0.8580247,0.848648,0.8921418,0.8782699,0.0821999,0.1712005,64.5773558,71.1213357,0.1486562
7,0.1499923,0.7803718,1.6043332,1.6756233,0.8364198,0.7999777,0.8735868,0.8521993,0.0801301,0.2513306,60.4333217,67.5623259,0.211717
8,0.2000925,0.7417836,1.4282436,1.6136831,0.7446154,0.7601211,0.8412943,0.8291443,0.0715553,0.3228859,42.8243643,61.3683062,0.2565413
9,0.2999846,0.6666591,1.3941715,1.5405877,0.7268519,0.704359,0.803186,0.7875919,0.1392667,0.4621526,39.4171485,54.0587738,0.3388031
10,0.4000308,0.5863097,1.2206064,1.4605616,0.6363636,0.6280983,0.7614644,0.7477032,0.1221171,0.5842697,22.0606419,46.0561581,0.3849138

Unnamed: 0,NO,YES,Error,Rate
NO,5395.0,9244.0,0.6315,(9244.0/14639.0)
YES,1726.0,14512.0,0.1063,(1726.0/16238.0)
Total,7121.0,23756.0,0.3553,(10970.0/30877.0)

metric,threshold,value,idx
max f1,0.3391154,0.7257089,297.0
max f2,0.1802266,0.8490596,370.0
max f0point5,0.5463061,0.7073916,192.0
max accuracy,0.491186,0.6857208,221.0
max precision,0.9688223,1.0,0.0
max recall,0.0763103,1.0,399.0
max specificity,0.9688223,1.0,0.0
max absolute_mcc,0.5463061,0.3731922,192.0
max min_per_class_accuracy,0.5121193,0.6841997,210.0
max mean_per_class_accuracy,0.5254686,0.6866068,203.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100074,0.9298561,1.8276816,1.8276816,0.961165,0.945428,0.961165,0.945428,0.0182904,0.0182904,82.7681562,82.7681562,0.0174707
2,0.0200149,0.9096283,1.747682,1.7876818,0.9190939,0.919895,0.9401294,0.9326615,0.0174898,0.0357803,74.7682032,78.7681797,0.0332528
3,0.0300223,0.8961149,1.6984516,1.7579384,0.8932039,0.9027702,0.9244876,0.9226977,0.0169972,0.0527774,69.8451553,75.7938382,0.0479957
4,0.0400298,0.8848252,1.6799901,1.7384513,0.8834951,0.8904593,0.9142395,0.9146381,0.0168124,0.0695899,67.9990123,73.8451317,0.0623489
5,0.0500049,0.8727467,1.6730971,1.7254143,0.8798701,0.8784456,0.9073834,0.9074184,0.0166892,0.0862791,67.3097056,72.5414327,0.0765107
6,0.1000097,0.8224486,1.5899428,1.6576786,0.8361399,0.847069,0.8717617,0.8772437,0.0795049,0.165784,58.9942824,65.7678576,0.1387329
7,0.1500146,0.778119,1.5234386,1.6129319,0.8011658,0.7999816,0.8482297,0.8514896,0.0761793,0.2419633,52.3438632,61.2931928,0.1939409
8,0.2000194,0.7401551,1.4446188,1.5708537,0.759715,0.7582518,0.826101,0.8281802,0.072238,0.3142013,44.4618848,57.0853658,0.2408356
9,0.2999968,0.6658768,1.3563858,1.4993798,0.7133139,0.7024263,0.7885134,0.7862713,0.1356078,0.4498091,35.6385836,49.9379819,0.3159885
10,0.4000389,0.59024,1.2102307,1.4270691,0.6364519,0.6276335,0.7504858,0.746599,0.121074,0.5708831,21.0230701,42.7069131,0.3603496

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.6430497,0.0131028,0.6522514,0.6321763,0.647613,0.6264573,0.6567502
aic,7333.6743,71.592224,7313.7935,7241.367,7422.1733,7387.1924,7303.846
auc,0.7480907,0.0038784,0.7497125,0.7454857,0.7502742,0.7427333,0.7522478
err,0.3569504,0.0131028,0.3477486,0.3678237,0.3523870,0.3735428,0.3432498
err_count,2204.0,73.232506,2147.0,2236.0,2207.0,2307.0,2123.0
f0point5,0.6509396,0.0112060,0.6624826,0.6441893,0.6506318,0.6361803,0.661214
f1,0.7267803,0.0051597,0.7328605,0.723746,0.7267550,0.7199903,0.7305496
f2,0.8228208,0.0050618,0.8199688,0.8257217,0.8230510,0.8292328,0.8161297
lift_top_group,1.8283138,0.0403299,1.7766126,1.7983923,1.8347468,1.8600113,1.8718061
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 5.3.b

In [None]:
print(automl.leaderboard)

model_id                                                      auc    logloss     aucpr    mean_per_class_error      rmse       mse
StackedEnsemble_AllModels_1_AutoML_1_20240407_224218     0.748085   0.592288  0.762319                0.368879  0.450878  0.203291
StackedEnsemble_BestOfFamily_1_AutoML_1_20240407_224218  0.746904   0.593328  0.760384                0.368334  0.451345  0.203713
XRT_1_AutoML_1_20240407_224218                           0.741395   0.599411  0.75259                 0.360209  0.454079  0.206188
GBM_1_AutoML_1_20240407_224218                           0.741067   0.598434  0.753291                0.36638   0.453719  0.205861
XGBoost_grid_1_AutoML_1_20240407_224218_model_2          0.740166   0.598688  0.755788                0.387056  0.454016  0.20613
XGBoost_2_AutoML_1_20240407_224218                       0.740136   0.599693  0.754724                0.360422  0.454267  0.206359
XGBoost_grid_1_AutoML_1_20240407_224218_model_3          0.73937    0.601713  0.7539

In [None]:
best_automl = automl.leader

print("Best performing model is ", best_automl)


Best performing model is  Model Details
H2OStackedEnsembleEstimator : Stacked Ensemble
Model Key: StackedEnsemble_AllModels_1_AutoML_1_20240407_224218


Model Summary for Stacked Ensemble: 
key                                        value
-----------------------------------------  ----------------
Stacking strategy                          cross_validation
Number of base models (used / total)       7/20
# GBM base models (used / total)           1/7
# XGBoost base models (used / total)       5/6
# DRF base models (used / total)           1/2
# DeepLearning base models (used / total)  0/4
# GLM base models (used / total)           0/1
Metalearner algorithm                      GLM
Metalearner fold assignment scheme         Random
Metalearner nfolds                         5
Metalearner fold_column
Custom metalearner hyperparameters         None

ModelMetricsBinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.1803415921551519
RMSE: 0.42466644811563803
LogLoss: 0.5383824452

In [None]:
pprint.pprint(dict(best_automl.params))

{'auc_type': {'actual': 'AUTO', 'default': 'AUTO', 'input': 'AUTO'},
 'base_models': {'actual': [{'URL': None,
                             '__meta': {'schema_name': 'KeyV3',
                                        'schema_type': 'Key<Keyed>',
                                        'schema_version': 3},
                             'name': 'XRT_1_AutoML_1_20240407_224218',
                             'type': 'Key<Keyed>'},
                            {'URL': None,
                             '__meta': {'schema_name': 'KeyV3',
                                        'schema_type': 'Key<Keyed>',
                                        'schema_version': 3},
                             'name': 'GBM_1_AutoML_1_20240407_224218',
                             'type': 'Key<Keyed>'},
                            {'URL': None,
                             '__meta': {'schema_name': 'KeyV3',
                                        'schema_type': 'Key<Keyed>',
                                    

the best model is StackedEnsemble_AllModels_1_AutoML_1_20240407_224218. The params is display above

#### 5.3.c

In [None]:
auto_perform = best_automl.model_performance(test)

print(f"AUC of the best model for test set: {auto_perform.auc()}")

AUC of the best model for test set: 0.7487888603476301


#### 5.3.d

In [None]:
best_log_loss = automl.get_best_model(algorithm="xgboost", criterion="logloss")

print("Best XGBoost Model is: ", best_log_loss)

Best XGBoost Model is:  Model Details
H2OXGBoostEstimator : XGBoost
Model Key: XGBoost_grid_1_AutoML_1_20240407_224218_model_2


Model Summary: 
    number_of_trees
--  -----------------
    48

ModelMetricsBinomial: xgboost
** Reported on train data. **

MSE: 0.18872289031509454
RMSE: 0.4344224790628295
LogLoss: 0.5584339604860925
Mean Per-Class Error: 0.30476326210902666
AUC: 0.7876046111886091
AUCPR: 0.8027110640244234
Gini: 0.5752092223772183

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.41800735350203727
       NO     YES    Error    Rate
-----  -----  -----  -------  ----------------
NO     8171   6468   0.4418   (6468.0/14639.0)
YES    2723   13515  0.1677   (2723.0/16238.0)
Total  10894  19983  0.2977   (9191.0/30877.0)

Maximum Metrics: Maximum metrics at their respective thresholds
metric                       threshold    value     idx
---------------------------  -----------  --------  -----
max f1                       0.418007     0.746252  258
max f2           

The best model is XGBoost_grid_1_AutoML_1_20240407_224218_model_2

In [None]:
bll_perform = best_log_loss.model_performance(test)

print(f"Log loss of XGBoost is : {bll_perform.logloss()}")

Log loss of XGBoost is : 0.5959236996780116
