In [7]:
import torch
from torchtext import data, datasets
import random

In [8]:
# IPython magic: switch on the completer's "greedy" option (an interactive tab-completion setting).
%config IPCompleter.greedy = True
# Single seed shared by every source of randomness used in this notebook.
SEED = 0

# Seed Python's and PyTorch's global generators up front; previously SEED was
# declared but never applied to torch, so a fresh kernel run was not repeatable.
random.seed(SEED)
torch.manual_seed(SEED)

# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

## 1. RNN for Language Modeling (40pt)

#### 1.1. Import the torchtext IMDB dataset

In [9]:
# Fetch the IMDB reviews, which come already divided into a train and a test set.
# TEXT uses Field's defaults (text is parsed by splitting on spaces);
# LABEL values are stored as torch.float.
TEXT = data.Field()
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(text_field=TEXT, label_field=LABEL)

In [10]:
# For each downloaded split, report its size, the type of its examples and the
# attributes of its first example; a blank line separates the two reports.
split_reports = (
    ('Train Samples: {}, of type: {}', train_data),
    ('Test Samples: {}, of type: {}', test_data),
)
for position, (template, split) in enumerate(split_reports):
    if position > 0:
        print()
    first_example = split.examples[0]
    print(template.format(len(split), type(first_example)))
    print(vars(first_example))

Train Samples: 25000, of type: <class 'torchtext.data.example.Example'>
{'text': ['I', "don't", 'know', 'who', 'could', 'find', 'fault', 'with', 'a', 'simply', 'human', 'and', 'funny', 'film', 'like', 'this', 'with', 'lots', 'of', 'delights', 'for', 'your', 'heart.', 'I', 'enjoyed', 'each', 'minute', 'of', 'it', 'and', 'guessed', 'the', 'ending', 'half', 'way', 'through', 'the', 'movie', '--', 'but', 'that', 'did', 'not', 'disappoint', 'me', 'at', 'all.', 'It', 'will', 'not', 'only', 'touch', 'your', 'heart', 'but', "it's", 'such', 'a', 'good', 'family', 'friendly', 'film--we', 'need', 'many', 'more', 'like', 'these!'], 'label': 'pos'}

Test Samples: 25000, of type: <class 'torchtext.data.example.Example'>
{'text': ['I', 'read', 'most', 'of', 'the', 'comments', 'here', 'were', 'everybody', 'saw', 'only', 'the', 'flaws', 'of', 'the', 'movie.', 'I', 'agree,', 'the', 'director', "it's", 'not', 'Kuprik,', 'the', 'actors', 'are', 'not', 'Oscar', 'winners,', 'but', 'it', 'has', 'something', 

In [11]:
# Split train data into train and validation data, default split is 70/30.
# random.seed() returns None, so it must not be passed as `random_state`
# (that silently passes None). Seed the global generator first, then hand its
# state to split() explicitly so the shuffle is tied to SEED by construction.
# NOTE: `train_data` is rebound below, so re-running this cell on its own would
# split the already-reduced training set again; re-run from the loading cell.
n_before_split = len(train_data)
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())
# Sanity check: every original training example ends up in exactly one split.
assert len(train_data) + len(valid_data) == n_before_split, 'train/valid split changed the number of examples'
print('Train Samples: {}, of type: {}'.format(len(train_data), type(train_data.examples[0])))
print('Valid Samples: {}, of type: {}'.format(len(valid_data), type(valid_data.examples[0])))

Train Samples: 17500, of type: <class 'torchtext.data.example.Example'>
Valid Samples: 7500, of type: <class 'torchtext.data.example.Example'>


#### 1.2. Build a Markov (n-gram) language model

In [14]:
# Upper bound passed as `max_size` when the TEXT vocabulary is built.
VOCAB_SIZE = 25000

# Both vocabularies are derived from the training split only.
# (Whether special tokens count towards `max_size` is not visible here — TODO confirm.)
LABEL.build_vocab(train_data)
TEXT.build_vocab(train_data, max_size=VOCAB_SIZE)

In [16]:
# Show the first training example, then the number of entries in each vocabulary.
first_train_example = train_data.examples[0]
print(vars(first_train_example))
for caption, field in (('TEXT Vocab Size:', TEXT), ('LABEL Vocab Size:', LABEL)):
    print(caption, len(field.vocab))

{'text': ['This', 'was', 'just', 'plain', 'terrible.', 'I', 'read', 'this', 'book', 'for', 'school,', 'i', 'made', 'As', 'on', 'all', 'of', 'the', 'tests,', 'and', 'to', 'see', 'it', 'like', 'this!', 'My', 'teacher', 'forced', 'me', 'and', '20', 'other', 'people', 'to', 'watch', 'it,', 'and', 'it', 'was', 'worse', 'than', 'Leonard', 'Part', '6,', 'Plan', '9', 'from', 'Outer', 'Space,', 'and', 'Hudson', 'Hawk', 'put', 'together.', 'The', 'thing', 'that', 'made', 'this', 'film', 'so', 'terrible', 'was', 'enough', 'reasons', 'to', 'want', 'to', 'kill', 'yourself', 'over.', 'First', 'of', 'all,', 'it', 'was', 'made', 'on', 'Hallmark.', 'Second,', 'the', 'acting', 'was', 'terrible.', 'Third,', 'it', 'was', 'like', 'completely', 'different', 'from', 'the', 'book.', 'Literally,', 'it', 'was', 'so', 'bad', 'I', 'asked', 'myself', 'to', 'be', 'excused.', 'Basically,', 'I', 'would', 'rather', 'watch', 'Basic', 'Instinct', '2', 'than', 'watch', 'this.', 'Take', 'my', 'advice,', "don't", 'watch', 

#### 1.3. Change the output appropriately in ‘Simple Sentiment Analysis.ipynb’ to build an LSTM-based language model. Plot the training performance as a function of epochs/iterations.

## 2. For each model, describe the key design choices made. Briefly explain how each choice influences training time and generative quality.

## 3. For each model, starting with the phrase “My favorite movie ”, sample the next few words to create a 20-word generated review. Repeat this 5 times (you should ideally get a different output each time) and report the outputs.