# Deep Learning for Natural Language Processing

Steps
- Data preprocessing
    - Loading dataset
    - Cleaning dataset
- Loading pre-trained model
- Fine-tuning
- Evaluation

### Data preprocessing

In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import (
    DataLoader, RandomSampler, \
    SequentialSampler, TensorDataset
) 
import zipfile
from time import time
# import seaborn as sns
from transformers import (
    BertModel, BertTokenizer, \
    get_linear_schedule_with_warmup
)
import warnings
warnings.filterwarnings("ignore")

SEED = 2022
BATCH_SIZE = 32
np.random.seed(SEED)
torch.manual_seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7fefdda00a70>

In [4]:
data_path = "archive.zip"
train_f = "Train.csv"
test_f = "Test.csv"
valid_f = "Valid.csv"

def read_data(data_path):
    zf = zipfile.ZipFile(data_path)
    train_df = pd.read_csv(zf.open(train_f))
    test_df = pd.read_csv(zf.open(test_f))
    valid_df = pd.read_csv(zf.open(valid_f))
    d = {"train":train_df, "test":test_df, "validation":valid_df}
    return d

data = read_data(data_path)
train = data['train']
train['set'] = 0
test = data['test']
test['set'] = 1
valid = data['validation']
valid['set'] = 2
df = pd.concat([train, test, valid], ignore_index=True)

In [5]:
train.head()

Unnamed: 0,text,label,set
0,I grew up (b. 1965) watching and loving the Th...,0,0
1,"When I put this movie in my DVD player, and sa...",0,0
2,Why do people who do not know what a particula...,0,0
3,Even though I have great interest in Biblical ...,0,0
4,Im a die hard Dads Army fan and nothing will e...,1,0


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
 2   set     40000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 937.6+ KB


In [7]:
# Check that training data is balanced
train['label'].value_counts()

0    20019
1    19981
Name: label, dtype: int64

In [8]:
PRE_TRAINED_MODEL = 'bert-base-cased'
MAX_LEN = 512
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL)

In [9]:
max_len = 0
for sent in train.text.values:
    encodings = tokenizer.encode(sent, add_special_tokens=True, truncate=True)
    max_len = max(max_len, len(encodings))

print("Maximum review length: ", max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (1420 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (544 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (722 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (912 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1233 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

KeyboardInterrupt: 

In [28]:
# Tokenize all data points in one function

# MAKE A NEW COLUMN, THEN SEPARATE THE SETS IN THE FOLLOWING CELL

encoded_data = df['text'].apply((lambda x: tokenizer.encode_plus(x, add_special_tokens=True, \
    max_length=MAX_LEN, pad_to_max_length=True, return_attention_mask=True, truncate=True, return_tensors='pt')))


0    [101, 146, 2580, 1146, 113, 171, 119, 2679, 11...
1    [101, 1332, 146, 1508, 1142, 2523, 1107, 1139,...
2    [101, 2009, 1202, 1234, 1150, 1202, 1136, 1221...
3    [101, 2431, 1463, 146, 1138, 1632, 2199, 1107,...
4    [101, 146, 1306, 170, 2939, 1662, 4708, 1116, ...
Name: text, dtype: object

In [29]:
# Add `0` as padding tokens to each review text to reach the maximum length set
padded_data = np.array([token_vec + [0]*(MAX_LEN-len(token_vec)) for token_vec in tokenized_data.values])

In [30]:
# Add masks of 0 where token is 0, 1 for where tokens are otherwise
mask = np.where(padded_data != 0, 1, 0)
assert(mask.shape==padded_data.shape)

In [35]:
model = BertModel.from_pretrained(PRE_TRAINED_MODEL)

In [38]:
# Convert input tokens and masks to Torch tensors
input_ids = torch.tensor(padded_data)
mask = torch.tensor(mask)

with torch.no_grad():
    last_h = model(input_ids, attention_mask = mask)

  mask = torch.tensor(mask)


RuntimeError: [enforce fail at CPUAllocator.cpp:68] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 78643200000 bytes. Error code 12 (Cannot allocate memory)

In [None]:
features = last_h[0][:,0,:].numpy()
features.head()

In [None]:
labels = df.label

In [None]:
X_train = features
X_test = 
X_valid = 
Y_train = 
Y_test = 
Y_valid = 

### Fine-tuning

In [None]:
optimizer = torch.optim.AdamW(model.parameters(),
            lr = 2e-5,
            eps = 1e-8)

In [None]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        outputs = torch.sigmoid(self.linear(x))
        return outputs

logit_clf = LogisticRegression()

In [None]:
# Training loop
epochs = 10
for epoch in range(epochs):
    print(f"Epoch: {epoch}")
    t0 = time()
    total_loss = 0
    # Set the model to training mode
    model.train()

    for step, batch in enumerate(train_dataloader)


### Evaluation

### Results & Interpretation

### References
Alammar, J. (2019). A visual guide to using BERT for the first time. Retrieved from https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

