## Packages and constant declaration

In [43]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchtext
from torchtext import data
from torchtext import datasets
import numpy as np
import pandas as pd
import re
import os
from sklearn.model_selection import train_test_split
# from tensorboardX import SummaryWriter
import random
import gc

In [2]:
import matplotlib.pyplot as plt

In [77]:
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1" # easy to locate traceback
EMBEDDING_DIM = 300 # embedding vector length
max_word = 200000 # How many unique words to use 
batch_size = 1024#Batch size
device = torch.device('cuda')
random_seed=123
BIDIRECTIONAL= True
HIDDEN_DIM= 32
NUM_LAYERS=2
OUTPUT_DIM=1
DROPOUT=0.5
NUM_EPOCHS = 5

In [4]:
def seed(seed=1000):# Give random seed to everything
#     os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed()

## Data preprocessing

In [7]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean(x): #process the punction
    x=str(x)
    for punct in puncts:
        if punct in x:
            x=x.replace(punct,f'{punct}')
    return x

def clean_numbers(x): # Replace the number with #, make all numbers the same
    if bool(re.search(r'\d',x)):
        x=re.sub('[0-9]{5,}','#####',x)
        x=re.sub('[0-9]{4}','####',x)
        x=re.sub('[0-9]{3}','###',x)
        x=re.sub('[0-9]{2}','##',x)
    return x

# The common abbreviation for some phrase
fullversiondict={"ain't": "is not", "aren't": "are not","can't": "cannot",
                 "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  
                 "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                 "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
                 "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am",
                 "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", 
                 "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                 "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                 "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                 "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                 "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", 
                 "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have",
                 "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                 "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will",
                 "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", 
                 "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
                 "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", 
                 "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 
                 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do',
                 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018',
                 'qouta': 'quota','exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}
def _get_full(fullversiondict):
    full_re = re.compile('(%s)'%'|'.join(fullversiondict.keys()))
    return fullversiondict , full_re # To make things we want to replace single string

fullversiondict,full_re=_get_full(fullversiondict)
def replacetext(text):
    def replace(match): 
        return fullversiondict[match.group(0)]
    return full_re.sub(replace,text) # The full.re here is match, can replace any string now


In [11]:
def read_pre():
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    print('Train shape:',train_df.shape)
    print('Test shape:',test_df.shape)
    # Lower case
    train_df['question_text'] = train_df['question_text'].apply(lambda x: x.lower())
    test_df['question_text'] = test_df['question_text'].apply(lambda x: x.lower())
    # Clean punction
    train_df["question_text"] = train_df["question_text"].apply(lambda x: clean(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: clean(x))
    #clean numbers
    train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_numbers(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_numbers(x))
    #replace abbreviation
    train_df["question_text"] = train_df["question_text"].apply(lambda x: replacetext(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: replacetext(x))
    ## fill up the missing values using next valid value
    train_df["question_text"] = train_df["question_text"].fillna(method="bfill")
    test_df["question_text"] = test_df["question_text"].fillna(method="bfill")
    
#     #Target
#     train_y = train_df['target']
    train_df.to_csv("train_df.csv")
    test_df.to_csv("test_df.csv")
    return
    

In [10]:
# the pretrained embedding model
def load_glove(word_index):
    FILE= 'embeddings/glove.840B.300d//glove.840B.300d.txt'
    def get_coefs(word,*arr):return word, np.asarray(arr, dtype='float32')[:EMBEDDING_DIM]
    embeddings_index = dict(get_coefs(*o.split(' '))for o in open(FILE) if len(o)>300)
    #construct word array pair dictionary
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    nb_words = min(max_word, len(word_index))
    #Initialize a matrix using random value, in case that some words don't exist in our embedding
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words+1,EMBEDDING_DIM))
    #Associate the word in our training set with the embedding model
    for word, i in word_index.items():
        if i>= max_word: continue
        embedding_vector = embeddings_index.get(word) # get the vector for this word form dictionary
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    return (embedding_matrix)
##Do same thing to other types of embedding

def load_para(word_index):
    FILE = 'embeddings/paragram_300_sl999-1/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(FILE, encoding="utf8", errors='ignore') if len(o)>300)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()

    nb_words = min(max_word, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words+1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= max_word: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    
    return embedding_matrix


In [12]:
read_pre()

Train shape: (1306122, 3)
Test shape: (375806, 2)


In [78]:
train_X = pd.read_csv("train_df.csv")
test_X = pd.read_csv("test_df.csv")

In [79]:
train=train_X[['question_text','target']]

In [80]:
train_X[train_X.target == 0].shape[0]/train_X[train_X.target == 1].shape[0]

15.16287588169781

In [81]:
del train_X
del test_X
gc.collect()

133

In [83]:
train, valid = train_test_split(train,test_size=0.2, random_state=random_seed, stratify = train.target,shuffle=True)
print(f'Num Train: {len(train)}')
print(f'Num Valid: {len(valid)}')

Num Train: 1404820
Num Valid: 351205


In [82]:
# Oversample
attach = train[train.target == 1]
for i in range(11):
    train=pd.concat((train,attach))
train[train.target == 0].shape[0]/train[train.target == 1].shape[0]

1.263572990141484

In [48]:
train.head()

Unnamed: 0,question_text,target
227664,how did the mtv network's audience respond to ...,0
334082,will there be censorship in duckduckgo image s...,0
1072158,why do the majority of people on quora seem to...,0
323157,who invented zero (0) and one (1)?,0
622521,how does the arabic culture differ from the ir...,0


In [66]:
TEXT = data.Field(tokenize = 'spacy', sequential=True, include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)
def get_dataset(csv_data, text_field, label_field):
    fields = [("question_text", text_field), ("target", label_field)]       
    examples = []
    for text, label in zip(csv_data['question_text'], csv_data['target']):
            examples.append(data.Example.fromlist([text, label], fields))
    return examples, fields

In [84]:
train_examples, train_fields = get_dataset(train, TEXT, LABEL)
valid_examples, valid_fields = get_dataset(valid, TEXT, LABEL)
train = data.Dataset(train_examples, train_fields)
valid = data.Dataset(valid_examples, valid_fields)

In [85]:
train_iterator, valid_iterator= data.BucketIterator.splits(
    (train, valid), 
    batch_size = batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.question_text),
    device = device,
    repeat=False)

In [69]:
### Only run at the first time
cache = '.vector_cache'
if not os.path.exists(cache):
    os.mkdir(cache)
vectors = torchtext.vocab.Vectors(name='.vector_cache/glove.840B.300d.txt', cache=cache)

In [86]:
TEXT.build_vocab(train.question_text,
                 max_size=max_word,
                 vectors=vectors,
                 unk_init=torch.Tensor.normal_)

LABEL.build_vocab(train.target)

print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')

Vocabulary size: 162910
Number of classes: 2


In [71]:
del train 
del valid
gc.collect()

20

In [None]:
weight_matrix = TEXT.vocab.vectors

In [73]:
for batch in train_iterator:
    print(f'Text matrix size: {batch.question_text[0].size()}')
    print(f'Target vector size: {batch.target.size()}')
    break

Text matrix size: torch.Size([34, 5000])
Target vector size: torch.Size([5000])


## **Training**

In [32]:
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), (K, 1) are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x
#Data augmentation

In [33]:
def compute_accuracy(model, data_loader):
    model.eval()
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for batch_idx, batch_data in enumerate(data_loader):
            text, text_lengths = batch_data.question_text
            logits = model(text, text_lengths)
            _, predicted_labels = torch.max(logits, 1)
            num_examples += batch_data.target.size(0)
            correct_pred += (predicted_labels.long() == batch_data.target.long()).sum()
        return correct_pred.float()/num_examples * 100

In [34]:
class RNN(nn.Module):
    def __init__(self):
        
        super().__init__()
        
        self.hidden_dim = (BIDIRECTIONAL+1) * HIDDEN_DIM
        self.num_layers = 2
        
        self.embedding = nn.Embedding(len(TEXT.vocab), EMBEDDING_DIM)
        self.embedding.weight.data.copy_(weight_matrix )
        self.embedding.weight.requires_grad = False
        #The embedding vectors here are not parameter needing update
        self.embedding_dropout = SpatialDropout(0.15)
        #  (Tensor data, Tensor batch_sizes, tuple of Tensors hx, tuple of Tensors params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional)
        self.rnn = nn.LSTM(input_size = EMBEDDING_DIM, 
                           hidden_size = HIDDEN_DIM,
                           num_layers = self.num_layers,
                           bidirectional = BIDIRECTIONAL, 
                           dropout = 0.5)
        self.fc1 = nn.Linear((BIDIRECTIONAL+1) * HIDDEN_DIM, 64)
        self.fc2 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.5)
        self.spatialdrop = SpatialDropout(0.5)
        
        self.weight_W = nn.Parameter(torch.Tensor(self.hidden_dim, self.hidden_dim))
        self.weight_proj = nn.Parameter(torch.Tensor(self.hidden_dim, 1))
        nn.init.uniform_(self.weight_W, -0.1, 0.1)
        nn.init.uniform_(self.weight_proj, -0.1, 0.1)
        
    def forward(self, text, text_length):

        embedded = self.spatialdrop(self.embedding(text))
        packed_output, (hidden, cell) = self.rnn(embedded)
        
        u = torch.tanh(torch.matmul(packed_output, self.weight_W))
        att = torch.matmul(u, self.weight_proj)
        att_score = F.softmax(att, dim=0)
        scored_x = packed_output * att_score
        
        hidden = torch.sum(scored_x, dim=0)
        hidden = self.fc1(hidden)
        hidden = self.dropout(hidden)
        hidden = self.fc2(hidden)
        return hidden

In [35]:
INPUT_DIM = len(TEXT.vocab)

PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

torch.manual_seed(random_seed)
model = RNN()
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
loss_fn = torch.nn.BCEWithLogitsLoss()

In [87]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch_data in enumerate(train_iterator):
        
        text, text_lengths = batch_data.question_text
        
        ### FORWARD AND BACK PROP
        logits = model(text, text_lengths)
#         cost = F.cross_entropy(logits, batch_data.target)
        cost=loss_fn(torch.squeeze(logits),batch_data.target)
        optimizer.zero_grad()
        
        cost.backward()
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        ### LOGGING
        if not batch_idx % 100:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_iterator):03d} | '
                   f'Cost: {cost:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_accuracy(model, train_iterator):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy(model, valid_iterator):.2f}%')
        
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')

Epoch: 001/005 | Batch 000/1372 | Cost: 0.2410
Epoch: 001/005 | Batch 100/1372 | Cost: 0.2398
Epoch: 001/005 | Batch 200/1372 | Cost: 0.2149
Epoch: 001/005 | Batch 300/1372 | Cost: 0.1825
Epoch: 001/005 | Batch 400/1372 | Cost: 0.2406
Epoch: 001/005 | Batch 500/1372 | Cost: 0.2176
Epoch: 001/005 | Batch 600/1372 | Cost: 0.1955
Epoch: 001/005 | Batch 700/1372 | Cost: 0.2203
Epoch: 001/005 | Batch 800/1372 | Cost: 0.2477
Epoch: 001/005 | Batch 900/1372 | Cost: 0.2283
Epoch: 001/005 | Batch 1000/1372 | Cost: 0.2150
Epoch: 001/005 | Batch 1100/1372 | Cost: 0.2134
Epoch: 001/005 | Batch 1200/1372 | Cost: 0.2368
Epoch: 001/005 | Batch 1300/1372 | Cost: 0.2238
training accuracy: 55.82%
valid accuracy: 55.82%
Time elapsed: 1.57 min
Epoch: 002/005 | Batch 000/1372 | Cost: 0.1770
Epoch: 002/005 | Batch 100/1372 | Cost: 0.1956
Epoch: 002/005 | Batch 200/1372 | Cost: 0.2257
Epoch: 002/005 | Batch 300/1372 | Cost: 0.1817
Epoch: 002/005 | Batch 400/1372 | Cost: 0.2313
Epoch: 002/005 | Batch 500/1372

KeyboardInterrupt: 

In [0]:
del model