<a href="https://colab.research.google.com/github/zizilnam/Mini_Project_NLP_Pair_Sentence_Kaggle/blob/main/STS_W2V.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import TreebankWordTokenizer
nltk.download('stopwords')
from gensim.models import FastText

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Dataset

In [None]:
# Change into the project data directory so the relative CSV and model paths used below resolve.
# NOTE(review): hard-coded Colab path — assumes Google Drive is already mounted at /content/drive.
%cd /content/drive/MyDrive/NLP-Project/quora-question-pairs

/content/drive/MyDrive/NLP-Project/quora-question-pairs


In [None]:
# Read the training split and discard every row that contains a missing value.
train = pd.read_csv("train.csv").dropna()
# Per-column count of remaining nulls; shown as the cell output.
train.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

## Preprocessing

In [None]:
def pair_to_sequence(data):
    """Flatten the question pairs into one list.

    Returns every entry of ``data["question1"]`` followed by every entry of
    ``data["question2"]``.
    """
    return [*data["question1"], *data["question2"]]

def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, reading it only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(nltk.corpus.stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def text_preprocessing(text, tokenizer):
    """Strip punctuation, tokenize, and drop English stop words.

    Args:
        text: Raw question string.
        tokenizer: Tokenizer *class* (not an instance); it is instantiated here
            and its ``tokenize`` method is applied to the cleaned text.

    Returns:
        List of the remaining tokens, in their original order.
    """
    # Raw string so the backslashes reach the regex engine unchanged; the
    # previous non-raw literal relied on invalid string escapes such as "\{".
    text = re.sub(r"[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>@\#$%&\\=\(\'\"]", "", text)

    tokens = tokenizer().tokenize(text)

    # The stop-word set is cached instead of being re-read for every question.
    # NOTE: the comparison is case-sensitive, so capitalised words such as
    # "What" or "I" are kept.
    stop_words = _english_stopwords()

    return [token for token in tokens if token not in stop_words]

In [None]:
# The tokenizer class itself is passed around; text_preprocessing instantiates it.
tokenizer = TreebankWordTokenizer

# Tokenize every question: all question1 entries first, then all question2 entries.
corpus = [text_preprocessing(sentence, tokenizer) for sentence in pair_to_sequence(train)]
# Preview the first four tokenized questions.
corpus[:4]

[['What', 'step', 'step', 'guide', 'invest', 'share', 'market', 'india'],
 ['What', 'story', 'Kohinoor', 'KohiNoor', 'Diamond'],
 ['How', 'I', 'increase', 'speed', 'internet', 'connection', 'using', 'VPN'],
 ['Why', 'I', 'mentally', 'lonely', 'How', 'I', 'solve']]

In [None]:
# Train a FastText model on the tokenized questions: 50-dimensional vectors,
# a context window of 5, words seen fewer than 2 times ignored, 4 worker threads.
# sg=1 selects the skip-gram objective (per the gensim documentation).
model = FastText(size=50, window=5, min_count=2, workers=4, sg=1)
# Build the vocabulary first, then run 10 passes over the corpus.
model.build_vocab(sentences=corpus)
model.train(sentences=corpus,
            total_examples=len(corpus),
            epochs=10)

In [None]:
# Persist the trained FastText model in the current working directory.
model.save("fasttext_model_1")

In [None]:
# Sanity check: nearest neighbours of "internet" in the learned embedding space.
# Query through the KeyedVectors (`.wv`); calling most_similar on the model
# itself is deprecated in gensim 3.x and emits a DeprecationWarning.
model.wv.most_similar("internet")

  """Entry point for launching an IPython kernel.


[('techinternet', 0.9393445253372192),
 ('internetorg', 0.9087850451469421),
 ('textinginternet', 0.8812063336372375),
 ('Internet', 0.8622018694877625),
 ('internetonline', 0.8580065369606018),
 ('Betternet', 0.792997419834137),
 ('telnet', 0.783061146736145),
 ('browsing', 0.7623828053474426),
 ('freepaid', 0.7520067095756531),
 ('via', 0.7476982474327087)]

# Siamese Network and Ma-LSTM

## Training

In [None]:
import numpy as np
import pandas as pd
import re

import torch
import gensim
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, WeightedRandomSampler, SequentialSampler
from torch.autograd import Variable

import warnings
# Use the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

print("Device: ", device)

Device:  cuda:0


In [None]:
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, reading it only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(nltk.corpus.stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def text_preprocessing(text, tokenizer):
    """Strip punctuation, tokenize, drop English stop words, and re-join.

    Args:
        text: Raw question string.
        tokenizer: Tokenizer *class* (not an instance); it is instantiated here
            and its ``tokenize`` method is applied to the cleaned text.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        nothing is left).
    """
    # Raw string so the backslashes reach the regex engine unchanged; the
    # previous non-raw literal relied on invalid string escapes such as "\{".
    text = re.sub(r"[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>@\#$%&\\=\(\'\"]", "", text)

    tokens = tokenizer().tokenize(text)

    # The stop-word set is cached instead of being re-read for every question.
    # NOTE: the comparison is case-sensitive, so capitalised stop words are kept.
    stop_words = _english_stopwords()

    kept = [token for token in tokens if token not in stop_words]

    return " ".join(kept).strip()

In [None]:
# Keep a reference to the tokenizer class; text_preprocessing instantiates it per call.
tokenizer = TreebankWordTokenizer
# Quick demonstration of how the tokenizer splits a contraction.
tokenizer().tokenize("don't")

['do', "n't"]

In [None]:
from gensim.models import Word2Vec
from gensim.models import FastText

# Load the pre-trained embedding models and the raw training data from the working directory.
# NOTE(review): neither "w2v_model_1" nor "fasttext_model_2" is written by this notebook
# (the FastText cell saves "fasttext_model_1") — confirm where these files come from.
model_w2v = Word2Vec.load("w2v_model_1")
model_ft  = FastText.load("fasttext_model_2")
train = pd.read_csv("train.csv")

In [None]:
# Vocabulary size of the FastText model, then of the Word2Vec model, one per line.
print(len(model_ft.wv.vocab), len(model_w2v.wv.vocab), sep="\n")

70523
70523


In [None]:
# Drop every training row that has a missing value in any column.
train = train.dropna()

In [None]:
train_questions_pair = []  # (question1, question2) tuples of preprocessed text
train_labels = []          # is_duplicate label of each kept pair, same order

# Walk the three columns in lockstep instead of DataFrame.iterrows(), which
# builds a Series for every row and is slow on a frame of this size.
for raw_q1, raw_q2, label in zip(train["question1"], train["question2"], train["is_duplicate"]):
    q1 = text_preprocessing(raw_q1, tokenizer)
    q2 = text_preprocessing(raw_q2, tokenizer)

    # Keep the pair only when both questions still contain text after preprocessing.
    if q1 and q2:
        train_questions_pair.append((q1, q2))
        train_labels.append(label)

print('Train Data Question Pairs: ', len(train_questions_pair))

Train Data Question Pairs:  404268


In [None]:
class Language:
    """
    Vocabulary container mapping each word of the data to an integer index.

    Indices start at 1 (index 0 is never assigned); ``word2count`` tracks how
    often each word was added.
    """

    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.n_words = 0

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words + 1
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index

# Add every word found in the training question pairs to word2index, word2count and index2word.
# NOTE: the loop variables (`data`, `question_pair`, `q1`, `q2`) stay bound at
# notebook scope after this cell has run.
language = Language()
for data in [train_questions_pair]:
    for question_pair in data: # (sent1, sent2)
        q1 = question_pair[0]
        q2 = question_pair[1]
        language.addSentence(q1)
        language.addSentence(q2)

In [None]:
class QuestionsDataset(Dataset):
    """
    Dataset that returns the index-encoded question pair and its label for a position.

    Args:
        questions_list: Sequence of ``(question1, question2)`` strings whose
            tokens are separated by whitespace.
        word2index: Mapping from token to integer index; every token of every
            question must be present (a missing token raises ``KeyError``).
        labels: Sequence of labels aligned with ``questions_list``.
    """

    def __init__(self, questions_list, word2index, labels):
        self.questions_list = questions_list
        self.labels = labels
        self.word2index = word2index

    def __len__(self):
        return len(self.questions_list)

    def __getitem__(self, index):
        # Both questions come from the pair stored at `index`. The previous
        # code read q2 from a notebook-level variable named `question_pair`,
        # so every sample received the same, unrelated second question.
        q1, q2 = self.questions_list[index][0], self.questions_list[index][1]

        q1_indices = [self.word2index[word] for word in q1.split()]
        q2_indices = [self.word2index[word] for word in q2.split()]

        # q1_indices and q2_indices are lists of vocabulary indices, one per token.
        return q1_indices, q2_indices, self.labels[index]
    
# Wrap the preprocessed training pairs, the vocabulary mapping and the labels in a Dataset.
train_dataset = QuestionsDataset(train_questions_pair, language.word2index, train_labels)

In [None]:
# Number of distinct tokens registered in the vocabulary (excludes the unused index 0).
n_vocab = len(language.word2index)
print ('Total Unique Vocabulary Tokens: ', n_vocab)

Total Unique Vocabulary Tokens:  134038


In [None]:
class CustomCollate:
    """
    Collate callable that groups a batch for later padding and packing in the RNN.

    Each batch element has the form ``(q1_indices, q2_indices, label)``; the
    result regroups them column-wise and adds the length of every question.
    """

    def custom_collate(self, batch):
        """Return ``(q1_list, q1_lengths, q2_list, q2_lengths, labels)`` for ``batch``."""
        q1_list = [example[0] for example in batch]
        q2_list = [example[1] for example in batch]
        labels = [example[2] for example in batch]

        # Token counts per question, e.g. [3, 5, 8, ...]
        q1_lengths = list(map(len, q1_list))
        q2_lengths = list(map(len, q2_list))

        return q1_list, q1_lengths, q2_list, q2_lengths, labels

    def __call__(self, batch):
        return self.custom_collate(batch)

In [None]:
# Hyper-parameters shared by the model definition and the training loop below.
embed_dim = 50  # word2vec dim
hidden_size = 50 # LSTM number of hidden layer node
num_layers = 1 # LSTM layers
learning_rate = 0.0005
epochs = 100
print_iter = 100 # print progress every this many iterations
batch_size = 64

In [None]:
validation_split = 0.2
dataset_size = len(train_dataset)
indices = list(range(dataset_size))
# Position in the shuffled index list where the validation part (last 20%) begins.
split = int(np.floor((1 - validation_split) * dataset_size))
shuffle_dataset = True
random_seed = 42
if shuffle_dataset:
    np.random.seed(random_seed)
    # Seed PyTorch's generator. The previous `torch.seed = random_seed`
    # only overwrote the torch.seed function and seeded nothing.
    torch.manual_seed(random_seed)
    np.random.shuffle(indices)  # randomly shuffled index list

# Training / validation index sets.
train_indices, val_indices = indices[:split], indices[split:]

# DataLoaders for batched training and batched inference; each sampler draws
# its own subset of indices in random order.
train_sampler = SubsetRandomSampler(train_indices)
validation_sampler = SubsetRandomSampler(val_indices)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, collate_fn=CustomCollate())
val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=validation_sampler, collate_fn=CustomCollate())

print ('Training Set Size {}, Validation Set Size {}'.format(len(train_indices), len(val_indices)))

Training Set Size 323414, Validation Set Size 80854


In [None]:
from gensim.models import Word2Vec
# Build the embedding matrix of shape (n_vocab + 1, embed_dim) and place each
# word's word2vec vector at the index the vocabulary assigned to that word.
# 1. Rows start out random, so words of the dataset vocabulary that word2vec
#    does not know keep a random vector.
# 2. Row 0 is all zeros: it is the embedding of the padding index used for batching.
weights = torch.randn(n_vocab + 1, embed_dim)
weights[0] = torch.zeros(embed_dim)

# The word2vec model has its own indexing, so vectors are looked up by word.
for word, lang_word_index in language.word2index.items():
    # Test membership on the KeyedVectors; `word in model_w2v` is deprecated in gensim 3.x.
    if word in model_w2v.wv:
        weights[lang_word_index] = torch.FloatTensor(model_w2v.wv.get_vector(word))

# `model_ft` is intentionally not deleted here: the test section of this
# notebook reads it again. The unused copy of the full word2vec matrix that
# this cell used to create (and immediately delete) has been dropped.

  del sys.path[0]
  


In [None]:
class SiameseNetwork(nn.Module):
    """Siamese LSTM encoder scored with exp(-L1 distance) (MaLSTM-style).

    Both questions of a pair go through the same frozen embedding layer and the
    same LSTM; the similarity is exp(-||h1 - h2||_1) of the LSTM outputs at each
    question's last real token.

    Reads the notebook-level settings ``embed_dim``, ``hidden_size`` and
    ``num_layers`` when it is constructed.

    Args:
        pretrained_weights: Float tensor of shape (vocabulary size + 1,
            embed_dim); row 0 is used as the padding embedding.
    """

    def __init__(self, pretrained_weights):
        super(SiameseNetwork, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_weights)
        # Keep the pretrained vectors fixed during training.
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_size, num_layers=num_layers,
                            batch_first=True)

    def exponent_neg_manhattan_distance(self, x1, x2):
        """Return exp(-sum(|x1 - x2|)) over the last dimension.

        Works for a single pair of vectors as well as for a batch of them; the
        result stays on the device of the inputs.
        """
        return torch.exp(-torch.sum(torch.abs(x1 - x2), dim=-1))

    def forward_once(self, x, input_lengths):
        """Run one side of the Siamese network.

        Args:
            x: List of index lists, one per question, e.g. [[i1, i2, i3], [j1, j2, j3, j4]].
            input_lengths: Number of tokens of each question, e.g. [3, 4]
                (every length must be at least 1).

        Returns:
            Tensor of shape (batch, longest question, hidden_size) holding the
            LSTM output at every step, in the same order as ``x``; steps beyond
            a question's length are zero.
        """
        # Run on whatever device the model parameters live on.
        target_device = self.embedding.weight.device
        # pack_padded_sequence expects the lengths on the CPU.
        lengths = torch.as_tensor(input_lengths, dtype=torch.int64)

        sequences = [torch.as_tensor(question, dtype=torch.long, device=target_device) for question in x]
        # Pad with index 0 up to the longest question of the batch.
        padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)
        embeddings = self.embedding(padded)

        # enforce_sorted=False lets PyTorch sort by length internally and restore
        # the original order afterwards, so no manual reordering is needed and
        # the activations never leave the device.
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            embeddings, lengths, batch_first=True, enforce_sorted=False)
        out, _ = self.lstm(packed)
        unpacked, _ = torch.nn.utils.rnn.pad_packed_sequence(
            out, batch_first=True, total_length=int(lengths.max()))
        return unpacked

    def forward(self, q1, q1_length, q2, q2_length):
        """Return one similarity score in (0, 1] per question pair, shape (batch,)."""
        output1 = self.forward_once(q1, q1_length)
        output2 = self.forward_once(q2, q2_length)

        # Pick, for every sample, the activation at its last real token; the
        # padded steps after it are not part of the original question.
        rows = torch.arange(output1.size(0), device=output1.device)
        last1 = torch.as_tensor(q1_length, dtype=torch.long, device=output1.device) - 1
        last2 = torch.as_tensor(q2_length, dtype=torch.long, device=output2.device) - 1

        return self.exponent_neg_manhattan_distance(output1[rows, last1], output2[rows, last2])

# Instantiate the Siamese network with the embedding matrix and move it to the selected device.
# NOTE: this rebinds `model`, which earlier in the notebook referred to the FastText model.
model = SiameseNetwork(weights).to(device)

In [None]:
# Mean-squared error between the similarity score and the 0/1 duplicate label.
loss_fn = nn.MSELoss()
# Adam over the model's parameters, using the learning rate from the hyper-parameter cell.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
total_step = len(train_loader)
# Scores above this value are predicted as "duplicate".
threshold = torch.Tensor([0.5]).to(device)

for epoch in range(epochs):
    losses = []
    model.train(True)
    train_correct = 0

    for i, (q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths, labels) in enumerate(train_loader):

        labels = torch.FloatTensor(labels).to(device)
        optimizer.zero_grad()

        similarity_score = model(q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths)
        predictions = (similarity_score > threshold).float() * 1
        total = labels.size()[0]
        correct = (predictions == labels).sum().item()
        train_correct += correct

        loss = loss_fn(similarity_score, labels)
        loss.backward()
        optimizer.step()

        # Record the loss of every batch so the reported mean covers the whole
        # epoch (previously only every `print_iter`-th loss was kept, and the
        # list stayed empty for epochs shorter than `print_iter` steps).
        losses.append(loss.item())

        if (i + 1) % print_iter == 0:
            # Running mean loss of the epoch so far and accuracy of the current batch.
            print(f"Epoch [{epoch+1}/{epochs}], Step [{i+1}/{total_step}], Loss: {np.mean(losses):.4}, Accuracy: {(correct/total)*100:.4}")

    print(f"Training Loss: {np.mean(losses):.4f}, Training Accuracy: {((train_correct / len(train_indices)) * 100):.4f}")

    # Validation pass: no gradient tracking, evaluation mode.
    model.train(False)
    val_correct = 0
    with torch.no_grad():
        for i, (q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths, labels) in enumerate(val_loader):
            labels = torch.FloatTensor(labels).to(device)
            similarity_score = model(q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths)
            predictions = (similarity_score > threshold).float() * 1
            total = labels.size()[0]
            correct = (predictions == labels).sum().item()
            val_correct += correct

        avg_acc_val = val_correct * 100 / len(val_indices)
        # ".2f" (two decimals); the previous "2f" was a width specifier, not a precision.
        print(f"Validation Set Size {len(val_indices)}, Correct in Validation {val_correct}, Validation Accuracy {avg_acc_val:.2f}")

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Epoch [4/100], Step [4500/5054], Loss: 0.1789, Accuracy: 79.69
Epoch [4/100], Step [4600/5054], Loss: 0.1796, Accuracy: 65.62
Epoch [4/100], Step [4700/5054], Loss: 0.1792, Accuracy: 73.44
Epoch [4/100], Step [4800/5054], Loss: 0.1792, Accuracy: 73.44
Epoch [4/100], Step [4900/5054], Loss: 0.1788, Accuracy: 76.56
Epoch [4/100], Step [5000/5054], Loss: 0.1785, Accuracy: 73.44
Training Loss: 0.1785, Training Accuracy: 73.5679
Validation Set Size 80854, Correct in Validation 59476, Validation Accuracy 73.559750
Epoch [5/100], Step [100/5054], Loss: 0.1743, Accuracy: 73.44
Epoch [5/100], Step [200/5054], Loss: 0.1913, Accuracy: 65.62
Epoch [5/100], Step [300/5054], Loss: 0.2028, Accuracy: 65.62
Epoch [5/100], Step [400/5054], Loss: 0.193, Accuracy: 75.0
Epoch [5/100], Step [500/5054], Loss: 0.1913, Accuracy: 73.44
Epoch [5/100], Step [600/5054], Loss: 0.1874, Accuracy: 78.12
Epoch [5/100], Step [700/5054], Loss: 0.1849, Accuracy: 75.0
Epoch

In [None]:
# Serialize the whole model object (not just its state_dict) to the working directory.
# NOTE(review): loading this file later requires the SiameseNetwork class to be importable/defined.
torch.save(model, "model_word2vec_siam.pt")

## Test 

In [None]:
# Read the test split from the working directory.
test = pd.read_csv("test.csv")

In [None]:
# Replace a missing question with the question of the previous row (forward fill).
# Assign the result back instead of calling fillna(..., inplace=True) on a column
# selection: the in-place form is not guaranteed to update `test`, and
# fillna(method=...) is deprecated in recent pandas.
test['question1'] = test['question1'].ffill()
test['question2'] = test['question2'].ffill()
# Per-column count of remaining nulls; shown as the cell output.
test.isnull().sum()

test_id      0
question1    0
question2    0
dtype: int64

In [None]:
def text_preprocessing(text, tokenizer):
    """Strip punctuation, tokenize, and keep only tokens known to the word2vec model.

    Args:
        text: Raw question string.
        tokenizer: Tokenizer *class* (not an instance); it is instantiated here
            and its ``tokenize`` method is applied to the cleaned text.

    Returns:
        The tokens found in ``model_w2v``'s vocabulary, joined by single spaces
        (an empty string when none is left).
    """
    # Raw string so the backslashes reach the regex engine unchanged; the
    # previous non-raw literal relied on invalid string escapes such as "\{".
    text = re.sub(r"[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>@\#$%&\\=\(\'\"]", "", text)

    tokens = tokenizer().tokenize(text)

    # Only vocabulary membership decides what is kept. The stop-word set that
    # used to be built here on every call was never used and has been removed.
    known_words = model_w2v.wv.vocab

    return " ".join(token for token in tokens if token in known_words).strip()

In [None]:
test_questions_pair = []  # (question1, question2) tuples of preprocessed text

# Walk the two columns in lockstep instead of DataFrame.iterrows(), which
# builds a Series for every row and is slow on a frame of this size.
for raw_q1, raw_q2 in zip(test["question1"], test["question2"]):
    q1 = text_preprocessing(raw_q1, tokenizer)
    q2 = text_preprocessing(raw_q2, tokenizer)

    # NOTE(review): pairs with an empty side are skipped, so this list can be
    # shorter than `test`; predictions made from it must be mapped back to
    # their rows before they are written into a per-row submission column.
    if q1 and q2:
        test_questions_pair.append((q1, q2))


print('Test Data Question Pairs: ', len(test_questions_pair))

Test Data Question Pairs:  2345511


In [None]:
# Register the words of the test pairs in the shared vocabulary.
# NOTE(review): any word added here gets an index beyond the rows of the embedding
# matrix that was sized from the vocabulary earlier — confirm no new words appear.
# NOTE: the loop variables (`data`, `question_pair`, `q1`, `q2`) stay bound at
# notebook scope after this cell has run.
for data in [test_questions_pair]:
    for question_pair in data: # (sent1, sent2)
        q1 = question_pair[0]
        q2 = question_pair[1]
        language.addSentence(q1)
        language.addSentence(q2)

In [None]:
class QuestionsDataset(Dataset):
    """
    Label-free dataset that returns the index-encoded question pair for a position.

    Args:
        questions_list: Sequence of ``(question1, question2)`` strings whose
            tokens are separated by whitespace.
        word2index: Mapping from token to integer index; every token of every
            question must be present (a missing token raises ``KeyError``).
    """

    def __init__(self, questions_list, word2index):
        self.questions_list = questions_list
        self.word2index = word2index

    def __len__(self):
        return len(self.questions_list)

    def __getitem__(self, index):
        # Both questions come from the pair stored at `index`. The previous
        # code read q2 from a notebook-level variable named `question_pair`,
        # so every sample received the same, unrelated second question.
        q1, q2 = self.questions_list[index][0], self.questions_list[index][1]

        q1_indices = [self.word2index[word] for word in q1.split()]
        q2_indices = [self.word2index[word] for word in q2.split()]

        # q1_indices and q2_indices are lists of vocabulary indices, one per token.
        return q1_indices, q2_indices
    
# Wrap the preprocessed test pairs and the vocabulary mapping in a Dataset (no labels).
test_dataset = QuestionsDataset(test_questions_pair, language.word2index)

In [None]:
# Recompute the vocabulary size after the test words were registered.
n_vocab = len(language.word2index)

In [None]:
# Display the vocabulary size.
n_vocab

134038

In [None]:
class CustomCollate:
    """
    Label-free collate callable that groups a batch for later padding and packing.

    Each batch element has the form ``(q1_indices, q2_indices)``; the result
    regroups them column-wise and adds the length of every question.
    """

    def custom_collate(self, batch):
        """Return ``(q1_list, q1_lengths, q2_list, q2_lengths)`` for ``batch``."""
        q1_list = [example[0] for example in batch]
        q2_list = [example[1] for example in batch]

        # Token counts per question, e.g. [3, 5, 8, ...]
        q1_lengths = list(map(len, q1_list))
        q2_lengths = list(map(len, q2_list))

        return q1_list, q1_lengths, q2_list, q2_lengths

    def __call__(self, batch):
        return self.custom_collate(batch)

In [None]:
dataset_size = len(test_dataset)

# DataLoader for batched inference. The test pairs are read strictly in dataset
# order (shuffle=False, no sampler) so that the i-th prediction belongs to the
# i-th entry of `test_dataset`; the previous SubsetRandomSampler shuffled them.
# The train/validation samplers that were rebuilt here were unused and are gone.
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False,
                                          collate_fn=CustomCollate())


In [None]:
# Build an embedding matrix of shape (n_vocab + 1, embed_dim) from the FastText
# vectors, placing each word's vector at the index the vocabulary assigned to it.
# 1. Rows start out random, so words FastText cannot provide keep a random vector.
# 2. Row 0 is all zeros: it is the embedding of the padding index used for batching.
# Requires `model_ft` to still be loaded in the kernel.
# NOTE(review): `model` was already constructed from an earlier `weights`; rebinding
# `weights` here does not change the embeddings of that model.
weights = torch.randn(n_vocab + 1, embed_dim)
weights[0] = torch.zeros(embed_dim)

# The FastText model has its own indexing, so vectors are looked up by word.
for word, lang_word_index in language.word2index.items():
    # Test membership on the KeyedVectors; `word in model_ft` is deprecated in gensim 3.x.
    if word in model_ft.wv:
        weights[lang_word_index] = torch.FloatTensor(model_ft.wv.get_vector(word))

  if sys.path[0] == '':


In [None]:
# Debug print: shows only the DataLoader object's repr, not its batches.
print(test_loader)

<torch.utils.data.dataloader.DataLoader object at 0x7fa06d08d150>


In [None]:
# Score the test pairs batch by batch. A DataLoader has to be iterated;
# unpacking it directly into four names raised the ValueError seen before.
model.train(False)
test_pred = []  # one 0.0/1.0 prediction per test pair, in loader order
with torch.no_grad():
    for q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths in test_loader:
        similarity_score = model(q1_batch, q1_batch_lengths, q2_batch, q2_batch_lengths)
        predictions = (similarity_score > threshold).float() * 1
        test_pred.extend(predictions.cpu().tolist())
    

ValueError: ignored

In [None]:
# Display the model's module structure.
model

In [None]:
# Build the submission from a copy so `test` is left untouched and the cell can
# be re-run (the previous version aliased `test` and dropped its columns in place).
# `test_pred` must hold exactly one prediction per row of `test`; pandas raises
# a ValueError otherwise.
submission = test.drop(columns=['question1', 'question2']).assign(is_duplicate=test_pred)
# index=False: write only the remaining columns, without the DataFrame row
# index as an extra unnamed first column.
submission.to_csv("first_submission", index=False)
submission.head()