<a href="https://colab.research.google.com/github/yixish/NLPLearning/blob/master/%E2%80%9CSeq2Seq(Attention).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as Data
import numpy as np

import random
import math
import time

Set the random seeds for reproducibility.

In [None]:
# One seed shared by every random number generator the notebook touches.
SEED = 1234

# Python's `random` drives the teacher-forcing coin flips in Seq2Seq.forward.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Colab-only: mount Google Drive so the dataset under /content/gdrive is readable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Show which GPU (if any) this runtime was assigned.
!/opt/bin/nvidia-smi

Thu Nov 19 04:19:21 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [None]:
import pandas as pd 
# NOTE(review): `dir` shadows the Python builtin of the same name. Later cells
# reuse it as the dataset root, so the name is left unchanged here.
dir = '/content/gdrive/My Drive/dataset/'
# Training split; the next cell reads its `text` and `selected_text` columns.
df = pd.read_csv(dir+"Sentiment_Extraction103/train.csv")

Take the first `num` training rows and pair each tweet (`text`) with its target span (`selected_text`).

In [None]:
# Keep at most `num` examples and pair each tweet with its target span.
num = 4096
texts = df['text'].values[:num]
selected_texts = df['selected_text'].values[:num]
# zip() stops at the shorter array, so a CSV with fewer than `num` rows no
# longer raises IndexError the way `for i in range(num)` did.
pairs = [[text, selected] for text, selected in zip(texts, selected_texts)]

In [None]:
# Fetch the "punkt" tokenizer data; nltk.word_tokenize is called by the cells below.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Start the vocabulary with the four special tokens used when encoding sequences.
vocab = {'<sos>', '<eos>', '<pad>', '<unk>'}

def build_vocab(vocab,texts):
    """Add every token found in `texts` to `vocab` (mutated in place).

    Each text is tokenized with nltk.word_tokenize. The token count of the
    longest text is printed; nothing is returned.
    """
    longest = 0
    for text in texts:
        tokens = nltk.word_tokenize(text)
        longest = max(longest, len(tokens))
        vocab.update(tokens)
    print(longest)
# Collect tokens from both the full tweets and the target spans; each call
# prints the longest token count it saw.
build_vocab(vocab,texts)
build_vocab(vocab,selected_texts)
print("vocal size : {}".format(len(vocab)))

52
40
vocal size : 10503


In [None]:
# Number the tokens in sorted order. Iterating a set of strings directly gives
# an order that can differ between interpreter runs (string hashing is
# randomized), which would silently change every token id after a kernel
# restart and make saved weights or logged ids meaningless.
idx2word = dict(enumerate(sorted(vocab)))
word2idx = {word: i for i, word in idx2word.items()}

In [None]:
n_step = 55  # padded length of every sequence built by make_data
batch_size = 128  # examples per DataLoader batch
n_hidden = 128  # NOTE(review): only referenced in comments further down — confirm it is still needed
emb_dim = 100  # NOTE(review): the model is built from ENC_EMB_DIM / DEC_EMB_DIM instead — confirm
vocab_size = len(vocab)  # source and target share this vocabulary

def make_data(seq_data):
    """Convert [source_text, target_text] pairs into padded index tensors.

    Args:
        seq_data: iterable of two-item sequences; item 0 is the encoder text
            and item 1 the decoder text. Both are tokenized with
            nltk.word_tokenize.

    Returns:
        Three LongTensors, each of shape [len(seq_data), n_step]:
            enc_input  : <sos> source tokens <eos> <pad>...
            dec_input  : <sos> target tokens <eos> <pad>...
            dec_output : target tokens <eos> <pad>...
        Tokens that are not in the vocabulary map to <unk>.
    """
    sos, eos = word2idx['<sos>'], word2idx['<eos>']
    pad, unk = word2idx['<pad>'], word2idx['<unk>']
    # Leave room for <sos> and <eos>; longer texts are clipped so every row
    # has exactly n_step entries (a ragged list makes torch.LongTensor fail).
    max_tokens = n_step - 2

    def word_2_idx(word):
        return word2idx.get(word, unk)

    def encode(text):
        tokens = nltk.word_tokenize(text)[:max_tokens]
        return [word_2_idx(token) for token in tokens]

    def pad_to_n_step(ids):
        return ids + [pad] * (n_step - len(ids))

    enc_input_all, dec_input_all, dec_output_all = [], [], []
    for seq in seq_data:
        src_ids = encode(seq[0])
        trg_ids = encode(seq[1])

        enc_input_all.append(pad_to_n_step([sos] + src_ids + [eos]))
        # The closing <eos> belongs to the decoder input. It used to be
        # appended to enc_input a second time, so the decoder input never
        # contained <eos>.
        dec_input_all.append(pad_to_n_step([sos] + trg_ids + [eos]))
        dec_output_all.append(pad_to_n_step(trg_ids + [eos]))

    # make tensor
    return torch.LongTensor(enc_input_all), torch.LongTensor(dec_input_all), torch.LongTensor(dec_output_all)

# Encode every (text, selected_text) pair once, up front.
enc_input_all, dec_input_all, dec_output_all = make_data(pairs)

In [None]:
class TranslateDataSet(Data.Dataset):
    """Dataset over three parallel collections of encoded sequences.

    Item `idx` is the tuple (encoder input, decoder input, decoder output)
    found at position `idx` of the three collections.
    """

    def __init__(self, enc_input_all, dec_input_all, dec_output_all):
        self.enc_input_all = enc_input_all
        self.dec_input_all = dec_input_all
        self.dec_output_all = dec_output_all

    def __len__(self):
        # The dataset size is the number of encoder inputs.
        return len(self.enc_input_all)

    def __getitem__(self, idx):
        columns = (self.enc_input_all, self.dec_input_all, self.dec_output_all)
        return tuple(column[idx] for column in columns)

# Shuffled mini-batches over the encoded pairs.
loader = Data.DataLoader(
    dataset=TranslateDataSet(enc_input_all, dec_input_all, dec_output_all),
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')

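Define the encoder: a bidirectional GRU whose final forward and backward hidden states are projected into the decoder's initial hidden state.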

In [None]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder.

    Embeds the source tokens, runs them through a single-layer bidirectional
    GRU, and derives the decoder's initial hidden state from the final
    forward and backward hidden states.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        # Forward and backward states are concatenated before the projection.
        self.fc = nn.Linear(2 * enc_hid_dim, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: token ids, [src_len, batch_size].

        Returns:
            enc_output: [src_len, batch_size, enc_hid_dim * 2], the GRU output
                at every time step.
            s: [batch_size, dec_hid_dim], the initial decoder hidden state.
        """
        # Embed batch-first, then switch back to time-first for the GRU.
        batch_first = src.transpose(0, 1)                     # [batch_size, src_len]
        embedded = self.dropout(self.embedding(batch_first))  # [batch_size, src_len, emb_dim]
        embedded = embedded.transpose(0, 1)                   # [src_len, batch_size, emb_dim]

        # No h_0 is passed, so the GRU starts from an all-zero hidden state.
        # enc_hidden = [n_layers * num_directions, batch_size, enc_hid_dim]
        enc_output, enc_hidden = self.rnn(embedded)

        # Row -2 is the final state of the forward direction and row -1 the
        # final state of the backward direction.
        last_forward, last_backward = enc_hidden[-2, :, :], enc_hidden[-1, :, :]
        joined = torch.cat((last_forward, last_backward), dim=1)

        # s = [batch_size, dec_hid_dim]
        s = torch.tanh(self.fc(joined))
        return enc_output, s

In [None]:
class Attention(nn.Module):
    """Additive attention that scores every encoder step against the decoder state."""

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        # Input is the decoder state concatenated with a bidirectional encoder output.
        self.attn = nn.Linear(dec_hid_dim + 2 * enc_hid_dim, dec_hid_dim, bias=False)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, s, enc_output):
        """Return attention weights over the source positions.

        Args:
            s: decoder hidden state, [batch_size, dec_hid_dim].
            enc_output: encoder outputs, [src_len, batch_size, enc_hid_dim * 2].

        Returns:
            Tensor [batch_size, src_len]; each row is a softmax distribution.
        """
        src_len = enc_output.shape[0]

        # Copy the decoder state once per source position so it can be
        # concatenated with every encoder output.
        tiled_state = s.unsqueeze(1).repeat(1, src_len, 1)  # [batch_size, src_len, dec_hid_dim]
        states = enc_output.transpose(0, 1)                 # [batch_size, src_len, enc_hid_dim * 2]

        # energy = [batch_size, src_len, dec_hid_dim]
        energy = torch.tanh(self.attn(torch.cat((tiled_state, states), dim=2)))

        # scores = [batch_size, src_len]
        scores = self.v(energy).squeeze(2)
        return F.softmax(scores, dim=1)

In [None]:
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs."""

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        context_dim = enc_hid_dim * 2  # width of one bidirectional encoder output
        self.rnn = nn.GRU(context_dim + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear(context_dim + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, dec_input, s, enc_output):
        """Run one decoding step.

        Args:
            dec_input: current input token ids, [batch_size].
            s: previous decoder hidden state, [batch_size, dec_hid_dim].
            enc_output: encoder outputs, [src_len, batch_size, enc_hid_dim * 2].

        Returns:
            pred: scores over the output vocabulary, [batch_size, output_dim].
            hidden: new decoder hidden state, [batch_size, dec_hid_dim].
        """
        token_column = dec_input.unsqueeze(1)  # [batch_size, 1]
        # embedded = [1, batch_size, emb_dim]
        embedded = self.dropout(self.embedding(token_column)).transpose(0, 1)

        # weights = [batch_size, 1, src_len]
        weights = self.attention(s, enc_output).unsqueeze(1)

        # Attention-weighted sum of the encoder outputs.
        # context = [1, batch_size, enc_hid_dim * 2]
        context = torch.bmm(weights, enc_output.transpose(0, 1)).transpose(0, 1)

        # rnn_input = [1, batch_size, (enc_hid_dim * 2) + emb_dim]
        rnn_input = torch.cat((embedded, context), dim=2)

        # dec_output = [1, batch_size, dec_hid_dim]
        # dec_hidden = [n_layers * num_directions, batch_size, dec_hid_dim]
        dec_output, dec_hidden = self.rnn(rnn_input, s.unsqueeze(0))

        # Drop the length-1 time axis and predict from the GRU output, the
        # context vector and the input embedding together.
        features = torch.cat(
            (dec_output.squeeze(0), context.squeeze(0), embedded.squeeze(0)),
            dim=1,
        )

        # pred = [batch_size, output_dim]
        pred = self.fc_out(features)
        return pred, dec_hidden.squeeze(0)

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Decode `trg_len - 1` steps for a batch.

        Args:
            src: source token ids, [batch_size, src_len].
            trg: target token ids, [batch_size, trg_len]; row 0 after the
                transpose (the first token of each sequence) is the first
                decoder input.
            teacher_forcing_ratio: probability, per step, of feeding the
                ground-truth token instead of the model's own prediction.

        Returns:
            Tensor [trg_len, batch_size, output_dim]. Index 0 is never
            written and stays all zeros; index t (t >= 1) holds the scores
            produced at decoding step t.
        """
        # Work time-first from here on.
        src = src.transpose(0, 1)
        trg = trg.transpose(0, 1)
        trg_len, batch_size = trg.shape[0], src.shape[1]

        # One slot of vocabulary scores per target position.
        outputs = torch.zeros(trg_len, batch_size, self.decoder.output_dim).to(self.device)

        # enc_output: every encoder state; s: the initial decoder hidden state.
        enc_output, s = self.encoder(src)

        dec_input = trg[0, :]
        for t in range(1, trg_len):
            step_scores, s = self.decoder(dec_input, s, enc_output)
            outputs[t] = step_scores

            # One coin flip per step decides where the next input comes from.
            use_teacher = random.random() < teacher_forcing_ratio
            greedy_token = step_scores.argmax(1)
            dec_input = trg[t] if use_teacher else greedy_token

        return outputs

In [None]:
# INPUT_DIM = len(SRC.vocab)
# OUTPUT_DIM = len(TRG.vocab)
# Source and target share one vocabulary, so both dimensions are vocab_size.
INPUT_DIM = vocab_size
OUTPUT_DIM  = vocab_size
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
ENC_HID_DIM = 256
DEC_HID_DIM = 256
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# The attention module is handed to the decoder, which calls it at every step.
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)
# Default CrossEntropyLoss: no ignore_index is set, so <pad> targets are scored
# like any other token.
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Make sure dropout is active even if an earlier cell left the model in eval mode.
model.train()

for epoch in range(600):
    epoch_cost = 0.0
    for enc_input_batch, dec_input_batch, dec_output_batch in loader:
        # enc_input_batch, dec_input_batch, dec_output_batch : [batch_size, seq_len]
        enc_input_batch = enc_input_batch.to(device)
        dec_input_batch = dec_input_batch.to(device)
        dec_output_batch = dec_output_batch.to(device)

        # model output : [seq_len, batch_size, n_class] -> [batch_size, seq_len, n_class]
        pred = model(enc_input_batch, dec_input_batch, 0.5).transpose(0, 1)

        # Seq2Seq.forward never writes step 0 (it stays all zeros), and step t
        # is the prediction made after feeding dec_input[t - 1]. So steps 1..
        # line up with dec_output[0 .. seq_len - 2]. Scoring step 0 against
        # <sos>, as before, only added a constant to the reported cost.
        step_scores = pred[:, 1:].reshape(-1, pred.shape[-1])
        step_targets = dec_output_batch[:, :-1].reshape(-1)

        # criterion averages over every token; multiplying by the number of
        # examples keeps the previous "sum of per-example means" scale.
        loss = criterion(step_scores, step_targets) * pred.shape[0]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_cost += loss.item()

    # Report once per logged epoch (mean cost per batch) instead of once per batch.
    if (epoch + 1) % 20 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(epoch_cost / len(loader)))

Epoch: 0020 cost = 99.468361
Epoch: 0020 cost = 120.809807
Epoch: 0020 cost = 131.646545
Epoch: 0020 cost = 126.819580
Epoch: 0020 cost = 99.441719
Epoch: 0020 cost = 106.268974
Epoch: 0020 cost = 112.493080
Epoch: 0020 cost = 103.276535
Epoch: 0020 cost = 107.548080
Epoch: 0020 cost = 126.513344
Epoch: 0020 cost = 106.313904
Epoch: 0020 cost = 106.663177
Epoch: 0020 cost = 120.643242
Epoch: 0020 cost = 111.529442
Epoch: 0020 cost = 122.860901
Epoch: 0020 cost = 102.154327
Epoch: 0020 cost = 111.610115
Epoch: 0020 cost = 97.127991
Epoch: 0020 cost = 114.804085
Epoch: 0020 cost = 110.764473
Epoch: 0020 cost = 101.264679
Epoch: 0020 cost = 139.141113
Epoch: 0020 cost = 103.057610
Epoch: 0020 cost = 115.133331
Epoch: 0020 cost = 128.842957
Epoch: 0020 cost = 105.130196
Epoch: 0020 cost = 117.187737
Epoch: 0020 cost = 125.735260
Epoch: 0020 cost = 111.985741
Epoch: 0020 cost = 117.433884
Epoch: 0020 cost = 114.806725
Epoch: 0020 cost = 130.278015
Epoch: 0040 cost = 64.025581
Epoch: 0040 co

In [None]:
# Test
def translate(word):
    """Greedy-decode the model's output for one input text.

    Args:
        word: raw input text; it is tokenized and encoded by make_data.

    Returns:
        The predicted tokens joined by spaces, cut at the first '<pad>' and
        with any '<eos>' marker removed. The decoded token list is also
        printed.
    """
    # The target side is empty: with teacher_forcing_ratio = 0 only its first
    # token is ever fed to the decoder.
    enc_input, dec_input, _ = make_data([[word, '']])
    enc_input, dec_input = enc_input.to(device), dec_input.to(device)

    # Inference must run without dropout and without building an autograd
    # graph; the previous train/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(enc_input, dec_input, 0)
    finally:
        model.train(was_training)

    # output : [n_step, batch_size(=1), n_class]
    # Step 0 is never written by Seq2Seq.forward and is all zeros, so its
    # argmax is always class 0; that produced a spurious first token. Decode
    # from step 1 onwards.
    predict = output[1:].argmax(2).squeeze(1)
    decoded = [idx2word[i] for i in predict.tolist()]
    print(decoded)

    if '<pad>' in decoded:
        decoded = decoded[:decoded.index('<pad>')]
    return ' '.join(decoded).replace('<eos>', '')

In [None]:
# Spot-check on a few training pairs: prints "<reference span> => <model output>".
for sent in pairs[:5]:
    print("{} => {}".format(sent[1], translate(sent[0])))

['can`t', 'love', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
love => can`t love
['can`t', 'Starbucks', 'I`m', 'lovin`', 'it', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Starbucks I`m lovin` it => can`t Starbucks I`m lovin` it
['can`t', '.yummmmy', '!', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad

In [None]:
# Test split; the code below maps over its `text` column and then drops
# `text` and `sentiment`. `words_voc` (next) is the hand-listed keyword
# vocabulary consulted by translate_.
test_df = pd.read_csv(dir+"Sentiment_Extraction103/test.csv")
words_voc = ['helped', 'sry',  'scratchy', 'pray', 'fighting', 'aw', 'crazy', 'Won', 'perfect', 'couldnt', 'expected', 'Clever', 'irked', 'excellent', 'luvin', 'losing', 'Gutted', 'busy', 'failing', 'gd', 'bothered', 'attacked', 'rofl', 'scaring', 'shut', 'Good', 'friendly', 'respect', 'nab', 'sux', 'ashamed', 'HORRIBLE', 'Luckily', 'relief', 'stepped', 'hahahaha', 'RIP', 'hell', 'Scared', 'popular', 'gut', 'congrats', 'repair', 'painting', 'freed', 'sadifying', 'barely', 'morning', 'weLcomE', 'Happeh', 'Byebye', 'unfair', 'yaay', 'Aww', 'Addicted', 'accident', 'terrible', 'baffles', 'cramps', 'Also', 'laughs', 'boring', 'yeah', 'deserve', 'fault', 'honored', 'COOL', 'forgets', 'crying', 'welcome', 'amazing', 'funny', 'ave', 'sarcy', 'Hope', 'Awe', 'barked', 'Awsome', 'Burnt', 'brutal', 'hyper', 'healthy', 'bruise', 'nicest', 'harder', 'hard', 'rubbish', 'mourning', 'Friends', 'scary', 'ily', 'heartless', 'rocks', 'Stunning', 'Finally', 'boak', 'painful', 'Sor', 'Sunburned', 'Yummm', 'cooler', 'effective', 'precise', 'chocked', 'banging', 'Chilliin', 'mope', 'Crappy', 'help', 'ouch', 'believe', 'Sorr', 'refuses', 'sukked', 'wrong', 'corrupted', 'pain', 'toothache', 'Neat', 'pissing', 'buried', 'inflating', 'grimmy', 'puke', 'ruined', 'fantastic', 'impostor', 'grateful', 'Awwww', 'problems', 'lovingly', 'panic', 'FORCED', 'dreary', 'lov', 'talented', 'yey', 'wo', 'strong', 'blast', 'poorly', 'Healthy', 'hacked', 'dying', 'SUCK', 'Overheat', 'BAD', 'Liked', 'precious', 'killing', 'heyy', 'thanks', 'strep', 'proudly', 'destroy', 'Yesssssir', 'missin', 'unrelated', 'Lol', 'Wtf', 'sFunF', 'welts', 'Sickkkk', 'lonley', 'confsuing', 'tough', 'low', 'fight', 'Boreedd', 'Bore', 'night', 'hit', 'Honestly', 'wonderful', 'dream', 'Yum', 'hating', 'ow', 'wrecked', 'ENJOY', 'delcious', 'Pinkberry', 'gloomy', 'starbuck', 'SCOREless', 'ultimate', 'ughhhh', 'Thanx', 'awwww', 'WELCOME', 'loosing', 'Morning', 'Enjoyed', 'Thanks', 'Awesome', 'lacking', 'Welcome', 'bask', 'GREAT', 
'FUN', 'Scarred', 'Congrats', 'Yeahhhhh', 'Beautiful', 'useless', 'Cutie', 'dang', 'limp', 'failed', 'stop', 'shocked', 'snob', 'heavy', 'YUCK', 'funn', 'Hate', 'troubles', 'Boring', 'partying', 'tyvm', 'thaank', 'fave', 'glorious', 'wonder', 'HEADACHE', 'cried', 'dieing', 'Thanxxx', 'salute', 'hat', 'trouble', 'headaches', 'TY', 'bloody', 'ahmazing', 'bugs', 'sorry', 'Needless', 'enjoys', 'Uggh', 'happy', 'Poor', 'Hooray', 'ugly', 'braggin', 'yaaaaaay', 'ilove', 'celebrate', 'sweeet', 'thx', 'no', 'aww', 'well', 'UPSET', 'omg', 'horrific', 'break', 'yummy', 'warm', 'Stupid', 'annoyed-y', 'disliking', 'uprooted', 'freakin', 'best', 'play', 'sick', 'positive', 'GOOD', 'Blushing', 'impressed', 'Hard', 'fresh', 'sexy', 'acing', 'exhausted', 'annoying', 'food', 'nasty', 'sicker', 'Hahahaha', 'thinking', 'boo', 'rotten', 'Harder', 'Ugh', 'lucky', '-hugs', 'Horrible', 'Dead', 'funniest', 'save', 'boohoohoo', 'fancy', 'fav', 'Bummer', 'Ouch', 'fail', 'headach', 'sosad', 'top', 'faults', 'Loves', 'goood', 'fear', 'boffert', 'sucks', 'easy', 'negative', 'kiss', 'hoping', 'pleeeease', 'injury', 'bettering', 'honest', 'Thanxx', 'helping', 'jealous-', 'silly', 'same', 'thank/', 'shattered', 'bestest', 'wiff', 'likes', 'FAIL', 'Roasting', 'screams', 'cutest', 'disgusted', 'Blessed', 'stressful', 'Awww', 'AWESOME', 'loving', 'inlove', 'Missed', 'separates', 'Yuuum', 'toughest', 'cancelled', 'thanx', 'bummer', 'fool', 'lammmeeee', 'pooooor', 'blunt', 'worse', 'suck', 'wow', 'Luv', 'Sorry', 'cold', 'dead', 'Refreshed', 'denied', 'disagree', 'Hugs', 'Sweet', 'yum', 'thnks', 'burned', 'glamorous', 'survive', 'buggin', 'Link', 'annoyed', 'difficult', 'hurting', 'exciting', 'cheer', 'thankful', 'delayed', 'cheered', 'argue', 'bother', 'laughing', 'AWFUL', 'please', 'nothing', 'pisses', 'Kudos', 'cruel', 'amazes', 'heat', 'homesick', 'illogical', 'though', 'urgh', 'happpy', 'blew', 'piss', 'oops', 'perfectly', 'Use', 'bitbetter', 'defective', 'lovin', 'energy', 'OMFG', '_violence', 
'sickness', 'dismal', 'down', 'recommend', 'working', 'bleeds', 'warmly', 'spilled', 'glad', 'sore', 'LOVED', 'Hurray', 'yep', 'amaze', 'ignoring', 'Wishes', 'beasted', 'cries', 'Favorite', 'headache', 'sunburnt', 'fcking', 'yay', 'honor', 'cool', 'hopefuly', 'wish', 'HATE', 'ANGRY', 'fml', 'pricey', 'weirdos', 'accidents', 'miserable', 'dammit', 'blessed', 'slow', 'Thankyou', 'Annoyed', 'HAPP', 'excited', 'lose', 'homework', 'thankies', 'clean', 'comfy', 'suffering', 'credit', 'luvd', 'surprised', 'Foolish', 'pity', 'reunited', 'Goood', 'gooood', 'abuse', 'hungry', 'better', 'Liking', 'Gratiss', 'thanxx', 'awesomest', 'Pissed', 'booted', 'bricked', 'gutted', 'cry', 'sober', 'DANG', 'yayay', 'killed', 'douchebag', 'agitated', 'spammer', 'Muses', 'pleasure-', 'unable', 'bad', 'killen', 'goodness', 'spamming', 'slacking', 'stressed', 'willing', 'Loving', 'DIE', 'fabulous', 'Enjoy', 'burning', 'ing', 'tricked', 'sunny', 'shitt', 'Haha', 'writing', 'loose', 'effed', 'poked', 'robbed', 'coolest', 'Cried', 'destroys', 'confused', 'strikes', 'lovely', 'misses', 'loss', 'shitttt', 'win', 'stuffed', 'excuse', 'fans', 'boredom', 'sadly', 'priceless', 'hater', 'funnnn', 'spammers', 'lazy', 'angry', 'Stressed', 'Worried', 'goo', 'sense', 'good', 'feel', 'Upset', 'Rejected', 'ROCKED', 'filthy', 'Sucks', 'worry', 'waste', 'ugh', 'die', 'ill', 'Mad', 'afraid', 'Cute', 'soooory', 'NICE', 'sucked', 'careless', 'Awaiting', 'avoid', 'underpaid', 'Happy', 'true', 'LOVES', 'dragged', 'jealous', 'Hitting', 'Glad', 'Wishing', 'worth', 'nooooooo', 'hate', 'Gorgeous', 'distorted', 'hurtin', 'anymore', 'awww', 'shucks', 'laavly', 'soft', 'saddest', 'haha', 'busted', 'restful', 'recharge', 'bahaha', 'Goodnit', 'awwwww', 'smack', 'nervous', 'the', 'dread', 'Gudluck', 'suckss', 'hahah', 'retarded', 'stuck', 'relaxing', 'Miss', 'dancing', 'lie', 'rough', 'cheers', 'grand', 'unhappy', 'spoiled', 'stopped', 'stronger', 'Loved', 'loves', 'proud', 'endure', 'chilly', 'doomed', 'endearing', 'loll', 
'Sad', 'sadd', 'forgot', 'wtf', 'sinking', 'attacking', 'Rejecting', 'breaks', 'friends', 'pleasure', 'Bliss', 'crashed', 'hapee', 'hehe', 'lies', 'touche', 'death', 'blatently', 'Forgive', 'hopefully', 'Freckles', 'WISH', 'laughed', 'kill', 'unny', 'blocked', 'Prayin', 'sowy', 'missing', 'Creased', 'lonesome', 'Chilling', 'Wish', 'decrease', 'ace', 'smiles', 'collapses', 'commands', 'beating', 'hurt', 'evil', 'Looking', 'yes', 'Yayy', 'BAH', 'messed', 'success', 'Trouble', 'safe', 'Yay', 'bruised', 'ADORE', 'misplaced', 'GOODNIGHT', 'OMGSH', 'hurts', 'sorted', 'upset', 'cramping', 'nicer', 'borin', 'adorable', 'brave', 'Thx', 'grrrrrrr', 'Funeral', 'darn', 'crush', 'Killed', 'infection', 'miss', 'awfully', 'nerd', 'ruin', 'liked', '-sorry', 'like', 'Lameness', 'Yayyyyyyy', 'saddens', 'lonely', 'starving', 'support', 'steal', 'addicted', 'woops', 'scariest', 'Tired', 'love', 'promise', 'lol', 'unlucky', 'disease', 'deceiving', 'victims', 'mayyyybe', 'Itchy', 'argh', 'twisted', 'goodluck', 'guilty', 'crisps', 'bullied', 'anxiety', 'witty', 'specials', 'awesome', 'missed', 'gmail', 'THANKS', 'slower', 'obsessed', 'beauty', 'Hopefully', 'develop', 'drained', 'FML', 'poo', 'Hey', 'texting', 'Missing', 'goooooood', 'HELP', 'eww', 'Hangover', 'Cleaning', 'bestie', 'error', 'favorite', 'happiest', 'broke', 'dangerous', 'Relaxing', 'Alas', 'storming', 'envy', 'trending', 'handy', 'easier', 'Best', 'stupid', '_Uh_Knee', 'HAHAHA', 'mean', 'Hahaha', 'Lovely', 'SORRY', 'bummed', 'Bleh', 'sucking', 'worst', 'beautiful', 'Chillin', 'nsty', 'real', 'nicley', 'Fab', 'quashed', 'doable', 'lunch', 'chill', 'weird', 'Congrat', 'trashed', 'cant', 'baddd', 'dam', 'Defeated', 'scared', 'LOVE', 'rejected', 'sweet', 'imo', 'believes', 'idiot', 'charged', 'nan', 'Cool', 'mad', 'handsome', 'crappy', 'picnic', 'Honored', 'stolen', 'Hilarious', 'chillin', 'Bad', 'Hurt', 'luv', 'indeed', 'thank', 'Urgh', 'cancel', 'wishes', 'Goodnight', 'looooove', 'flu', 'pleased', 'fo', 'damned', 'bulky', 
'depressed', 'enjoy', 'rain', 'but', 'hates', 'Superman', 'right', 'Perfect', 'awful', 'Boredom', 'sukks', 'Let-Down', 'banged', 'depress', 'hitting', 'Delayed', 'sunburn', 'Happ', 'Suffering', 'WORST', 'lack', 'ScREW', 'damp', 'won', 'great', 'Thank', 'care', 'Hurrah', 'escaped', 'Hoping', 'Fabulous', 'rock', 'Tattered', 'fmlllll', 'Mourning', 'disproves', 'WTF', 'tired', 'offered', 'pinched', 'Fun', 'Respect', 'disturbed', 'gift', 'Great', 'Yayyy', 'Howdyyy', 'hopin', 'Regrettin', 'scarce', 'never', 'Unlucky', 'smile', 'rocked', 'shame', 'caught', 'MISS', 'greatest', 'worried', 'badly', 'arghhhh', 'fine', 'joys', 'greater', 'Rocks', 'concur', 'Bored', 'laugh', 'stole', 'sad', 'bored', 'ache', 'Oops', 'Hurts', 'neat', 'goodbye', 'Hates', 'mistake', 'spooky', 'passed', 'Sorrry', 'fun', 'nauseous', 'SUUUKS', 'bless', 'hiccups', 'regret', 'downside', 'Excited', 'hope', 'fab', 'thnx', 'misse', 'sprawled', 'Sadly', 'painfully', 'grouchy', 'hardly', 'Yessir', 'burnt', 'brilliant', 'thankyou', 'Anytime', 'winner', 'sadder', 'aches', 'YAY', 'joke', 'hateeee', 'niceee', 'Excellent', 'lame', 'ughhh', 'special', 'cute', 'promises', 'adoarble', 'CONGRATS', 'haaaate', 'iloveyou', 'goodgirl', 'Worse', 'dirty', 'loner', 'screwing', 'PRIDE', 'whoops', 'allergic', 'hopes', 'hahaha', 'coolio', 'ignore', 'standard', 'Pretty', 'Like', 'jummy', 'SIGH', 'broken', 'Wonderful', 'LOVING', 'fond', 'Love', 'lost', 'Nope', 'poor', 'canceled', 'favourite', 'patient', 'Thanxs', '_it_good', 'SUCKS', 'goodb', 'smeared', 'coooolest', 'pathetic', 'nicely', 'upgraded', 'Sick', 'uuuugh', 'May', 'Exhausted', 'weak', 'smart', 'idiots', 'doubtful', 'sadness', 'damm', '/agrees', 'awsome', 'nice', 'mess', 'forget', 'Yeah', 'ion', 'wishing', 'crooning', 'loved', 'liking', 'gorgeous', 'honour', 'Safe', 'enjoyed', 'EXCELLENT', 'snappy', 'smelly', 'freaked', 'late', 'Dang', 'oww', 'Lmfao', 'Nice', 'dropped', 'ME', 'Amazing', 'delicious', 'Clean', 'Sadness', 'EVIL', 'died', 'leaving', 'humble', 'pumped', 
'illness', 'goodnight', 'pretty', 'problem', 'heartburn', 'super', 'ditch', 'Freaking', 'nonsense', 'yummmm', 'boooooo', 'Dumb', 'besties', 'dizzy', 'Goooood', 'AMAZING', 'anxious', 'Enjoying', 'squirted', 'SAD', 'COOLEST', 'enjoying', 'horrible', 'kicked', 'How', 'Fantastic', 'scare', 'expensive', 'Stuck', 'HAPPY', 'Problem', 'foad']

def translate_(text):
    """Keyword fallback for picking a span from a tweet.

    Texts of at most 10 space-separated pieces are returned unchanged. For
    longer texts, the first entry of `words_voc` (in list order) that appears
    as one of the pieces is returned; if none appears, the text is returned
    unchanged.
    """
    pieces = text.split(' ')
    if len(pieces) > 10:
        # Split once and use a set: the original re-split the text for every
        # vocabulary word. Which word is returned does not change.
        present = set(pieces)
        for word in words_voc:
            if word in present:
                return word
    return text

    # target = translate(text)
    # return target
    # words = nltk.word_tokenize(text)
    # if target in words:
    #     if target in words_voc:
    #         return target
    #     index = words.index(target)
    #     words = words[index:]
    #     return " ".join(words)
    # else:
    #     return text
# Predict a span for every test tweet, then keep only the remaining columns.
test_df['res'] = test_df['text'].map(translate_)
test_df = test_df.drop(columns=['text', 'sentiment'])
# test_df = test_df.drop(columns = ['sentiment'])

In [None]:
# Preview the rows that the next cell writes to disk.
test_df.head(20)

Unnamed: 0,id,res
0,0,I just saw a shooting star... I made my wish
1,1,best
2,2,exciting
3,3,i`ve been eating cheetos all morning..
4,4,thanks
5,5,CONGRATS on graduating college!
6,6,"loved, but hated driving in pollution"
7,7,In weho! They`re are playing a lot of brit
8,8,_NJ Oh! I`m only in my 7 I just joined Twitt...
9,9,wish


In [None]:
# Write the predictions next to the dataset.
# NOTE(review): header=None / index=None are presumably treated like False
# (no header row, no index column); the documented values are booleans — confirm.
test_df.to_csv(dir+'1117.csv',header=None,index=None)

In [None]:
# Debug lookup: show which token currently has id 243 in idx2word.
print(idx2word[243])

<eos>
