In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from evaluate import *

In [3]:
import pandas as pd
import numpy as np
import time
import spacy
import random
from pathlib import Path
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data 
import torchtext
import csv
from itertools import islice
import nltk
nltk.download('punkt')
from nltk.tokenize import wordpunct_tokenize
import regex as re
import pickle
from collections import Counter
from tqdm import tqdm
import itertools
import torch.utils.data as data
from sklearn.model_selection import train_test_split
import math
from datetime import datetime

# Setting device on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

# Set the random seed manually for reproducibility.
torch.manual_seed(1234)

path = "drive/My Drive/FiQA/"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Using device: cuda

Tesla P100-PCIE-16GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [0]:
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))
    
def pre_process(doc):
    doc = str(doc)
    x = re.sub('[…“”%!&"@#()\-\*\+,/:;<=>?@[\]\^_`{\}~]', ' ', doc)
    y = re.sub('[\.\']', "", x)
    z = y.lower()
    return z

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_pickle(path, data):
    with open(path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def pad_seq(seq, max_seq_len):
    # Pad each seq to be the same length to process in batch.
    # pad_token = 0
    if len(seq) >= max_seq_len:
        seq = seq[:max_seq_len]
    else:
        seq += [0]*(max_seq_len - len(seq))
    return seq

def vectorize(seq, vocab, max_seq_len):
    # Map tokens in seq to idx
    seq_idx = [vocab[token] for token in seq]
    # Pad seq idx
    padded_seq_idx = [pad_seq(seq_idx, max_seq_len)]
    # padded_seq_idx = pad_seq(seq_idx, max_seq_len)

    # return torch.tensor(padded_seq_idx)
    return padded_seq_idx

In [0]:
# qid_docid = pd.read_csv(path + "FiQA_train_question_doc_final.tsv", sep="\t")
# qid_docid = qid_docid [['qid', 'docid']]

**Load pickle files**

In [0]:
# dict mapping of token to idx
vocab = load_pickle(path + 'vocab_full.pickle')
# dict mapping of docid to doc text
docid_to_text = load_pickle(path + 'label_ans.pickle')

# dict mapping of qid to question text
qid_to_text = load_pickle(path + 'qid_text.pickle')

train_qid_rel = load_pickle(path + "qid_rel_train.pickle")
test_qid_rel = load_pickle(path + "qid_rel_test.pickle")
valid_qid_rel = load_pickle(path + "qid_rel_valid.pickle")

train_set = load_pickle(path + 'data/data_train_50.pickle')
valid_set = load_pickle(path + 'data/data_valid_50.pickle')

# train_set = load_pickle(path + 'data/data_train_100.pickle')
# valid_set = load_pickle(path + 'data/data_valid_100.pickle')

# train_set = load_pickle(path + 'data/data_train_200.pickle')
# valid_set = load_pickle(path + 'data/data_valid_200.pickle')

test_set = load_pickle(path + 'data/data_test_500_rel.pickle')
test_set_full = load_pickle(path + 'data/data_test_500.pickle')

empty_docs = load_pickle(path+'empty_docs.pickle')

In [7]:
train_set = [x for x in train_set if x[1] not in empty_docs]
valid_set = [x for x in valid_set if x[1] not in empty_docs]

def remove_empty(test_set):
    for index, row in enumerate(test_set):
        for doc in row[1]:
            if doc in empty_docs:
                del test_set[index]
    return test_set

test_set = remove_empty(test_set)
test_set_full = remove_empty(test_set_full)

print("Number of training samples: {}".format(len(train_set)))
print("Number of validation samples: {}".format(len(valid_set)))
print("Number of test samples: {}".format(len(test_set)))

Number of training samples: 283707
Number of validation samples: 31582
Number of test samples: 330


**Model**

In [0]:
emb_dim = 100
vocab_size = len(vocab)
n_epochs = 20
batch_size = 64
hidden_size = 256
max_seq_len = 200
dropout = 0.2

In [9]:
emb = torchtext.vocab.GloVe("6B", dim=emb_dim)
# dictionary mapping of word idx to glove vectors
emb_weights = np.zeros((vocab_size, emb_dim))
words_found = 0
print("Embedding dim: {}".format(emb_weights.shape))

for token, idx in vocab.items():
    # emb.stoi is a dict of token to idx mapping
    if token in emb.stoi:
        emb_weights[idx] = emb[token]
        words_found += 1

print("vocab size: ", vocab_size)
print(words_found, " words are found in GloVe")

# Convert numpy matrix to tensor
emb_weights = torch.from_numpy(emb_weights).float()

emb_weights.shape

Embedding dim: (85034, 100)
vocab size:  85034
50456  words are found in GloVe


torch.Size([85034, 100])

In [0]:
def create_emb_layer(emb_weights):
    vocab_size, emb_dim = emb_weights.shape
    emb_layer = nn.Embedding(vocab_size, emb_dim)
    emb_layer.load_state_dict({'weight': emb_weights})

    return emb_layer

def loss_fn(pos_sim, neg_sim):
    margin = 0.2

    loss = margin - pos_sim + neg_sim
    if loss.data[0] < 0:
        loss.data[0] = 0
    return loss

In [0]:
class QA_LSTM(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, dropout):

        super(QA_LSTM, self).__init__()

        # Shape - (max_seq_len, emb_dim)
        self.embedding = create_emb_layer(emb_weights)

        self.shared_lstm = nn.LSTM(emb_size, hidden_size, num_layers=1, batch_first=True, bidirectional=True)
        self.cos = nn.CosineSimilarity(dim=1)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, q, a):
        # embedding
        q = self.embedding(q) # (bs, L, E)
        a = self.embedding(a) # (bs, L, E)

        # LSTM
        q, (hidden, cell) = self.shared_lstm(q) # (bs, L, 2H)
        a, (hidden, cell) = self.shared_lstm(a) # (bs, L, 2H)

        # Output shape (batch size, seq_len, num_direction * hidden_size)
        # There are n of word level biLSTM representations for the seq where n is the number of seq len
        # Use max pooling to generate the best representation
        q = torch.max(q, 1)[0] 
        a = torch.max(a, 1)[0] # (bs, 2H)

        q = self.dropout(q)
        a = self.dropout(a)

        return self.cos(q, a) # (bs,)

In [0]:
import torch
from torch.utils import data

class Dataset(data.Dataset):
  'Characterizes a dataset for PyTorch'
  def __init__(self, q_lst, pos_ans, neg_ans):
        'Initialization'
        self.q_lst = q_lst
        self.pos_ans_lst = pos_ans
        self.neg_ans_lst = neg_ans

  def __len__(self):
        'Denotes the total number of samples'
        return len(self.q_lst)

  def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        ID = self.q_lst[index]

        # Load data and get label
        q = self.q_lst[index]
        pos_ans = self.pos_ans_lst[index]
        neg_ans = self.neg_ans_lst[index]

        return q, pos_ans, neg_ans

In [0]:
def train(model, train_set, optimizer, batch_size):

    # Cumulated Training loss
    training_loss = 0.0

    q_lst = []
    pos_lst = []
    neg_lst = []

    # Set model to training mode
    model.train()
 
    for i, seq in enumerate(train_set):

        ques, pos_ans, neg_ans = seq[0], seq[1], seq[2]

        q_text = qid_to_text[ques]
        q_vec = vectorize(q_text, vocab, max_seq_len)

        q_lst.append(q_vec)

        pos_ans_text = docid_to_text[pos_ans]
        pos_ans_vec = vectorize(pos_ans_text, vocab, max_seq_len)

        pos_lst.append(pos_ans_vec)

        neg_ans_text = docid_to_text[neg_ans]
        neg_ans_vec = vectorize(neg_ans_text, vocab, max_seq_len)
        neg_lst.append(neg_ans_vec)

    q_lst = torch.tensor(q_lst)
    pos_lst = torch.tensor(pos_lst)
    neg_lst = torch.tensor(neg_lst)

    train_data = Dataset(q_lst, pos_lst, neg_lst)

    train_loader = data.DataLoader(train_data, batch_size=batch_size)

    for ques, pos_ans, neg_ans in tqdm(train_loader):
        # 1. Zero the gradients
        optimizer.zero_grad()

        for q in ques:
            batch_q = q.to(device)

        for p in pos_ans:
            batch_pos = p.to(device)

        for n in neg_ans:
            batch_neg = n.to(device)
            
        # 2. Compute predictions
        pos_sim = model(batch_q, batch_pos)    
        neg_sim = model(batch_q, batch_neg)

        # 3. Compute loss
        loss = loss_fn(pos_sim, neg_sim)

        # 4. Use loss to compute gradients
        loss.backward()

        # 5. Use optimizer to take gradient step
        optimizer.step()
            
        training_loss += loss.item()
            
    return training_loss / len(train_loader)

In [0]:
def validate(model, valid_set, batch_size):

    # Cumulated Training loss
    valid_loss = 0.0

    q_lst = []
    pos_lst = []
    neg_lst = []

    # Set model to evaluation mode
    model.eval()
 
    for i, seq in enumerate(valid_set):

        ques, pos_ans, neg_ans = seq[0], seq[1], seq[2]

        q_text = qid_to_text[ques]
        q_vec = vectorize(q_text, vocab, max_seq_len)

        q_lst.append(q_vec)

        pos_ans_text = docid_to_text[pos_ans]
        pos_ans_vec = vectorize(pos_ans_text, vocab, max_seq_len)

        pos_lst.append(pos_ans_vec)

        neg_ans_text = docid_to_text[neg_ans]
        neg_ans_vec = vectorize(neg_ans_text, vocab, max_seq_len)

        neg_lst.append(neg_ans_vec)

    q_lst = torch.tensor(q_lst)
    pos_lst = torch.tensor(pos_lst)
    neg_lst = torch.tensor(neg_lst)

    valid_data = Dataset(q_lst, pos_lst, neg_lst)

    valid_loader = data.DataLoader(valid_data, batch_size=batch_size)
        
    # Don't calculate the gradients
    with torch.no_grad():

        for ques, pos_ans, neg_ans in tqdm(valid_loader):

            for q in ques:
                batch_q = q.to(device)

            for p in pos_ans:
                batch_pos = p.to(device)

            for n in neg_ans:
                batch_neg = n.to(device)
                
            pos_sim = model(batch_q, batch_pos)    
            neg_sim = model(batch_q, batch_neg)

            loss = loss_fn(pos_sim, neg_sim)
                
            valid_loss += loss.item()
                
        return valid_loss / len(valid_loader)

In [0]:
# model = QA_LSTM(vocab_size, emb_dim, hidden_size, dropout)
# model = model.to(device)

# seq = train_set[0]
# ques, pos_ans, neg_ans = seq[0], seq[1], seq[2]

# q_text = qid_to_text[ques]
# q_vec = torch.tensor(vectorize(q_text, vocab, max_seq_len)).to(device)

# pos_ans_text = docid_to_text[pos_ans]
# pos_ans_vec = torch.tensor(vectorize(pos_ans_text, vocab, max_seq_len)).to(device)

# neg_ans_text = docid_to_text[neg_ans]
# neg_ans_vec = torch.tensor(vectorize(neg_ans_text, vocab, max_seq_len)).to(device)

# pos_sim = model(q_vec, pos_ans_vec)
# neg_sim = model(q_vec, neg_ans_vec)

# loss_fn(pos_sim, neg_sim)

In [0]:
model = QA_LSTM(vocab_size, emb_dim, hidden_size, dropout)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Lowest validation lost
# best_valid_loss = float('inf')

# for epoch in range(n_epochs):

#     # Evaluate training loss
#     train_loss = train(model, train_set, optimizer, batch_size)
#     # Evaluate validation loss
#     valid_loss = validate(model, valid_set, batch_size)
    
#     # At each epoch, if the validation loss is the best
#     # if valid_loss < best_valid_loss:
#     #     best_valid_loss = valid_loss
#     #     now = datetime.now()
#     #     current_time = now.strftime("%Y%m%d_%H%M%S")
#         # Save the parameters of the model
#     torch.save(model.state_dict(), str(epoch)+'_model-eu.pt')

#     # torch.save(model.state_dict(),'model-0.pt')

#     print("\n\n Epoch {}:".format(epoch+1))
#     print("\t Train Loss: {}".format(round(train_loss, 3)))
#     print("\t Validation Loss: {}\n".format(round(valid_loss, 3)))

In [0]:
def eval(model, test_set, qid_rel, max_seq_len, k):
    
    qid_pred_rank = {}

    model.eval()

    for i, seq in enumerate(tqdm(test_set)):

        ques, pos_ans, cands = seq[0], seq[1], seq[2]

        q_text = qid_to_text[ques]
        q_vec = torch.tensor(vectorize(q_text, vocab, max_seq_len)).to(device)

        cands_text = [docid_to_text[c] if c is not 0 else "" for c in cands]

        scores = []

        cands_id = np.array(cands)

        for cand in cands_text:
            a_vec = torch.tensor(vectorize(cand, vocab, max_seq_len)).to(device)
            scores.append(model(q_vec, a_vec).item())

        sorted_index = np.argsort(scores)[::-1]

        ranked_ans = cands_id[sorted_index]

        qid_pred_rank[ques] = ranked_ans

    # return qid_pred_rank
    MRR, average_ndcg, precision = evaluate(qid_pred_rank, qid_rel, k)

    return qid_pred_rank, MRR, average_ndcg, precision

**Test**

In [0]:
# toy_test_label = dict(itertools.islice(test_qid_rel.items(), 10))

# toy_test_set = test_set[:10]

In [0]:
# print((toy_test_label))

In [0]:
# # Load the model with the best validation loss
# model.load_state_dict(torch.load('20200203_071508model-50.pt'))

# rank = eval(model, toy_test_set, toy_test_label, max_seq_len, k=10)

# # print(test_set[0][1])

# # print(rank[0])

In [0]:
# toy_test_label

In [0]:
# rank[80]

In [25]:
# Load the model with the best validation loss
model.load_state_dict(torch.load('20200203_103058_model-eu.pt'))

k = 10

qid_pred_rank, MRR, average_ndcg, precision = eval(model, test_set_full, test_qid_rel, max_seq_len, k=10)

num_q = len(test_set)

print("\n\nAverage nDCG@{} for {} queries: {}\n".format(k, num_q, average_ndcg))

print("MRR@{} for {} queries: {}\n".format(k, num_q, MRR))

print("Average Precision@{}: {}".format(1, precision))

100%|██████████| 330/330 [28:12<00:00,  5.10s/it]



Average nDCG@10 for 330 queries: 0.04858448433124229

MRR@10 for 330 queries: 0.029186329186329185

Average Precision@1: 0.012121212121212121





In [0]:
save_pickle(path+'qid_pred_rank.pickle', qid_pred_rank)

In [49]:
take(10, test_qid_rel.items())

[(1, [14255]),
 (25, [107584, 562777]),
 (43, [76662]),
 (80, [252473]),
 (460, [591357, 556021, 555097]),
 (474, [430901]),
 (509, [377152]),
 (523, [338348]),
 (541, [533825]),
 (543, [504419])]

In [55]:
i = 541

print(" ".join(qid_to_text[i]))
print()
docid = test_qid_rel[i]

print(" ".join(docid_to_text[docid[0]]))
print()
print(" ".join(docid_to_text[qid_pred_rank[i][0]]))
print()
print(" ".join(docid_to_text[test_set_full[8][2][0]]))

can i deduct my individual health insurance premium in tax

yes you can see the instructions for line 29 of form 1040 self employed health insurance premiums are an above the line deduction

now i have kept this money and after interval of 6 month or year whenever the usd price go up i do exchange with indian currency and deposit in my account now do i have to pay tax on this money no you are not required to pay any tax as the income was accrued when your were nri for tax purposes the foreign currency upto usd 2000 can be held by an individual without any time limit ie you can convert then whenever you want there is nothing that needs to be declared in tax returns

the basic idea is that the average person cant deduct health care costs unless theyre really onerous but a business can and as a self employed person you can deduct those costs from the businesses earnings as long as the business is really generating enough profit to cover the health insurance costs thats why most people get