<a href="https://colab.research.google.com/github/vishalveerareddy/NLP-Project/blob/main/NLPProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
import torch

In [3]:
from transformers import BertForQuestionAnswering

model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [5]:
def answer_question(question, answer_text):
    # ======== Tokenize ========
    # Apply the tokenizer to the input text, treating them as a text-pair.
    input_ids = tokenizer.encode(question, answer_text)

    # Report how long the input sequence is.
    print('Query has {:,} tokens.\n'.format(len(input_ids)))

    # ======== Set Segment IDs ========
    # Search the input_ids for the first instance of the `[SEP]` token.
    sep_index = input_ids.index(tokenizer.sep_token_id)

    # The number of segment A tokens includes the [SEP] token istelf.
    num_seg_a = sep_index + 1

    # The remainder are segment B.
    num_seg_b = len(input_ids) - num_seg_a

    # Construct the list of 0s and 1s.
    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    # Run our example through the model.
    outputs = model(torch.tensor([input_ids]), # The tokens representing our input text.
                    token_type_ids=torch.tensor([segment_ids]), # The segment IDs to differentiate question from answer_text
                    return_dict=True) 

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    # ======== Reconstruct Answer ========
    # Find the tokens with the highest `start` and `end` scores.
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)

    # Get the string versions of the input tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token.
    answer = tokens[answer_start]

    # Select the remaining answer tokens and join them with whitespace.
    for i in range(answer_start + 1, answer_end + 1):
        
        # If it's a subword token, then recombine it with the previous token.
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]
        
        # Otherwise, add a space then the token.
        else:
            answer += ' ' + tokens[i]

    print('Answer: "' + answer + '"')

In [6]:
import textwrap

wrapper = textwrap.TextWrapper(width=80) 

bert_abstract = "We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pretrain deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be finetuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial taskspecific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement)."

bert_abstract

'We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pretrain deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be finetuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial taskspecific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement)

In [7]:
question = "What does the 'B' in BERT stand for?"

answer_question(question, bert_abstract)

Query has 258 tokens.

Answer: "bidirectional encoder representations from transformers"


In [32]:
question = "Is BERT simple?"

answer_question(question, bert_abstract)

Query has 251 tokens.

Answer: "conceptually simple and empirically powerful"


In [8]:
import pickle
import numpy as np



In [9]:
with open('train_qa.txt','rb') as f:
    train_data=pickle.load(f)
with open("test_qa.txt",'rb') as f:
  test_data = pickle.load(f)

In [10]:
data = train_data + test_data




In [11]:
from sentence_transformers import SentenceTransformer

In [12]:
# Baseline
#utility function for BERT word embeddings
def bert_vectorize(sentences):
    #load pretrained BERT model
    model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')
    #encode sentences
    vectors = model.encode(sentences)
    return list(vectors)

In [13]:
!pip install sentence_transformers




In [14]:
import gensim
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
from sklearn.preprocessing import MinMaxScaler
class EmbeddingVectorizer(object):
    def __init__(self, word2vec):
          self.word2vec = word2vec
          self.dim = len(next(iter(word2vec.values())))
    def fit(self, X, y):
            return self
    def transform(self, X):
            return np.array([
                np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                        or [np.zeros(self.dim)], axis=0)
                for words in X
            ])
def w2v(X_train, X_test):
  model = gensim.models.Word2Vec([doc for i, doc in enumerate(X_train+X_test)], min_count = 1, 
                              size = 100, window = 5)

  scaler = MinMaxScaler()


  d2v = dict(zip(model.wv.index2word, model.wv.syn0)) 
  modelw = EmbeddingVectorizer(d2v)
  # converting text to numerical data using Word2Vec 
  X_train_vectors_w2v = modelw.transform(X_train)
  X_train_vec = scaler.fit_transform(X_train_vectors_w2v)#Used for normalising the vector

  X_test_vectors_w2v = modelw.transform(X_test)
  X_test_vec = scaler.fit_transform(X_test_vectors_w2v)#Used for normalising the vector
  return [X_train_vec,X_test_vec]

  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
train_story_text=[]
train_question_text=[]
train_answers=[]

In [16]:
for story,question,answer in data:
    train_story_text.append(story)
    train_question_text.append(question)
    train_answers.append(answer)


In [17]:
sq_pairs = [str(train_story_text[i])+" "+str(train_question_text[i]) for i in range(len(train_story_text))]

In [24]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
word = []
for statement in sq_pairs:
  st = word_tokenize(statement)
  lemmatizer = WordNetLemmatizer()
  finalStats =[]
  for x in st:
    
    finalStats.append(lemmatizer.lemmatize(x))
  word.append(finalStats)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
word[0:5]

[['[',
  "'Mary",
  "'",
  ',',
  "'moved",
  "'",
  ',',
  "'to",
  "'",
  ',',
  "'the",
  "'",
  ',',
  "'bathroom",
  "'",
  ',',
  "'",
  '.',
  "'",
  ',',
  "'Sandra",
  "'",
  ',',
  "'journeyed",
  "'",
  ',',
  "'to",
  "'",
  ',',
  "'the",
  "'",
  ',',
  "'bedroom",
  "'",
  ',',
  "'",
  '.',
  "'",
  ']',
  '[',
  "'Is",
  "'",
  ',',
  "'Sandra",
  "'",
  ',',
  "'in",
  "'",
  ',',
  "'the",
  "'",
  ',',
  "'hallway",
  "'",
  ',',
  "'",
  '?',
  "'",
  ']'],
 ['[',
  "'Mary",
  "'",
  ',',
  "'moved",
  "'",
  ',',
  "'to",
  "'",
  ',',
  "'the",
  "'",
  ',',
  "'bathroom",
  "'",
  ',',
  "'",
  '.',
  "'",
  ',',
  "'Sandra",
  "'",
  ',',
  "'journeyed",
  "'",
  ',',
  "'to",
  "'",
  ',',
  "'the",
  "'",
  ',',
  "'bedroom",
  "'",
  ',',
  "'",
  '.',
  "'",
  ',',
  "'Mary",
  "'",
  ',',
  "'went",
  "'",
  ',',
  "'back",
  "'",
  ',',
  "'to",
  "'",
  ',',
  "'the",
  "'",
  ',',
  "'bedroom",
  "'",
  ',',
  "'",
  '.',
  "'",
  ',',
  "'Daniel",
  "'

In [18]:
#get word embeddings using BERT
vectorized_sq_pairs = bert_vectorize(sq_pairs)

KeyboardInterrupt: ignored

In [26]:
from sklearn.model_selection import train_test_split
X= word
y=train_answers

# X_train,X_test = w2v(X_train=X_train[0:5],X_test= X_test[0:5])


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [28]:
X_train,X_test = w2v(X_train=X_train,X_test= X_test)



In [29]:
#classify response type
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, classification_report
svmmodel = SVC(kernel='linear', degree=3, C=1, decision_function_shape='ovo').fit(X_train, y_train)
predictions = svmmodel.predict(X_test)
accuracy = svmmodel.score(X_test, y_test)
f1score = f1_score(y_test, predictions, average='micro')
print(classification_report(y_test, predictions))
print("Accuracy of SVM with Polynomial Kernel:", accuracy)

              precision    recall  f1-score   support

          no       0.49      0.58      0.53      1803
         yes       0.49      0.40      0.44      1827

    accuracy                           0.49      3630
   macro avg       0.49      0.49      0.48      3630
weighted avg       0.49      0.49      0.48      3630

Accuracy of SVM with Polynomial Kernel: 0.48650137741046834
