In [1]:
!pip install tensorflow



In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding

In [3]:
print("Tensorflow Version:", tf.__version__)

Tensorflow Version: 2.7.0


In [4]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Nov 29 22:49:09 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [56]:
trainFilePath = '/content/drive/MyDrive/NLP/S21-gene-train.txt'
glove50Path = '/content/drive/MyDrive/NLP/glove.6B.50d.txt'
modelPath = '/content/drive/MyDrive/NLP/Model/RNN/'
testFilePath = '/content/drive/MyDrive/NLP/F21-gene-test.txt'

In [6]:
def getRawData(path):
  with open(path) as f:
    rawData = f.readlines()
  return rawData  

In [7]:
def processInput(rawData):
  tags = []
  sentence = []
  sentences = []
  labels = []
  for idx,line in enumerate(rawData):
    strippedLine = line.strip()
    if(len(strippedLine)>0):
      idx,word,tag = strippedLine.split("\t")
      sentence.append(word)
      tags.append(tag)
    else:
      # processedData.append([sentence,tags])
      sentences.append(sentence)
      labels.append(tags)
      sentence = []
      tags = []
   
  return sentences,labels   

In [8]:
rawData = getRawData(trainFilePath)

In [9]:
sentences, labels = processInput(rawData)

In [10]:
print(len(sentences))

13795


In [11]:
xTrain, xTest, yTrain, yTest = train_test_split(sentences,labels, test_size=0.15)

In [12]:
print(len(xTrain), len(xTest), len(yTrain), len(yTest))

11725 2070 11725 2070


## Storing word Frequencies to replace the words with less than threshold count as UNK

In [13]:
trainWordsList = list([tag for sentence in xTrain for tag in sentence])

In [14]:
trainWordsSet = set(trainWordsList)

In [15]:
np.random.seed(15)
toBeUNK = list(np.random.choice(list(trainWordsSet), 30, replace=False)) #replace UNK words add it manually

In [58]:
print(len(toBeUNK))

[]


In [17]:
for word in toBeUNK:
  print("Word", word, "Freq", trainWordsList.count(word))

Word TAAT Freq 1
Word breathalyzer Freq 1
Word multiphoton Freq 1
Word daltons Freq 2
Word approximation Freq 2
Word overnight Freq 2
Word sos2 Freq 1
Word cyclic Freq 24
Word parathyroidectomized Freq 1
Word c101F1 Freq 1
Word frame Freq 71
Word GTPases Freq 5
Word MVP Freq 1
Word often Freq 22
Word decarboxylase Freq 7
Word inducibility Freq 13
Word organic Freq 7
Word cyclohexane Freq 1
Word myopia Freq 2
Word Intraoperative Freq 1
Word CaCl2 Freq 2
Word Oc Freq 1
Word dissipations Freq 1
Word AACAG Freq 1
Word VIP Freq 4
Word pairwise Freq 1
Word tau Freq 6
Word adenines Freq 1
Word surgical Freq 24
Word KGF Freq 1


In [18]:
xTrainUpdated = []
toBeUNKCopy = toBeUNK

for sentence in xTrain:
  modified = []
  for word in sentence:
    if word in toBeUNKCopy:
      modified.append('UNK')
      toBeUNKCopy.pop(toBeUNKCopy.index(word)) 
    else:
      modified.append(word)
  xTrainUpdated.append(modified)

In [19]:
print(len(xTrainUpdated))

11725


In [20]:
print("UNK Count", len([word for sentence in xTrainUpdated for word in sentence if word is 'UNK']))

UNK Count 30


## Prepare the data

In [21]:
# Helper method for Tag to Index
def getTokenIndexDict(vocabList):
    uniqueVocabulary = list(set([tag for sentence in vocabList for tag in sentence]))
    tokenToIndex = {token:idx for idx, token in enumerate(uniqueVocabulary)}
    indexToToken = {idx:token for idx, token in enumerate(uniqueVocabulary)}
    return tokenToIndex, indexToToken

In [22]:
tagToIndex, indexToTag = getTokenIndexDict(yTrain)

In [23]:
print("Tag to index", tagToIndex)
print("Index to Tag", indexToTag)

Tag to index {'I': 0, 'B': 1, 'O': 2}
Index to Tag {0: 'I', 1: 'B', 2: 'O'}


In [24]:
trainTagEncodings = [[tagToIndex[tag] for tag in sentence] for sentence in yTrain]

In [25]:
print("Train tag Encodings", len(trainTagEncodings))

Train tag Encodings 11725


In [26]:
sentenceToIndex, indexToSentence = getTokenIndexDict(xTrainUpdated)

In [27]:
sentenceToIndex['PAD'] = len(sentenceToIndex)
indexToSentence[len(indexToSentence)]='PAD'

In [28]:
trainSentenceEncodings = [[sentenceToIndex[tag] for tag in sentence] for sentence in xTrainUpdated]

In [29]:
print("Train Sentence Encodings", len(trainSentenceEncodings))

Train Sentence Encodings 11725


In [30]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [31]:
def getPaddedData(sentenceEncodings, labelEncodings, numTags, maxLen):
  padTokens = pad_sequences(sentenceEncodings, maxlen=maxLen, dtype='int32', padding='post', value=sentenceToIndex['PAD'])
  padTags = pad_sequences(labelEncodings, maxlen=maxLen, dtype='int32', padding='post', value= tagToIndex["O"])
  padTags = [to_categorical(i, num_classes=numTags) for i in padTags]
  return padTokens,np.array(padTags)

In [32]:
padTokens, padTags = getPaddedData(trainSentenceEncodings,trainTagEncodings,3,300)

In [33]:
padTokens.shape

(11725, 300)

In [34]:
padTags.shape

(11725, 300, 3)

In [35]:
# any([2 in sentence for sentence in padTags])

## Build the Model

In [36]:
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model

In [37]:
def getModel():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=len(trainWordsSet)+2, output_dim=512))
    
    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=200, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))
    model.add(Dropout(0.2))

    # Add LSTM
    model.add(LSTM(units=100, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))
    model.add(Dropout(0.1))

    #Optimiser 
    model.add(Dense(3, activation="softmax"))

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['AUC'])
    model.summary()
    
    return model

In [38]:
model = getModel()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 512)         14586880  
                                                                 
 bidirectional (Bidirectiona  (None, None, 400)        1140800   
 l)                                                              
                                                                 
 dropout (Dropout)           (None, None, 400)         0         
                                                                 
 lstm_1 (LSTM)               (None, None, 100)         200400    
                                                                 
 dropout_1 (Dropout)         (None, None, 100)         0         
                                                                 
 dense (Dense)               (None, None, 3)           303       
                                                        

In [39]:
model.fit(padTokens, padTags, batch_size=128, verbose=1, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4c800a0910>

In [57]:
model.save(modelPath)

INFO:tensorflow:Assets written to: /content/drive/MyDrive/NLP/Model/RNN/assets




##Preparing the test data

In [41]:
trainSetUpdatedWords = set([word for sentence in xTrainUpdated for word in sentence])

In [42]:
xTestUpdated = [['UNK' if word not in trainSetUpdatedWords else word for word in sentence]for sentence in xTest]

In [43]:
print("Unk Count", len([word for sentence in xTestUpdated for word in sentence if word=='UNK']))

Unk Count 3108


In [44]:
print("Total Words", len([word for sentence in xTestUpdated for word in sentence]))

Total Words 58322


In [45]:
testSentenceEncodings = [[sentenceToIndex[tag] for tag in sentence] for sentence in xTestUpdated]

In [46]:
testTagEncodings = [[tagToIndex[tag] for tag in sentence] for sentence in yTest]

In [47]:
padTestTokens, padTestTags = getPaddedData(testSentenceEncodings,testTagEncodings,3,300)

In [48]:
predictedTagTokens = model.predict(padTestTokens)

In [49]:
yPredIndices = np.argmax(predictedTagTokens,axis=2)

In [50]:
yPredIndices.shape

(2070, 300)

In [51]:
yPredicted = np.array([[indexToTag[index] for index in sentence] for sentence in yPredIndices])

In [52]:
np.unique(yPredicted)

array(['B', 'I', 'O'], dtype='<U1')

In [53]:
print(yPredicted.shape)

(2070, 300)


In [78]:
ActualValuesPath = '/content/drive/MyDrive/NLP/golden.txt'
PredictedValuesPath = '/content/drive/MyDrive/NLP/predicted.txt'
TestResultsPath = '/content/drive/MyDrive/NLP/test_results.txt'

In [55]:
with open(ActualValuesPath, 'w') as actual, open(PredictedValuesPath, 'w') as predict:
  for idx, pred in enumerate(yPredicted):
    # actualLength = len(yTest[idx])
    for lineNumber, word in enumerate(xTest[idx]):
      actual.write(f"{lineNumber+1}\t{word}\t{yTest[idx][lineNumber]}")
      actual.write("\n")
      predict.write(f"{lineNumber+1}\t{word}\t{pred[lineNumber]}")
      predict.write("\n")
    actual.write("\n")
    predict.write("\n")  

## Predicting the test data

In [59]:
def processTestData(rawData):
  sentence = []
  sentences = []
  for idx,line in enumerate(rawData):
    strippedLine = line.strip()
    if(len(strippedLine)>0):
      idx,word = strippedLine.split("\t")
      sentence.append(word)
      
    else:
      # processedData.append([sentence,tags])
      sentences.append(sentence)
      sentence = []
  return sentences   

In [60]:
testRawData = getRawData(testFilePath)


In [61]:
testSentences = processTestData(testRawData)

In [63]:
print(len(testSentences))

508


In [64]:
xTestDataUpdated = [['UNK' if word not in trainSetUpdatedWords else word for word in sentence]for sentence in testSentences]

In [66]:
len(xTestDataUpdated)

508

In [68]:
print("UNK word Count", len([word for sentence in xTestDataUpdated for word in sentence if word=='UNK']))

UNK word Count 892


In [69]:
testDataSentenceEncodings = [[sentenceToIndex[tag] for tag in sentence] for sentence in xTestDataUpdated]

In [71]:
paddedTestData = pad_sequences(testDataSentenceEncodings, maxlen=300, dtype='int32', padding='post', value=sentenceToIndex['PAD'])


In [72]:
paddedTestData.shape

(508, 300)

In [73]:
predictedTestTagTokens = model.predict(paddedTestData)

In [74]:
yTestPredIndices = np.argmax(predictedTestTagTokens,axis=2)

In [75]:
yTestPredicted = np.array([[indexToTag[index] for index in sentence] for sentence in yTestPredIndices])

In [76]:
yTestPredicted.shape

(508, 300)

In [77]:
np.unique(yTestPredicted)

array(['B', 'I', 'O'], dtype='<U1')

In [79]:
with open(TestResultsPath, 'w') as testResults:
  for idx, pred in enumerate(yTestPredicted):
    # actualLength = len(yTest[idx])
    for lineNumber, word in enumerate(testSentences[idx]):
      testResults.write(f"{lineNumber+1}\t{word}\t{yTestPredicted[idx][lineNumber]}")
      testResults.write("\n")
    testResults.write("\n") 