### Read data

In [1]:
def read_file(f):
    """Read a tab-separated file into row ids and space-split token lists.

    The first line is treated as a header and skipped. Each remaining line is
    split on tab characters: the first field is the row id and the second
    field is split on single spaces into tokens. Whitespace-only lines (for
    example a trailing blank line at the end of the file) are skipped.

    Args:
        f: Path of the file to read.

    Returns:
        A tuple ``(row_id, data)`` where ``row_id`` is the list of stripped
        first-column values and ``data`` is the list of token lists built
        from the second column, both in file order.

    Raises:
        IndexError: If a non-blank data line contains no tab character.
    """
    # The context manager closes the handle even if reading fails.
    with open(f, 'r') as handle:
        lines = handle.readlines()[1:]

    # Split each line once and reuse the fields for both outputs; blank lines
    # carry no record and would otherwise fail on the second-column lookup.
    rows = [line.split('\t') for line in lines if line.strip()]
    row_id = [fields[0].strip() for fields in rows]
    data = [fields[1].strip().split(' ') for fields in rows]
    return row_id, data

In [2]:
# Load both files with read_file; each call yields (row ids, token lists).
row_id_text, texts = read_file('./REVIEW_TEXT.txt')
row_id_tags, tags = read_file('./REVIEW_LABELSEQ.txt')

# Demo only: keep the first 100 entries of each list.
# NOTE(review): this assumes both files list rows in the same order;
# row_id_text and row_id_tags are never compared -- confirm.
texts, tags = texts[:100], tags[:100]

### Inputs

In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Vocabulary. Sorting the set gives every word the same id on every run;
# plain set iteration order over strings can change between kernel restarts,
# which would silently change the encoding of X.
unique_words = sorted({word for sentence in texts for word in sentence})
# Word ids start at 1 so that id 0 can be used for padding.
word2idx = {word: idx + 1 for idx, word in enumerate(unique_words)}
word2idx["PAD"] = 0

# Tag set, sorted for the same reproducibility reason as the vocabulary.
unique_tags = sorted({tag for sequence in tags for tag in sequence})
label2idx = {tag: idx for idx, tag in enumerate(unique_tags)}
# Reverse lookup used to turn predicted ids back into tag strings.
idx2label = {idx: tag for tag, idx in label2idx.items()}

# Encode sentences as id sequences, padded at the end with the PAD id
# up to a length of 25.
X = [[word2idx[word] for word in sentence] for sentence in texts]
X = pad_sequences(maxlen = 25, sequences = X, padding = "post", value = word2idx["PAD"])

# Encode tag sequences the same way; padded positions are labelled "O".
y = [[label2idx[tag] for tag in sequence] for sequence in tags]
y = pad_sequences(maxlen = 25, sequences = y, padding = "post", value = label2idx["O"])
# One-hot encode every padded tag sequence for categorical cross-entropy.
y = [to_categorical(i, num_classes = len(unique_tags)) for i in y]

ImportError: No module named 'tensorflow'

### Train and Validation sets

In [4]:
# Hold out 20% of the samples for validation. A fixed random_state makes the
# shuffle, and therefore every downstream metric, reproducible across runs.
X_train, X_validation, y_train, y_validation  = train_test_split(X, y, test_size = 0.2, random_state = 42)

### Bidirectional LSTM model

In [5]:
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.models import Sequential

# Sequence tagger: embedding -> bidirectional LSTM -> per-position classifier.
model = Sequential()
# One embedding row per entry of word2idx (the PAD id included).
model.add(Embedding(input_dim=len(word2idx.keys()),output_dim=20,input_length=25))
# return_sequences=True keeps one output vector per position; 'concat' joins
# the forward and backward states (2 x 50 units).
model.add(Bidirectional(LSTM(units=50,return_sequences=True,dropout=0.2), merge_mode = 'concat'))
# softmax (not relu) so each position yields a probability distribution over
# the tags, which is what categorical_crossentropy expects.
model.add(Dense(len(label2idx.keys()), activation="softmax"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 20)            14540     
_________________________________________________________________
bidirectional (Bidirectional (None, 25, 100)           28400     
_________________________________________________________________
dense (Dense)                (None, 25, 4)             404       
Total params: 43,344
Trainable params: 43,344
Non-trainable params: 0
_________________________________________________________________


### Training & Prediction

In [6]:
import numpy as np
# Fit for one epoch; validation_split=0.1 makes Keras set aside 10% of the
# training samples for its own validation pass.
history = model.fit(
    X_train,
    np.array(y_train),
    batch_size=16,
    epochs=1,
    validation_split=0.1,
)

# Reduce per-class scores and one-hot targets to class ids along the last axis.
y_pred = np.argmax(model.predict(X_validation), axis=-1)
y_validation = np.argmax(y_validation, axis=-1)

# Translate class ids back into tag strings for the report.
# NOTE(review): y_validation is overwritten here, so this cell cannot be
# re-run without first re-running the train/validation split cell.
y_pred = [[idx2label[tag_id] for tag_id in seq] for seq in y_pred]
y_validation = [[idx2label[tag_id] for tag_id in seq] for seq in y_validation]

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 72 samples, validate on 8 samples
Epoch 1/1


### Result

In [7]:
from sklearn_crfsuite.metrics import flat_classification_report

# Build the per-tag precision / recall / F1 report over the validation
# sequences.
# NOTE(review): the sequences were padded with "O" earlier, so padded positions
# appear to be counted in these scores -- confirm.
report = flat_classification_report(
    y_true=y_validation,
    y_pred=y_pred,
)
print(report)

              precision    recall  f1-score   support

        B-AE       0.00      0.00      0.00        32
       B-SSI       0.00      0.00      0.00         2
        I-AE       0.00      0.00      0.00        35
           O       0.86      1.00      0.93       431

    accuracy                           0.86       500
   macro avg       0.22      0.25      0.23       500
weighted avg       0.74      0.86      0.80       500



  _warn_prf(average, modifier, msg_start, len(result))
