In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow import keras
import csv
tf.config.run_functions_eagerly(True)

In [8]:
def get_model_list(f, delim="<|>"):
    with open(f,'r',encoding='utf-8') as f:
        d={}
        while True:
            try:
                line = f.readline()
                line = str(line).split(delim)
                d[str(line[0])]=int(str(line[1]).replace('\n',''))
            except:
                break
    f.close()
    return d

In [9]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w,p) for w,p in zip(s["Token"].values.tolist(),
                                                           s["POS"].values.tolist())]
        
        self.grouped = self.data.groupby("Record Number").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    

In [10]:
import keras.backend as K

def f1_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [14]:
model = keras.models.load_model("./model/model_2.2",compile=False)



In [11]:
#coded indexing
word2idx = get_model_list("train_words.txt")
pos2idx = get_model_list("pos_tags.txt")
tag2idx = get_model_list("bio_tags.txt")

In [12]:
idx2tag = {i: w for w, i in tag2idx.items()}
idx2word = {i: w for w, i in word2idx.items()}

In [13]:
df = pd.read_csv('data/qdf.csv')

In [14]:
df

Unnamed: 0,Record Number,Title,Token,POS
0,1,SALE ! Kamen Rider Gaim Belt Set No. 802,SALE,NN
1,1,SALE ! Kamen Rider Gaim Belt Set No. 802,!,.
2,1,SALE ! Kamen Rider Gaim Belt Set No. 802,Kamen,NNP
3,1,SALE ! Kamen Rider Gaim Belt Set No. 802,Rider,NNP
4,1,SALE ! Kamen Rider Gaim Belt Set No. 802,Gaim,NNP
...,...,...,...,...
257523,25000,BELLA TAYLOR PATCHWORK QUILTED BLAKELY SMALL /...,MEDIUM,NNP
257524,25000,BELLA TAYLOR PATCHWORK QUILTED BLAKELY SMALL /...,HANDBAG,NNP
257525,25000,BELLA TAYLOR PATCHWORK QUILTED BLAKELY SMALL /...,PURSE,NNP
257526,25000,BELLA TAYLOR PATCHWORK QUILTED BLAKELY SMALL /...,~,NNP


In [15]:
getter=SentenceGetter(df)

In [16]:
sentences = getter.sentences

In [17]:
max_len=128

In [18]:
X_word = [[word2idx[w[0]] if w[0] in word2idx.keys() else '1' for w in s] for s in sentences]
X_word = pad_sequences(maxlen=max_len, sequences=X_word, value=word2idx["PAD"], padding='post', truncating='post')

In [19]:
X_pos = [[pos2idx[w[1]] if w[1] in pos2idx.keys() else '0' for w in s] for s in sentences]
X_pos = pad_sequences(maxlen=max_len, sequences=X_pos, value=pos2idx["PAD"], padding='post', truncating='post')

In [30]:
print("___Model Inputs___\n")
[print(i.shape, i.dtype) for i in model.inputs]
print("___Model Outputs___\n")
[print(o.shape, o.dtype) for o in model.outputs]
print("___Model Layers___\n")
[print(l.name, l.input_shape, l.dtype) for l in model.layers]
print("___Data Inputs___\n")
print(X_word.shape)
print(X_pos.shape)


___Model Inputs___

(None, 128) <dtype: 'float32'>
(None, 128) <dtype: 'float32'>
___Model Outputs___

(None, 128, 115) <dtype: 'float32'>
___Model Layers___

Word_Inputs [(None, 128)] float32
POS_Inputs [(None, 128)] float32
Word_Embed (None, 128) float32
POS_Embed (None, 128) float32
Main_Input [(None, 128, 64), (None, 128, 32)] float32
Main_Dropout (None, 128, 96) float32
bidirectional (None, 128, 96) float32
bidirectional_1 (None, 128, 1024) float32
add [(None, 128, 1024), (None, 128, 1024)] float32
time_distributed (None, 128, 1024) float32
___Data Inputs___

(25000, 128)
(25000, 128)


In [31]:
pred_input = [X_word, X_pos]

In [32]:
# y_pred = model.predict([X_word, X_pos, XX_char])
y_pred = model.predict(pred_input)





In [33]:
np.save("y_pred2.2.npy",y_pred)

In [20]:
pred_data = np.load("y_pred2.2.npy")

In [21]:
idx=5001

In [22]:
preds=[]
for x in range(0,len(pred_data)):
    p = np.argmax(pred_data[x],axis=-1)
    

In [23]:
listvals=[[],[],[]]

In [24]:
for x in range(0,len(pred_data)):
    p = np.argmax(pred_data[x], axis=-1)
    
    for w, pred in zip(X_word[x], p):
        if w!=0:
            if idx2tag[pred]!="No Tag":
                listvals[0].append(idx)
                listvals[1].append(idx2tag[pred])
                listvals[2].append(idx2word[w])

    idx+=1

In [143]:
# def parse_tags(df):
#     tag_col = df.columns.get_loc("Aspect Name") 
#     tkn_col = df.columns.get_loc("Aspect Value") 
#     noparse=['Material', 'Type', 'Size', 'Fabric Type', 'Features','None','Occasion','Obscure']
    
#     for x in range(0, len(df)-1):

#         if df.iat[x,tag_col]!=None and df.iat[x,tag_col]!="None" and df.iat[x,tkn_col]!=None:
#             cur_tag = str(df.iat[x,tag_col])
            
#             if cur_tag[:2]=="B-" or cur_tag[:2]=="I-":
#                 if not cur_tag[2:] in noparse:
#                     cur_tkn = str(df.iat[x,tkn_col])
#                     y = 1
#                     while True:
#                         if x+y < len(df):
#                             next_tag = str(df.iat[x+y,tag_col])
#                             if next_tag[2:]==cur_tag[2:]:
#                                 if next_tag[:2]=="I-":
#                                     cur_tkn = cur_tkn + " " + str(df.iat[x+y,tkn_col])
#                                     df.iat[x+y,tkn_col]=None
#                                     df.iat[x+y,tag_col]=None
#                                 elif next_tag[:2]=="E-":
#                                     cur_tkn = cur_tkn + " " + str(df.iat[x+y,tkn_col])
#                                     df.iat[x,tkn_col]=cur_tkn
#                                     df.iat[x,tag_col]=cur_tag[2:]
#                                     df.iat[x+y,tkn_col]=None
#                                     df.iat[x+y,tag_col]=None
#                                     break
#                                 else:
#                                     df.iat[x,tkn_col]=cur_tkn
#                                     df.iat[x,tag_col]=cur_tag[2:]
#                                     break
#                             else:
#                                 df.iat[x,tkn_col]=cur_tkn
#                                 df.iat[x,tag_col]=cur_tag[2:]
#                                 break
#                         else:
#                             df.iat[x,tkn_col]=cur_tkn
#                             df.iat[x,tag_col]=cur_tag[2:]
#                             break
#                         y+=1
#                 else:
#                     df.iat[x,tag_col]=cur_tag[2:]

#             elif cur_tag[:2]=="E-":
#                 df.iat[x,tag_col]=cur_tag[2:]
            
#     df = df[df['Aspect Value']!="None"]            
#     df = df.dropna()

In [25]:
def parse_tags(df):
    tag_col = df.columns.get_loc("Aspect Name") 
    tkn_col = df.columns.get_loc("Aspect Value") 
    noparse=['Material', 'Type', 'Size', 'Fabric Type', 'Features','None','Occasion','Obscure']
    
    for x in range(0, len(df)-1):

        if df.iat[x,tag_col]!=None and df.iat[x,tag_col]!="None" and df.iat[x,tkn_col]!=None:
            cur_tag = str(df.iat[x,tag_col])
            
            if cur_tag[:2]=="B-" or cur_tag[:2]=="I-":
                if not cur_tag[2:] in noparse:
                    cur_tkn = str(df.iat[x,tkn_col])
                    y = 1
                    while True:
                        if x+y < len(df):
                            next_tag = str(df.iat[x+y,tag_col])
                            if next_tag[2:]==cur_tag[2:]:
                                if next_tag[:2]=="I-":
                                    cur_tkn = cur_tkn + " " + str(df.iat[x+y,tkn_col])
                                    df.iat[x+y,tkn_col]=None
                                    df.iat[x+y,tag_col]=None
                                elif next_tag[:2]=="E-":
                                    cur_tkn = cur_tkn + " " + str(df.iat[x+y,tkn_col])
                                    df.iat[x,tkn_col]=cur_tkn
                                    df.iat[x,tag_col]=cur_tag[2:]
                                    df.iat[x+y,tkn_col]=None
                                    df.iat[x+y,tag_col]=None
                                    break
                                else:
                                    df.iat[x,tkn_col]=cur_tkn
                                    df.iat[x,tag_col]=cur_tag[2:]
                                    break
                            else:
                                df.iat[x,tkn_col]=cur_tkn
                                df.iat[x,tag_col]=cur_tag[2:]
                                break
                        else:
                            df.iat[x,tkn_col]=cur_tkn
                            df.iat[x,tag_col]=cur_tag[2:]
                            break
                        y+=1
                else:
                    df.iat[x,tag_col]=cur_tag[2:]

            elif cur_tag[:2]=="E-":
                if cur_tag[2:] not in noparse:
                    if str(df.iat[x-1,tag_col])==cur_tag[2:]:
                        df.iat[x,tkn_col]=str(df.iat[x-1,tkn_col])+" "+str(df.iat[x,tkn_col])
                        df.iat[x,tag_col]=cur_tag[2:]
                        df.iat[x-1,tkn_col]=None
                        df.iat[x-1,tag_col]=None
                    else:
                        df.iat[x,tag_col]=cur_tag[2:]
                else:
                    df.iat[x,tag_col]=cur_tag[2:]
            
    df = df[df['Aspect Value']!="None"]            
    df = df.dropna()

In [26]:
pred_df = pd.DataFrame({"Record Number":listvals[0],"Aspect Name":listvals[1],"Aspect Value":listvals[2]})

In [27]:
parse_tags(pred_df)
pred_df = pred_df.dropna()

In [28]:
pred_df[pred_df['Aspect Value']=="No Tag"]

Unnamed: 0,Record Number,Aspect Name,Aspect Value


In [29]:
pred_df[50:100]

Unnamed: 0,Record Number,Aspect Name,Aspect Value
56,5008,Type,Shoulder
57,5008,Type,Cross-Body
58,5008,Type,Bag
59,5008,Type,Lightweight
60,5008,Occasion,Travel
61,5008,Occasion,Beach
62,5008,Occasion,Work
63,5009,Brand,Dooney & Bourke
66,5009,Color,Black
67,5009,Pattern,Pebble


In [30]:
len(pred_df)

173035

In [148]:
len(pred_df)

173790

In [31]:
pred_df.to_csv("sub_df3.1.tsv",sep="\t",index=False,header=False,quoting=csv.QUOTE_NONE,encoding='utf-8')