In [68]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow import keras
import csv
tf.config.run_functions_eagerly(True)

In [69]:
def get_model_list(f, delim="<|>"):
    with open(f,'r',encoding='utf-8') as f:
        d={}
        while True:
            try:
                line = f.readline()
                line = str(line).split(delim)
                d[str(line[0])]=int(str(line[1]).replace('\n',''))
            except:
                break
    f.close()
    return d

In [70]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w,p) for w,p in zip(s["Token"].values.tolist(),
                                                           s["POS"].values.tolist())]
        
        self.grouped = self.data.groupby("Record Number").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    

In [4]:
model = keras.models.load_model("./model/model_2.3",compile=False)

In [71]:
#coded embedding indexing
word2idx = get_model_list("train_words.txt")
pos2idx = get_model_list("pos_tags.txt")
tag2idx = get_model_list("bio_tags.txt")
char2idx = get_model_list("train_chars.txt")

In [72]:
idx2tag = {i: w for w, i in tag2idx.items()}
idx2word = {i: w for w, i in word2idx.items()}
idx2char = {i: w for w, i in char2idx.items()}

In [73]:
df = pd.read_csv('data/qdf.csv')

In [74]:
df

Unnamed: 0,Record Number,Title,Token,POS
0,1,SALE ! Kamen Rider Gaim Belt Set No. 802,SALE,NN
1,1,SALE ! Kamen Rider Gaim Belt Set No. 802,!,.
2,1,SALE ! Kamen Rider Gaim Belt Set No. 802,Kamen,NNP
3,1,SALE ! Kamen Rider Gaim Belt Set No. 802,Rider,NNP
4,1,SALE ! Kamen Rider Gaim Belt Set No. 802,Gaim,NNP
...,...,...,...,...
257523,25000,BELLA TAYLOR PATCHWORK QUILTED BLAKELY SMALL /...,MEDIUM,NNP
257524,25000,BELLA TAYLOR PATCHWORK QUILTED BLAKELY SMALL /...,HANDBAG,NNP
257525,25000,BELLA TAYLOR PATCHWORK QUILTED BLAKELY SMALL /...,PURSE,NNP
257526,25000,BELLA TAYLOR PATCHWORK QUILTED BLAKELY SMALL /...,~,NNP


In [75]:
getter=SentenceGetter(df)

In [76]:
sentences = getter.sentences

In [77]:
max_len=128
max_len_char=128

In [78]:
X_word = [[word2idx[w[0]] if w[0] in word2idx.keys() else '1' for w in s] for s in sentences]
X_word = pad_sequences(maxlen=max_len, sequences=X_word, value=word2idx["PAD"], padding='post', truncating='post')

In [79]:
X_pos = [[pos2idx[w[1]] if w[1] in pos2idx.keys() else '0' for w in s] for s in sentences]
X_pos = pad_sequences(maxlen=max_len, sequences=X_pos, value=pos2idx["PAD"], padding='post', truncating='post')

In [188]:
char2idx[str(sentences[0][1][0])]

79

In [197]:
X_char = []
for sentence in sentences:
    sent_seq = []
    for i in range(max_len):
        word_seq = []
        for j in range(max_len_char):
            try:
                word_seq.append(char2idx.get(sentence[i][0][j]))
            except:
                if i>=len(sentence):
                    word_seq.append(char2idx["PAD"])
                elif j>=len(str(sentence[i][0])):
                    word_seq.append(char2idx["PAD"])
                else:
                    t = str(sentences[i][0])
                    if not t[j] in char2idx.keys():
                        word_seq.append(char2idx["UNK"])
                    else:
                        word_seq.append(char2idx['PAD'])
                    
        sent_seq.append(word_seq)
    X_char.append(np.array(sent_seq))
XX_char = np.asarray(X_char).astype("float")
XX_char = np.where(XX_char==None, 1, XX_char)

In [201]:
XX_char = np.where(XX_char>=129,1,XX_char)

In [202]:
np.save("X_word_2.3.npy",X_word)
np.save("X_pos_2.3.npy",X_pos)
np.save("XX_char_2.3.npy",XX_char)

In [203]:
print("___Model Inputs___\n")
[print(i.shape, i.dtype) for i in model.inputs]
print("___Model Outputs___\n")
[print(o.shape, o.dtype) for o in model.outputs]
print("___Model Layers___\n")
[print(l.name, l.input_shape, l.dtype) for l in model.layers]
print("___Data Inputs___\n")
print(X_word.shape)
print(X_pos.shape)
print(XX_char.shape)

___Model Inputs___

(None, 128) <dtype: 'float32'>
(None, 128) <dtype: 'float32'>
(None, 128, 128) <dtype: 'float32'>
___Model Outputs___

(None, 128, 115) <dtype: 'float32'>
___Model Layers___

Char_Inputs [(None, 128, 128)] float32
Word_Inputs [(None, 128)] float32
POS_Inputs [(None, 128)] float32
Char_Time_Dist (None, 128, 128) float32
Word_Embed (None, 128) float32
POS_Embed (None, 128) float32
Char_LSTM_Enc (None, 128, 128, 32) float32
Main_Input [(None, 128, 64), (None, 128, 32), (None, 128, 32)] float32
Main_Dropout (None, 128, 128) float32
bidirectional_4 (None, 128, 128) float32
bidirectional_5 (None, 128, 1024) float32
add_2 [(None, 128, 1024), (None, 128, 1024)] float32
time_distributed_2 (None, 128, 1024) float32
___Data Inputs___

(25000, 128)
(25000, 128)
(25000, 128, 128)


In [204]:
pred_input = [X_word, X_pos, XX_char]

In [205]:
# y_pred = model.predict([X_word, X_pos, XX_char])
y_pred = model.predict(pred_input)



In [206]:
np.save("y_pred2.3.npy",y_pred)

In [66]:
pred_data = np.load("y_pred2.3.npy")

In [80]:
#unpack predictions into list for dataframe
idx=5001
preds=[]
for x in range(0,len(pred_data)):
    p = np.argmax(pred_data[x],axis=-1)

listvals=[[],[],[]]
for x in range(0,len(pred_data)):
    p = np.argmax(pred_data[x], axis=-1)
    
    for w, pred in zip(X_word[x], p):
        if w!=0:
            if idx2tag[pred]!="No Tag":
                listvals[0].append(idx)
                listvals[1].append(idx2tag[pred])
                listvals[2].append(idx2word[w])

    idx+=1

In [81]:
def parse_tags(df):
    tag_col = df.columns.get_loc("Aspect Name") 
    tkn_col = df.columns.get_loc("Aspect Value") 
    noparse=['Material', 'Type', 'Size', 'Fabric Type', 'Features','None','Occasion','Obscure']
    
    for x in range(0, len(df)-1):

        if df.iat[x,tag_col]!=None and df.iat[x,tag_col]!="None" and df.iat[x,tkn_col]!=None:
            cur_tag = str(df.iat[x,tag_col])
            
            if cur_tag[:2]=="B-" or cur_tag[:2]=="I-":
                if not cur_tag[2:] in noparse:
                    cur_tkn = str(df.iat[x,tkn_col])
                    y = 1
                    while True:
                        if x+y < len(df):
                            next_tag = str(df.iat[x+y,tag_col])
                            if next_tag[2:]==cur_tag[2:]:
                                if next_tag[:2]=="I-":
                                    cur_tkn = cur_tkn + " " + str(df.iat[x+y,tkn_col])
                                    df.iat[x+y,tkn_col]=None
                                    df.iat[x+y,tag_col]=None
                                elif next_tag[:2]=="E-":
                                    cur_tkn = cur_tkn + " " + str(df.iat[x+y,tkn_col])
                                    df.iat[x,tkn_col]=cur_tkn
                                    df.iat[x,tag_col]=cur_tag[2:]
                                    df.iat[x+y,tkn_col]=None
                                    df.iat[x+y,tag_col]=None
                                    break
                                else:
                                    df.iat[x,tkn_col]=cur_tkn
                                    df.iat[x,tag_col]=cur_tag[2:]
                                    break
                            else:
                                df.iat[x,tkn_col]=cur_tkn
                                df.iat[x,tag_col]=cur_tag[2:]
                                break
                        else:
                            df.iat[x,tkn_col]=cur_tkn
                            df.iat[x,tag_col]=cur_tag[2:]
                            break
                        y+=1
                else:
                    df.iat[x,tag_col]=cur_tag[2:]

            elif cur_tag[:2]=="E-":
                if cur_tag[2:] not in noparse:
                    if str(df.iat[x-1,tag_col])==cur_tag[2:]:
                        df.iat[x,tkn_col]=str(df.iat[x-1,tkn_col])+" "+str(df.iat[x,tkn_col])
                        df.iat[x,tag_col]=cur_tag[2:]
                        df.iat[x-1,tkn_col]=None
                        df.iat[x-1,tag_col]=None
                    else:
                        df.iat[x,tag_col]=cur_tag[2:]
                else:
                    df.iat[x,tag_col]=cur_tag[2:]
            
    df = df[df['Aspect Value']!="None"]            
    df = df.dropna()

In [82]:
pred_df = pd.DataFrame({"Record Number":listvals[0],"Aspect Name":listvals[1],"Aspect Value":listvals[2]})

In [83]:
pred_df[pred_df['Record Number']==5015]

Unnamed: 0,Record Number,Aspect Name,Aspect Value
108,5015,Theme,Rutoto
109,5015,B-Theme,ROOTOTE
110,5015,E-Theme,Rutoto
111,5015,Type,tote
112,5015,Type,bag
113,5015,Theme,deli
114,5015,E-Theme,smaller
115,5015,Department,ladies
116,5015,B-Theme,simple
117,5015,E-Theme,natur


In [84]:
parse_tags(pred_df)
pred_df = pred_df.dropna()

In [85]:
len(pred_df)

172244

In [238]:
pred_df[100:150]

Unnamed: 0,Record Number,Aspect Name,Aspect Value
108,5015,Theme,Rutoto
109,5015,Theme,ROOTOTE Rutoto
111,5015,Type,tote
112,5015,Type,bag
114,5015,Theme,deli smaller
115,5015,Department,ladies
116,5015,Theme,simple natur
118,5016,Model,Love
119,5016,Brand,Moschino
120,5016,Color,Black


In [86]:
pred_df.to_csv('sub_df4_c.csv',index=False,encoding='utf-8')

In [239]:
pred_df.to_csv("sub_df4.tsv",sep="\t",index=False,header=False,quoting=csv.QUOTE_NONE,encoding='utf-8')

In [6]:
tree_df = pd.read_csv("df_tokenadj.csv")

In [8]:
tree_df = tree_df.drop(columns=['Token','BIO_tag'])

In [9]:
tree_df

Unnamed: 0,Record Number,Title,Tag,POS,Token_adj
0,1,LOUIS VUITTON M40096 Handbag Priscilla Multi-c...,Brand,NNP,LOUIS VUITTON
1,1,LOUIS VUITTON M40096 Handbag Priscilla Multi-c...,MPN,NNP,M40096
2,1,LOUIS VUITTON M40096 Handbag Priscilla Multi-c...,Type,NNP,Handbag
3,1,LOUIS VUITTON M40096 Handbag Priscilla Multi-c...,Model,NNP,Priscilla
4,1,LOUIS VUITTON M40096 Handbag Priscilla Multi-c...,Color,NNP,Multi-color
...,...,...,...,...,...
48195,5000,Botkier Sasha Medium Duffel Bag Coral Leather ...,No Tag,NNP,Top
48196,5000,Botkier Sasha Medium Duffel Bag Coral Leather ...,No Tag,NNP,Closure
48197,5000,Botkier Sasha Medium Duffel Bag Coral Leather ...,No Tag,NNP,Retail
48198,5000,Botkier Sasha Medium Duffel Bag Coral Leather ...,No Tag,$,$


In [10]:
 from sklearn.ensemble import RandomForestClassifier

In [13]:
tree_w_emb = {}
tree_x = list(tree_df['Token_adj'])
tree_w_emb["UNK"]=0
idx=1
for w in tree_x:
    if not w in tree_w_emb.keys():
        tree_w_emb[w]=idx
        idx+=1

In [16]:
tree_tag_emb = {}
tree_y = list(tree_df['Tag'])
tree_tag_emb["UNK"]=0
idx=1
for w in tree_y:
    if not w in tree_tag_emb.keys():
        tree_tag_emb[w]=idx
        idx+=1

In [60]:
tree2word = {i: w for w, i in tree_w_emb.items()}
tree2tag = {i: w for w, i in tree_tag_emb.items()}

In [49]:
X = [tree_w_emb[x] for x in tree_x]
Y = [tree_tag_emb[y] for y in tree_y]

In [50]:
X = np.array(X)

In [51]:
X.shape

(48200,)

In [55]:
X = X.reshape(1,-1)

In [56]:
X = X.T
X.shape

(48200, 1)

In [57]:
Y = np.array(Y)

In [58]:
Y.shape

(48200,)

In [59]:
clf = RandomForestClassifier()
clf.fit(X,Y)

In [87]:
test_x = pred_df['Aspect Value']


In [89]:
test_x = list(test_x)
test_x

['Kamen Rider',
 'Gaim',
 'Belt',
 'HOBO',
 'INTERNATIONAL',
 "Women 's",
 'LEATHER',
 'Shoulder',
 'Bag',
 'CROSSBODY',
 'Coach',
 'Peyton',
 'Brown',
 'Tan',
 'Signature C',
 'Shoulder',
 'Bag',
 'Tote',
 'Bag',
 'MZ WALLACE',
 'Metro',
 'tote',
 'gray',
 'quilted',
 'nylon',
 'purse',
 'Women',
 'Sequins',
 'Backpack',
 'Travel',
 'PU',
 'Handbag',
 'Rucksack',
 'Shoulder',
 'School',
 'Bag',
 'Thirty One',
 'Large',
 'Tote',
 'FENDI',
 'BAG',
 'SHOULDER',
 'BAG',
 'ZUCCA',
 'Nylon',
 'Fashionic',
 "Women 's",
 'Shoulder',
 'Cross-Body',
 'Bag',
 'Lightweight',
 'Travel',
 'Beach',
 'Work',
 'Scho',
 'Dooney & Bourke',
 'Black',
 'Pebble',
 'Leather',
 'Tammy',
 'Tote',
 'Gucci',
 'Womens',
 'Indy',
 'Shoulder',
 'Handbag',
 'Brown',
 'Ostrich',
 'Bamboo',
 'Tassel',
 'Purse',
 'Evening',
 'Bag',
 'Brown',
 'Stain',
 'Silk',
 'Black',
 'Bead',
 'Satin',
 'Silk',
 'Betseyville',
 'Silver',
 'Embossed',
 'Star',
 'Patent',
 'Crossbody',
 'Shoulder',
 'Satchel',
 'Bag',
 'Carp',
 'Canv

In [91]:
test_x_enc = [tree_w_emb[x] if x in tree_w_emb.keys() else 0 for x in test_x ]

In [94]:
test_x_enc = np.asarray(test_x_enc).reshape(1,-1)
test_x_enc = test_x_enc.T

In [95]:
test_x_enc.shape

(172244, 1)

In [97]:
test_y = clf.predict(test_x_enc)

In [100]:
ty = list(test_y)

In [106]:
tree2tag[0]

'UNK'

In [101]:
tree_tags = [tree2tag[y] for y in ty]

In [102]:
tree_tags

['Brand',
 'Brand',
 'Type',
 'Type',
 'Brand',
 'Department',
 'Material',
 'Type',
 'Type',
 'Type',
 'Brand',
 'Model',
 'Color',
 'Color',
 'Pattern',
 'Type',
 'Type',
 'Type',
 'Type',
 'Brand',
 'Brand',
 'Type',
 'Color',
 'Features',
 'Material',
 'Type',
 'Department',
 'Accents',
 'Type',
 'Occasion',
 'Material',
 'Type',
 'Type',
 'Type',
 'Occasion',
 'Type',
 'Brand',
 'Size',
 'Type',
 'Brand',
 'Type',
 'Type',
 'Type',
 'Pattern',
 'Material',
 'Brand',
 'Department',
 'Type',
 'Type',
 'Type',
 'Features',
 'Occasion',
 'Occasion',
 'Occasion',
 'Brand',
 'Brand',
 'Color',
 'Pattern',
 'Material',
 'Brand',
 'Type',
 'Brand',
 'Department',
 'Product Line',
 'Type',
 'Type',
 'Color',
 'Material',
 'Product Line',
 'Accents',
 'Type',
 'Occasion',
 'Type',
 'Color',
 'Brand',
 'Material',
 'Color',
 'Handle/Strap Material',
 'Material',
 'Material',
 'Brand',
 'Color',
 'Accents',
 'No Tag',
 'Material',
 'Type',
 'Type',
 'Type',
 'Type',
 'Brand',
 'Fabric Type',


In [103]:
pred_df['TreeTags']=tree_tags

In [105]:
pred_df[pred_df['Aspect Name']!=pred_df['TreeTags']]

Unnamed: 0,Record Number,Aspect Name,Aspect Value,TreeTags
2,5001,Model,Gaim,Brand
24,5004,Model,Metro,Brand
60,5008,Occasion,Scho,Brand
67,5009,Model,Tammy,Brand
82,5011,Color,Stain,Brand
...,...,...,...,...
193030,29997,Model,Blackheath,Brand
193034,29998,Model,Piatto,Brand
193041,29999,Model,Jaald,Brand
193054,30000,Features,QUILTED,Accents


In [107]:
len(pred_df)

172244