In [1]:
import numpy as np
import pandas as pd
import sys
import os
import glob
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bielskic/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bielskic/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bielskic/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
def load_tags(path, cat = False):
    data = []
    for filename in glob.glob(os.path.join(path, "*.txt")):
        with open(filename, "r") as tag_file:           
            label = os.path.splitext(os.path.basename(filename))[0]
            if cat == True:
                sentence = " ".join(" ".join(line.strip() for line in tag_file).split(":"))
            else:
                sentence = (line.strip() for line in tag_file)
                sentence = (" ".join(" ".join(word.split(":")[1:]) for word in sentence))
            new_row = (sentence, label) 
            data.append(new_row)
            

    dt = np.dtype([('sentence', object), ('label', object)])
    return(np.array(data, dtype = dt))

In [4]:
train_tags_path = "data/tags_train/"
test_tags_path = "data/tags_test/"

### Tags with categories
raw_train_tags = load_tags(train_tags_path, cat = True)
raw_test_tags = load_tags(test_tags_path, cat = True)

### Tags without categories
raw_train_tags_nocat = load_tags(train_tags_path)
raw_test_tags_nocat = load_tags(test_tags_path)

In [5]:
print(raw_train_tags.shape, raw_test_tags.shape)
print(raw_train_tags[0:10])

(10000,) (2000,)
[('vehicle airplane outdoor bench sports skateboard person person vehicle truck accessory backpack accessory handbag furniture dining table', '0')
 ('kitchen bowl food carrot kitchen spoon', '1')
 ('accessory suitcase', '10') ('food cake', '100')
 ('outdoor traffic light', '1000') ('animal cat', '1001')
 ('vehicle airplane person person', '1002')
 ('vehicle car person person sports skis accessory handbag outdoor traffic light', '1003')
 ('person person electronic remote furniture couch', '1004')
 ('vehicle boat person person animal bird', '1005')]


In [6]:
print(raw_train_tags_nocat.shape, raw_test_tags_nocat.shape)
print(raw_train_tags_nocat[0:10])

(10000,) (2000,)
[('airplane bench skateboard person truck backpack handbag dining table', '0')
 ('bowl carrot spoon', '1') ('suitcase', '10') ('cake', '100')
 ('traffic light', '1000') ('cat', '1001') ('airplane person', '1002')
 ('car person skis handbag traffic light', '1003')
 ('person remote couch', '1004') ('boat person bird', '1005')]


In [8]:
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))
keep_pos_nouns = ['NN', 'NNS', 'NNP', 'NNPS']
keep_pos_all = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'VB', 'VBG', 'VBZ']
wnl = nltk.stem.WordNetLemmatizer()

def preprocess(data, pos_to_keep = None):
    
    out = data.copy()
    out['sentence'] = list(map(lambda x:x.lower(), out['sentence']))
    out['sentence'] = list(map(lambda x:''.join(ch for ch in x if ch not in punctuation), out['sentence']))
    out['sentence'] = list(map(lambda x:' '.join(w for w in x.split(' ') if w not in stop_words), out['sentence']))
    out['sentence'] = list(map(lambda x:' '.join(wnl.lemmatize(w) for w in x.split(' ')), out['sentence']))
    
    if pos_to_keep != None:
        out['sentence'] = list(map(lambda x:' '.join(w[0] for w in nltk.pos_tag(nltk.word_tokenize(x)) if w[1] in pos_to_keep), out['sentence']))
    
    return(out)

In [10]:
train_tag_all = preprocess(raw_train_tags)
test_tag_all = preprocess(raw_test_tags)

train_tag = preprocess(raw_train_tags, keep_pos_all)
test_tag =  preprocess(raw_test_tags, keep_pos_all)

train_tag_nocat = preprocess(raw_train_tags_nocat, keep_pos_all)
test_tag_nocat =  preprocess(raw_test_tags_nocat, keep_pos_all)

In [14]:
def vectorize(train_data, test_data):
    
    Tfidf = TfidfVectorizer(vocabulary = tag_dict, 
                            tokenizer = lambda str: str.split(" "))
    
    tr_d = [word for word in train_data['sentence'].tolist()] 
    te_d = [word for word in test_data['sentence'].tolist()] 
    
    Y_train = Tfidf.fit_transform(tr_d)
    Y_test = Tfidf.fit_transform(te_d)
    
    return(Y_train, Y_test)

In [39]:
tag_dict = set()
for s in train_tag['sentence']:
    tags = s.split()
    for tag in tags:
        tag_dict.add(tag)
        
print(len(tag_dict))
tag_dict_list = list(tag_dict)
tag_dict_list.sort()
tag_dict_list[0:20]

101


['accessory',
 'airplane',
 'animal',
 'apple',
 'appliance',
 'backpack',
 'ball',
 'banana',
 'baseball',
 'bat',
 'bear',
 'bed',
 'bench',
 'bicycle',
 'bird',
 'boat',
 'book',
 'bottle',
 'bowl',
 'broccoli']

In [40]:
### BOW with categories 

Y_train, Y_test = vectorize(train_tag, test_tag)
print(Y_train.shape, Y_train.shape)

(10000, 101) (10000, 101)


In [41]:
### BOW with categories 

print("train:")
print(Y_train[0])
print("test:")
print(Y_test[0])

train:
  (0, 98)	0.34534438667719125
  (0, 94)	0.2815251107733758
  (0, 85)	0.2354923920328581
  (0, 81)	0.18676698136694395
  (0, 65)	0.22974712352115403
  (0, 62)	0.22925214756478446
  (0, 46)	0.26879722504279113
  (0, 41)	0.17033679166940313
  (0, 31)	0.2354923920328581
  (0, 12)	0.284842935230572
  (0, 5)	0.2921512902238589
  (0, 1)	0.3396880170215032
  (0, 0)	0.40890456845147044
test:
  (0, 88)	0.37188894156615054
  (0, 83)	0.41059946675486214
  (0, 41)	0.20202717835040415
  (0, 5)	0.33403775326719826
  (0, 0)	0.7353301922547499


In [42]:
### BOW without categories 

_tag_dict_list = tag_dict_list
tag_dict = set()
for s in train_tag_nocat['sentence']:
    tags = s.split()
    for tag in tags:
        tag_dict.add(tag)
        
print(len(tag_dict))
tag_dict_list = list(tag_dict)
tag_dict_list.sort()
tag_dict_list[0:20]

92


['airplane',
 'apple',
 'backpack',
 'ball',
 'banana',
 'baseball',
 'bat',
 'bear',
 'bed',
 'bench',
 'bicycle',
 'bird',
 'boat',
 'book',
 'bottle',
 'bowl',
 'broccoli',
 'bus',
 'cake',
 'car']

In [43]:
print([t for t in _tag_dict_list if t not in tag_dict_list])

['accessory', 'animal', 'appliance', 'electronic', 'food', 'furniture', 'indoor', 'kitchen', 'outdoor', 'vehicle']


In [46]:
### BOW without categories 

Y_train_nocat, Y_test_nocat = vectorize(train_tag_nocat, test_tag_nocat)
print(Y_train_nocat.shape, Y_test_nocat.shape)

(10000, 92) (2000, 92)


In [47]:
### BOW without categories 

print("train:")
print(Y_train_nocat[0])
print("test:")
print(Y_test_nocat[0])

train:
  (0, 86)	0.37611893193939083
  (0, 77)	0.3146189934103292
  (0, 56)	0.15347164321779547
  (0, 40)	0.36612630542609204
  (0, 28)	0.3146189934103292
  (0, 9)	0.38055156172426136
  (0, 2)	0.39208327023739353
  (0, 0)	0.45212114336752823
test:
  (0, 80)	0.493714458737659
  (0, 75)	0.5412847965145303
  (0, 8)	0.5171151075938532
  (0, 2)	0.4425480400971191


In [55]:
if not os.path.isdir("data/processed_tags"):
    os.mkdir("data/processed_tags")

np.save("data/processed_tags/train_tag_tfdif.npy", Y_train)
np.save("data/processed_tags/test_tag_tfdif.npy", Y_test)
np.save("data/processed_tags/train_tag_nocat_tfdif.npy", Y_train_nocat)
np.save("data/processed_tags/test_tag_nocat_tfdif.npy", Y_test_nocat)