In [20]:
import numpy as np
import pandas as pd
import scipy
import sys
import os
import glob
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /Users/walid/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/walid/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/walid/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [22]:
def load_tags(path, cat = False):
    data = []
    for filename in glob.glob(os.path.join(path, "*.txt")):
        with open(filename, "r") as tag_file:           
            label = os.path.splitext(os.path.basename(filename))[0]
            if cat == True:
                sentence = " ".join(" ".join(line.strip() for line in tag_file).split(":"))
            else:
                sentence = (line.strip() for line in tag_file)
                sentence = (" ".join(" ".join(word.split(":")[1:]) for word in sentence))
            new_row = (sentence, label) 
            data.append(new_row)
            

    dt = np.dtype([('sentence', object), ('label', 'int64')])
    return(np.array(data, dtype = dt))

In [23]:
train_tags_path = "data/tags_train/"
test_tags_path = "data/tags_test/"

### Tags with categories
raw_train_tags = load_tags(train_tags_path, cat = True)
raw_test_tags = load_tags(test_tags_path, cat = True)

### Tags without categories
raw_train_tags_nocat = load_tags(train_tags_path)
raw_test_tags_nocat = load_tags(test_tags_path)

In [26]:
print(raw_train_tags.shape, raw_test_tags.shape)
print(raw_train_tags[0:10])

(10000,) (2000,)
[('vehicle airplane outdoor bench sports skateboard person person vehicle truck accessory backpack accessory handbag furniture dining table',    0)
 ('kitchen bowl food carrot kitchen spoon',    1)
 ('accessory suitcase',   10) ('food cake',  100)
 ('outdoor traffic light', 1000) ('animal cat', 1001)
 ('vehicle airplane person person', 1002)
 ('vehicle car person person sports skis accessory handbag outdoor traffic light', 1003)
 ('person person electronic remote furniture couch', 1004)
 ('vehicle boat person person animal bird', 1005)]


In [27]:
print(raw_train_tags_nocat.shape, raw_test_tags_nocat.shape)
print(raw_train_tags_nocat[0:10])

(10000,) (2000,)
[('airplane bench skateboard person truck backpack handbag dining table',    0)
 ('bowl carrot spoon',    1) ('suitcase',   10) ('cake',  100)
 ('traffic light', 1000) ('cat', 1001) ('airplane person', 1002)
 ('car person skis handbag traffic light', 1003)
 ('person remote couch', 1004) ('boat person bird', 1005)]


In [28]:
raw_train_tags.sort(order = 'label', axis = 0)
raw_test_tags.sort(order = 'label', axis = 0)

raw_train_tags_nocat.sort(order = 'label', axis = 0)
raw_test_tags_nocat.sort(order = 'label', axis = 0)

In [29]:
print(raw_train_tags[0:10])
print(raw_train_tags_nocat[0:10])

[('vehicle airplane outdoor bench sports skateboard person person vehicle truck accessory backpack accessory handbag furniture dining table', 0)
 ('kitchen bowl food carrot kitchen spoon', 1)
 ('vehicle car vehicle truck outdoor traffic light person person', 2)
 ('person person outdoor bench sports frisbee vehicle car', 3)
 ('person person sports baseball bat', 4)
 ('furniture bed furniture chair electronic mouse electronic keyboard indoor book kitchen cup electronic tv electronic laptop', 5)
 ('person person food donut vehicle bicycle', 6)
 ('person person accessory tie', 7) ('vehicle car person person', 8)
 ('vehicle car vehicle bus accessory backpack', 9)]
[('airplane bench skateboard person truck backpack handbag dining table', 0)
 ('bowl carrot spoon', 1) ('car truck traffic light person', 2)
 ('person bench frisbee car', 3) ('person baseball bat', 4)
 ('bed chair mouse keyboard book cup tv laptop', 5)
 ('person donut bicycle', 6) ('person tie', 7) ('car person', 8)
 ('car bus bac

In [65]:
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))
keep_pos_nouns = ['NN', 'NNS', 'NNP', 'NNPS']
keep_pos_all = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'VB', 'VBG', 'VBZ']
wnl = nltk.stem.WordNetLemmatizer()

def preprocess(data, pos_to_keep = None, lemma = False):
    
    out = data.copy()
    out['sentence'] = list(map(lambda x:x.lower(), out['sentence']))
    out['sentence'] = list(map(lambda x:''.join(ch for ch in x if ch not in punctuation), out['sentence']))
    out['sentence'] = list(map(lambda x:' '.join(w for w in x.split(' ') if w not in stop_words), out['sentence']))
    if lemma == True:
        out['sentence'] = list(map(lambda x:' '.join(wnl.lemmatize(w) for w in x.split(' ')), out['sentence']))
    
    if pos_to_keep != None:
        out['sentence'] = list(map(lambda x:' '.join(w[0] for w in nltk.pos_tag(nltk.word_tokenize(x)) if w[1] in pos_to_keep), out['sentence']))
    
    return(out)

In [37]:
train_tag_all = preprocess(raw_train_tags, lemma = True)
test_tag_all = preprocess(raw_test_tags, lemma = True)

train_tag = preprocess(raw_train_tags, keep_pos_all, lemma = True)
test_tag =  preprocess(raw_test_tags, keep_pos_all, lemma = True)

train_tag_nocat = preprocess(raw_train_tags_nocat, keep_pos_all, lemma = True)
test_tag_nocat =  preprocess(raw_test_tags_nocat, keep_pos_all, lemma = True)

In [66]:

train_tag_all_noLemma = preprocess(raw_train_tags)
test_tag_all_noLemma = preprocess(raw_test_tags)

train_tag_noLemma = preprocess(raw_train_tags, keep_pos_all)
test_tag_noLemma =  preprocess(raw_test_tags, keep_pos_all)

train_tag_nocat_noLemma = preprocess(raw_train_tags_nocat, keep_pos_all)
test_tag_nocat_noLemma =  preprocess(raw_test_tags_nocat, keep_pos_all)

In [40]:
def vectorize(train_data, test_data, binary = False):
    
    Tfidf = TfidfVectorizer(vocabulary = tag_dict, binary = binary,
                            tokenizer = lambda str: str.split(" "))
    
    tr_d = [word for word in train_data['sentence'].tolist()] 
    te_d = [word for word in test_data['sentence'].tolist()] 
    
    Y_train = Tfidf.fit_transform(tr_d)
    Y_test = Tfidf.fit_transform(te_d)
    
    return(Y_train, Y_test)

In [67]:
tag_dict = set()
for s in train_tag['sentence']:
    tags = s.split()
    for tag in tags:
        tag_dict.add(tag)
        
print(len(tag_dict))
tag_dict_list = list(tag_dict)
tag_dict_list.sort()
tag_dict_list[0:20]


101


['accessory',
 'airplane',
 'animal',
 'apple',
 'appliance',
 'backpack',
 'ball',
 'banana',
 'baseball',
 'bat',
 'bear',
 'bed',
 'bench',
 'bicycle',
 'bird',
 'boat',
 'book',
 'bottle',
 'bowl',
 'broccoli']

In [68]:
### BOW with categories 

Y_train, Y_test = vectorize(train_tag, test_tag)
Y_train_binary, Y_test_binary = vectorize(train_tag, test_tag, binary=True)

print(Y_train.shape, Y_train.shape)

(10000, 101) (10000, 101)


In [70]:
### BOW without categories 

_tag_dict_list = tag_dict_list
tag_dict = set()
for s in train_tag_nocat['sentence']:
    tags = s.split()
    for tag in tags:
        tag_dict.add(tag)
        
print(len(tag_dict))
tag_dict_list = list(tag_dict)
tag_dict_list.sort()
tag_dict_list[0:20]


92


['airplane',
 'apple',
 'backpack',
 'ball',
 'banana',
 'baseball',
 'bat',
 'bear',
 'bed',
 'bench',
 'bicycle',
 'bird',
 'boat',
 'book',
 'bottle',
 'bowl',
 'broccoli',
 'bus',
 'cake',
 'car']

In [89]:
### BOW without categories 

Y_train_nocat, Y_test_nocat = vectorize(train_tag_nocat, test_tag_nocat)
Y_train_nocat_binary, Y_test_nocat_binary = vectorize(train_tag_nocat, test_tag_nocat, binary=True)

print(Y_train_nocat.shape, Y_test_nocat.shape)

(10000, 92) (2000, 92)


In [95]:
### BOW noLemma

tag_dict = set()
for s in train_tag_noLemma['sentence']:
    tags = s.split()
    for tag in tags:
        tag_dict.add(tag)
        
print(len(tag_dict))
tag_noLemma_dict_list = list(tag_dict)
tag_noLemma_dict_list.sort()
tag_noLemma_dict_list[0:20]


101


['accessory',
 'airplane',
 'animal',
 'apple',
 'appliance',
 'backpack',
 'ball',
 'banana',
 'baseball',
 'bat',
 'bear',
 'bed',
 'bench',
 'bicycle',
 'bird',
 'boat',
 'book',
 'bottle',
 'bowl',
 'broccoli']

In [96]:
### BOW with categories no Lemmatization

Y_train_noLemma, Y_test_noLemma = vectorize(train_tag_noLemma, test_tag_noLemma)
Y_train_binary_noLemma, Y_test_binary_noLemma = vectorize(train_tag_noLemma, test_tag_noLemma, binary=True)

print(Y_train.shape, Y_train.shape)

(10000, 101) (10000, 101)


In [97]:
### BOW without categories 

_tag_noLemma_dict_list = tag_noLemma_dict_list
tag_dict = set()
for s in train_tag_nocat_noLemma['sentence']:
    tags = s.split()
    for tag in tags:
        tag_dict.add(tag)
        
print(len(tag_dict))
tag_noLemma_dict_list = list(tag_dict)
tag_noLemma_dict_list.sort()
tag_noLemma_dict_list[0:20]



92


['airplane',
 'apple',
 'backpack',
 'ball',
 'banana',
 'baseball',
 'bat',
 'bear',
 'bed',
 'bench',
 'bicycle',
 'bird',
 'boat',
 'book',
 'bottle',
 'bowl',
 'broccoli',
 'bus',
 'cake',
 'car']

In [100]:
### BOW without categories no lemmatization

Y_train_nocat_noLemma, Y_test_nocat_noLemma = vectorize(train_tag_nocat_noLemma, test_tag_nocat_noLemma)
Y_train_nocat_binary_noLemma, Y_test_nocat_binary_noLemma = vectorize(train_tag_nocat_noLemma, test_tag_nocat_noLemma, binary=True)

print(Y_train_nocat.shape, Y_test_nocat.shape)

(10000, 92) (2000, 92)


In [102]:
if not os.path.isdir("data/processed_tags"):
    os.mkdir("data/processed_tags")

scipy.sparse.save_npz("data/processed_tags/train_tag_tfdif.npz", Y_train)
scipy.sparse.save_npz("data/processed_tags/test_tag_tfdif.npz", Y_test)
scipy.sparse.save_npz("data/processed_tags/train_tag_nocat_tfdif.npz", Y_train_nocat)
scipy.sparse.save_npz("data/processed_tags/test_tag_nocat_tfdif.npz", Y_test_nocat)


scipy.sparse.save_npz("data/processed_tags/train_tag_tfdif_binary.npz", Y_train_binary)
scipy.sparse.save_npz("data/processed_tags/test_tag_tfdif_binary.npz", Y_test_binary)
scipy.sparse.save_npz("data/processed_tags/train_tag_nocat_tfdif_binary.npz", Y_train_nocat_binary)
scipy.sparse.save_npz("data/processed_tags/test_tag_nocat_tfdif_binary.npz", Y_test_nocat_binary)

with open('data/processed_tags/tag_list.txt', 'w') as f:
    for item in tag_dict_list:
        f.write("%s\n" % item)

with open('data/processed_tags/tag_list_.txt', 'w') as f:
    for item in _tag_dict_list:
        f.write("%s\n" % item)

In [103]:
if not os.path.isdir("data/processed_tags"):
    os.mkdir("data/processed_tags")

scipy.sparse.save_npz("data/processed_tags/train_tag_tfdif_noLemma.npz", Y_train_noLemma)
scipy.sparse.save_npz("data/processed_tags/test_tag_tfdif_noLemma.npz", Y_test_noLemma)
scipy.sparse.save_npz("data/processed_tags/train_tag_nocat_tfdif_noLemma.npz", Y_train_nocat_noLemma)
scipy.sparse.save_npz("data/processed_tags/test_tag_nocat_tfdif_noLemma.npz", Y_test_nocat_noLemma)


scipy.sparse.save_npz("data/processed_tags/train_tag_tfdif_binary_noLemma.npz", Y_train_binary_noLemma)
scipy.sparse.save_npz("data/processed_tags/test_tag_tfdif_binary_noLemma.npz", Y_test_binary_noLemma)
scipy.sparse.save_npz("data/processed_tags/train_tag_nocat_tfdif_binary_noLemma.npz", Y_train_nocat_binary_noLemma)
scipy.sparse.save_npz("data/processed_tags/test_tag_nocat_tfdif_binary_noLemma.npz", Y_test_nocat_binary_noLemma)

with open('data/processed_tags/tag_noLemma_list.txt', 'w') as f:
    for item in tag_noLemma_dict_list:
        f.write("%s\n" % item)

with open('data/processed_tags/tag_noLemma_list_.txt', 'w') as f:
    for item in _tag_noLemma_dict_list:
        f.write("%s\n" % item)