In [2]:
import os
import numpy as np
import pandas as pd
import nltk
import re
from nltk import Tree
from nltk.stem import WordNetLemmatizer

### NLP processing 

In [3]:
# Chunk grammar handed to nltk.RegexpParser below: four alternative tag
# patterns, all producing chunks labelled NP.
# NOTE(review): the tag names look like the Penn Treebank tags emitted by
# nltk.pos_tag — confirm against the tagger actually in use.
patterns = """
    NP:    {<DT><WP><VBP>*<RB>*<VBN><IN><NN>}
           {<NN|NNS|NNP|NNPS><IN    >*<NN|NNS|NNP|NNPS>+}
           {<JJ>*<NN|NNS|NNP|NNPS><CC>*<NN|NNS|NNP|NNPS>+}
           {<JJ>*<NN|NNS|NNP|NNPS>+}
    """
# POS tags whose words are kept for the vocabulary; define_Y_vector passes
# this list to find_type as `type_select`.
select_type = ['NN', 'NNS', 'JJ', 'VB', 'VBD', 'VBP', 'VBZ']
# Chunk parser built from the grammar above. It is not referenced by any
# other cell visible in this part of the notebook.
typer = nltk.RegexpParser(patterns)
# Matches every character that is not an ASCII letter; used with .sub('')
# to strip digits, punctuation and whitespace from tokens.
letter_filter = re.compile('[^a-zA-Z]')
# Shared WordNet lemmatizer instance used by find_type.
lemmatizer = WordNetLemmatizer()

In [5]:
def _numbered_txt_files(directory):
    """Return the ``.txt`` files of ``directory`` ordered by their numeric stem.

    Parameters
    ----------
    directory : str
        Directory to list (non-recursive).

    Returns
    -------
    list of str
        ``os.path.join(directory, name)`` for every entry whose name ends with
        ``.txt``, sorted by ``int(<name without extension>)``.

    Raises
    ------
    ValueError
        If a ``.txt`` file name is not an integer followed by ``.txt``.
    OSError
        If ``directory`` cannot be listed.
    """
    paths = [os.path.join(directory, filename)
             for filename in os.listdir(directory)
             if filename.endswith('.txt')]
    # Derive the numeric key with os.path helpers rather than slicing on a
    # literal '/': os.path.join inserts the platform separator, which is not
    # always '/'.
    paths.sort(key=lambda path: int(os.path.splitext(os.path.basename(path))[0]))
    return paths


# The three listings share one helper so they are filtered and ordered the
# same way (numeric order: 2.txt sorts before 10.txt).
training_descrption_files = _numbered_txt_files('data/descriptions_train')
training_tag_files = _numbered_txt_files('data/tags_train')
test_descrption_files = _numbered_txt_files('data/descriptions_test')

In [6]:
def remove_empty(a):
    """Return a new list holding the items of ``a`` whose length is non-zero.

    The relative order of the surviving items is preserved and ``a`` itself is
    left unmodified.
    """
    kept = []
    for item in a:
        if len(item) > 0:
            kept.append(item)
    return kept

In [7]:
def find_type(s, type_select):
    """Collect the lemmas of the words in ``s`` whose POS tag is in ``type_select``.

    ``s`` is split into sentences and then tokens with nltk; every token is
    lower-cased and stripped of non-letter characters (``letter_filter``),
    tokens that end up empty are dropped, and the remaining tokens are tagged
    with ``nltk.pos_tag``.

    Parameters
    ----------
    s : str
        Text to analyse.
    type_select : container of str
        POS tags to keep.

    Returns
    -------
    set of str
        ``lemmatizer.lemmatize(word)`` for every tagged word whose tag is in
        ``type_select``.
    """
    lemmas = set()
    for raw_sentence in nltk.sent_tokenize(s):
        # Normalise each token before tagging: lower-case, letters only.
        cleaned = [letter_filter.sub('', token.lower())
                   for token in nltk.word_tokenize(raw_sentence)]
        tagged = nltk.pos_tag(remove_empty(cleaned))
        for word, tag in tagged:
            if tag in type_select:
                lemmas.add(lemmatizer.lemmatize(word))
    return lemmas

In [8]:
def define_Y_vector():
    """Build the bag-of-words vocabulary from the training descriptions and tags.

    For every line of every file in ``training_descrption_files`` the lemmas
    returned by ``find_type(line, select_type)`` are counted. ``find_type``
    returns a set, so a word is counted at most once per line. Every tag found
    in ``training_tag_files`` (the text after the first ``:`` on a line, with
    non-letters removed) is then given a fixed count of 100, overriding any
    description count, which places it inside the accepted range below.

    Returns
    -------
    set of str
        All words whose final count lies in the inclusive range 15..2000.
    """
    selected = {}
    for file in training_descrption_files:
        # Read-only mode is enough here, and the context manager closes the
        # handle (previously files were opened 'r+' and never closed).
        with open(file, 'r') as handle:
            for sent in handle:
                for word in find_type(sent, select_type):
                    selected[word] = selected.get(word, 0) + 1
    for file in training_tag_files:
        with open(file, 'r') as handle:
            for sent in handle:
                parts = sent.split(':')
                if len(parts) < 2:
                    # No ':' separator on this line (e.g. a blank line):
                    # there is no tag to extract, so skip it instead of
                    # failing with IndexError.
                    continue
                validTag = letter_filter.sub('', parts[1])
                # Fixed count that always passes the frequency filter below.
                selected[validTag] = 100

    # Keep words that are neither too rare nor too frequent.
    return {word for word, count in selected.items() if 15 <= count <= 2000}


In [9]:
# Build the vocabulary once; the same collection is used further down to
# create both the training and the test count vectors.
bow_final_words = define_Y_vector()
# Bare expression so the notebook displays the resulting vocabulary.
bow_final_words

{'grocery',
 'graz',
 'desert',
 'driver',
 'one',
 'wide',
 'sleep',
 'bucket',
 'twin',
 'trafficlight',
 'rear',
 'pigeon',
 'part',
 'fly',
 'aerial',
 'hill',
 'chip',
 'sit',
 'jetliner',
 'customer',
 'clock',
 'ice',
 'throw',
 'rocky',
 'handle',
 'stoplight',
 'hardwood',
 'opened',
 'disc',
 'fish',
 'crowded',
 'get',
 'return',
 'many',
 'bath',
 'dresser',
 'country',
 'oriental',
 'size',
 'sidewalk',
 'skateboarder',
 'edge',
 'leather',
 'shallow',
 'sea',
 'dryer',
 'horseback',
 'lid',
 'action',
 'tower',
 'skirt',
 'wooden',
 'pie',
 'bottom',
 'habitat',
 'pan',
 'boarder',
 'multicolored',
 'sale',
 'turn',
 'button',
 'swing',
 'gate',
 'teddy',
 'rainbow',
 'visible',
 'groom',
 'competition',
 'event',
 'mask',
 'crosswalk',
 'design',
 'london',
 'harbor',
 'i',
 'filled',
 'heavy',
 'story',
 'fork',
 'reach',
 'full',
 'lighthouse',
 'point',
 'landing',
 'frame',
 'notebook',
 'bedroom',
 'salad',
 'pattern',
 'parrot',
 'taxi',
 'contains',
 'doubledecker

### Prepare Y (word-count vectors) for the training and test samples

In [10]:
# One count vector per training description file, with one entry per word of
# bow_final_words.
# NOTE(review): the column order is the iteration order of bow_final_words.
# define_Y_vector returns a set of strings, whose iteration order is not
# guaranteed to be the same in another interpreter run unless PYTHONHASHSEED
# is fixed — confirm before comparing vectors saved from different sessions.
Y_training = []
for file in training_descrption_files:
    vector = dict.fromkeys(bow_final_words, 0)
    # Read-only mode plus a context manager: the file is closed as soon as it
    # has been read instead of leaking an 'r+' handle per file.
    with open(file, 'r') as description:
        sentences = description.readlines()
    for sent in sentences:
        for word in sent.split():
            # Same normalisation as the vocabulary: lower-case, letters only.
            i = letter_filter.sub('', word.lower())
            if i in bow_final_words:
                vector[i] += 1

    Y_training.append(list(vector.values()))


In [11]:
# Sanity check: number of training samples ...
print(len(Y_training))
# ... and the length of one count vector (one entry per vocabulary word).
print(len(Y_training[0]))

10000
1294


In [12]:
# One count vector per test description file, with one entry per word of
# bow_final_words.
# NOTE(review): the column order is the iteration order of bow_final_words.
# define_Y_vector returns a set of strings, whose iteration order is not
# guaranteed to be the same in another interpreter run unless PYTHONHASHSEED
# is fixed — confirm before comparing vectors saved from different sessions.
Y_test = []
for file in test_descrption_files:
    vector = dict.fromkeys(bow_final_words, 0)
    # Read-only mode plus a context manager: the file is closed as soon as it
    # has been read instead of leaking an 'r+' handle per file.
    with open(file, 'r') as description:
        sentences = description.readlines()
    for sent in sentences:
        for word in sent.split():
            # Same normalisation as the vocabulary: lower-case, letters only.
            i = letter_filter.sub('', word.lower())
            if i in bow_final_words:
                vector[i] += 1

    Y_test.append(list(vector.values()))



In [13]:
# Sanity check: number of test samples ...
print(len(Y_test))
# ... and the length of one count vector (one entry per vocabulary word).
print(len(Y_test[0]))

2000
1294


In [110]:
import pickle

# Persist both feature matrices. The context managers flush and close each
# output file as soon as it is written, rather than leaving that to garbage
# collection of an anonymous handle.
with open("y_train.p", "wb") as train_out:
    pickle.dump(Y_training, train_out)
with open("y_test.p", "wb") as test_out:
    pickle.dump(Y_test, test_out)