In [89]:
import numpy as np
import pandas as pd
import sys
import os
import glob
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

In [1]:
# Fetch every NLTK resource this notebook relies on: the stop-word list, the
# WordNet data behind the lemmatizer, the POS-tagger model, and the Punkt
# models used by nltk.word_tokenize() in preprocess(). 'punkt' was missing
# before, so tokenizing failed on a machine that had never downloaded it.
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed version.
for resource in ('stopwords', 'wordnet', 'averaged_perceptron_tagger', 'punkt'):
    nltk.download(resource)

from nltk.corpus import stopwords

In [101]:
# ResNet feature tables: the files have no header row and the first column is
# used as the index.
# Alternative feature sets (swap the paths to use them):
#   data/features_train/features_resnet1000intermediate_train.csv
#   data/features_test/features_resnet1000intermediate_test.csv
csv_options = dict(header = None, index_col = 0)
resnet_train = pd.read_csv("data/features_train/features_resnet1000_train.csv", **csv_options)
resnet_test = pd.read_csv("data/features_test/features_resnet1000_test.csv", **csv_options)

print(resnet_train.shape, resnet_test.shape)

(10000, 1000) (2000, 1000)


In [105]:
# Peek at the first ten rows of the training features.
print(resnet_train.head(10))

                           1         2         3         4         5     \
0                                                                         
images_train/5373.jpg -0.899450 -0.930470 -2.503365 -3.172499 -2.819133   
images_train/984.jpg  -1.346954 -3.119461 -0.765971 -1.382550 -1.104675   
images_train/7127.jpg -3.445498 -1.524573 -1.001654 -3.668335 -1.805517   
images_train/9609.jpg  1.114650 -2.167102  0.097881 -1.336255  0.853483   
images_train/5293.jpg  1.602650 -1.505817  3.029409  4.092412  1.711755   
images_train/3688.jpg -1.598056 -3.197755 -1.103384 -0.969265 -0.756076   
images_train/3340.jpg  3.967041 -2.200422 -0.272736  1.695661  2.074336   
images_train/4787.jpg -1.349009 -0.148385 -1.723681 -2.070523 -2.659292   
images_train/5707.jpg  0.676531 -1.743760  3.400268  2.481813  2.522729   
images_train/1262.jpg -3.897769  0.144037 -3.280854 -3.507650 -0.995550   

                           6         7         8         9         10    \
0                       

In [2]:
def load_descriptions(path):
    """Load every ``*.txt`` description file found directly under ``path``.

    Each file becomes one record: its lines are stripped and joined with a
    single space to form the sentence, and the file name without its
    extension is used as the label.

    Files are visited in sorted path order. ``glob.glob`` itself returns
    matches in arbitrary, filesystem-dependent order, which made the record
    order differ between machines and runs.

    Parameters
    ----------
    path : str
        Directory containing the description files.

    Returns
    -------
    numpy.ndarray
        1-D structured array with object fields ``'sentence'`` and
        ``'label'``; it has shape ``(0,)`` when no file matches.
    """
    data = []
    for filename in sorted(glob.glob(os.path.join(path, "*.txt"))):
        label = os.path.splitext(os.path.basename(filename))[0]
        with open(filename, "r") as description_file:
            sentence = " ".join(line.strip() for line in description_file)
        data.append((sentence, label))

    dt = np.dtype([('sentence', object), ('label', object)])
    return(np.array(data, dtype = dt))

In [79]:
# Directories holding the description files for each split.
train_descriptions_path = "data/descriptions_train/"
test_descriptions_path = "data/descriptions_test/"

# One (sentence, label) record per description file in each directory.
raw_train_data, raw_test_data = (
    load_descriptions(split_path)
    for split_path in (train_descriptions_path, test_descriptions_path)
)

In [80]:
# Record counts for both splits, then the first ten raw training records.
print(raw_train_data.shape, raw_test_data.shape)
print(raw_train_data[:10])

(10000,) (2000,)
[('The skateboarder is putting on a show using the picnic table as his stage. A skateboarder pulling tricks on top of a picnic table. A man riding on a skateboard on top of a table. A skate boarder doing a trick on a picnic table. A person is riding a skateboard on a picnic table with a crowd watching.', '0')
 ('A bowl of soup that has some carrots, shrimp, and noodles in it. The healthy food is in the bowl and ready to eat. Soup has carrots and shrimp in it as it sits next to chopsticks. A tasty bowl of ramen is served for someone to enjoy. Bowl of Asian noodle soup, with shrimp and carrots.', '1')
 ('A bunch of luggage laying on an area rug. Several pieces of luggage on a floor with an area rug. The luggage is sitting on top of the persian rug. a bunch of travel bags sit on a carpet floor Several pieces of luggage that are laying on the floor.', '10')
 ('The browned cracked crust of a baked berry pie. A brown crust of pie with strawberry filing. The top of a pie look

In [81]:
# Shared resources read by preprocess().
punctuation = set(string.punctuation)         # characters stripped from sentences
stop_words = set(stopwords.words('english'))  # words dropped from sentences
wnl = nltk.stem.WordNetLemmatizer()

# POS-tag whitelists: nouns only, or nouns plus adjective and selected verb tags.
keep_pos_nouns = ['NN', 'NNS', 'NNP', 'NNPS']
keep_pos_all = keep_pos_nouns + ['JJ', 'VB', 'VBG', 'VBZ']

def preprocess(data, pos_to_keep = None):
    """Return a cleaned copy of ``data``; the input array is not modified.

    Every value in the ``'sentence'`` field is, in order:

    1. lower-cased,
    2. stripped of every character found in ``punctuation``,
    3. split on single spaces and filtered against ``stop_words``,
    4. lemmatized word by word with ``wnl``,
    5. (only when ``pos_to_keep`` is given) tokenized and POS-tagged with
       NLTK, keeping only tokens whose tag is in ``pos_to_keep``.

    Parameters
    ----------
    data : numpy structured array
        Must expose a ``'sentence'`` field of strings.
    pos_to_keep : container of str, optional
        POS tags to retain. ``None`` (the default) skips the POS filter.

    Returns
    -------
    Copy of ``data`` with the ``'sentence'`` field replaced by the cleaned text.
    """
    # NOTE(review): punctuation is removed before the stop-word check, so stop
    # words that contain an apostrophe presumably no longer match -- confirm
    # whether that is intended.

    def _clean(sentence):
        # All steps for one sentence in a single pass, instead of rebuilding
        # the whole column once per step.
        stripped = ''.join(ch for ch in sentence.lower() if ch not in punctuation)
        kept = (w for w in stripped.split(' ') if w not in stop_words)
        cleaned = ' '.join(wnl.lemmatize(w) for w in kept)

        # Compare with `is not None`: `!= None` goes through __ne__ and is not
        # a reliable identity test for arbitrary containers.
        if pos_to_keep is not None:
            tagged = nltk.pos_tag(nltk.word_tokenize(cleaned))
            cleaned = ' '.join(token for token, tag in tagged if tag in pos_to_keep)
        return cleaned

    out = data.copy()
    out['sentence'] = [_clean(sentence) for sentence in out['sentence']]

    return(out)

In [82]:
# Clean both splits identically, keeping only tokens tagged with keep_pos_all.
train_data, test_data = (
    preprocess(raw_split, keep_pos_all)
    for raw_split in (raw_train_data, raw_test_data)
)

In [83]:
# Record counts after cleaning, then the first ten cleaned training records.
print(train_data.shape, test_data.shape)
print(train_data[:10])

(10000,) (2000,)
[('skateboarder putting show using picnic table stage skateboarder pulling trick top picnic table man riding skateboard top table skate boarder trick picnic table person riding skateboard picnic table crowd watching', '0')
 ('bowl soup carrot shrimp noodle healthy food bowl ready eat soup carrot shrimp sits chopstick tasty bowl ramen someone enjoy bowl asian noodle soup shrimp carrot', '1')
 ('bunch luggage laying area several piece luggage floor area rug luggage sitting top persian rug bunch travel bag sit carpet floor several piece luggage laying floor', '10')
 ('cracked crust berry brown crust pie strawberry filing top pie look good cooked fruit muffin image sort flaky pastery display', '100')
 ('sign clear view traffic light juice theory sign red traffic light bright sign hanging next traffic signal sign electrical post juice sign reading juice theory traffic light', '1000')
 ('cat sleeping duffel bag orange cat black duffel bag cat sitting bag small dog packes bag

In [84]:
# Vocabulary: every distinct whitespace-separated token in the cleaned training sentences.
word_dict = {token for sentence in train_data['sentence'] for token in sentence.split()}

In [90]:
# Vocabulary size, followed by ten entries (a set has no defined order).
print(len(word_dict))
list(word_dict)[:10]

7038


['ludicrous',
 'lacking',
 'roast',
 'trek',
 'mountainous',
 'thre',
 'swim',
 'patriotic',
 'humorous',
 'wellmade']

In [91]:
def vectorize(train_data, test_data):
    """Convert the cleaned sentences of both splits into TF-IDF matrices.

    The vectorizer uses the module-level ``word_dict`` as its fixed
    vocabulary and splits sentences on single spaces. It is fitted on the
    training sentences only, and the test sentences are then transformed with
    that same fitted vectorizer. Calling ``fit_transform`` on the test split
    (as before) re-learned the IDF weights from the test data, so the two
    matrices were weighted differently and test statistics leaked into the
    features.

    Parameters
    ----------
    train_data, test_data : numpy structured arrays
        Must expose a ``'sentence'`` field of strings.

    Returns
    -------
    (X_train, X_test)
        TF-IDF matrices for the training and test sentences, each with one
        column per vocabulary word.
    """
    tfidf = TfidfVectorizer(vocabulary = word_dict,
                            tokenizer = lambda text: text.split(" "))

    train_sentences = train_data['sentence'].tolist()
    test_sentences = test_data['sentence'].tolist()

    # Learn the IDF weights from the training split only ...
    X_train = tfidf.fit_transform(train_sentences)
    # ... and reuse them unchanged for the test split.
    X_test = tfidf.transform(test_sentences)

    return(X_train, X_test)

In [92]:
# Build the TF-IDF matrices and report the shape of each.
X_train, X_test = vectorize(train_data, test_data)
print(*(matrix.shape for matrix in (X_train, X_test)))

(10000, 7038) (2000, 7038)


In [93]:
# First training record next to its TF-IDF row.
first_row = 0
print(train_data[first_row])
print(X_train[first_row])

('skateboarder putting show using picnic table stage skateboarder pulling trick top picnic table man riding skateboard top table skate boarder trick picnic table person riding skateboard picnic table crowd watching', '0')
  (0, 6783)	0.12053926958581132
  (0, 6619)	0.12127189129739563
  (0, 6414)	0.24667824208973552
  (0, 6319)	0.14277759161319625
  (0, 6091)	0.3853043493472325
  (0, 5802)	0.18503232966603594
  (0, 5522)	0.26959294300622977
  (0, 5521)	0.23031291251512154
  (0, 5519)	0.1277550086027136
  (0, 5433)	0.13522916887226213
  (0, 5027)	0.17069388821130024
  (0, 4796)	0.15739505742327461
  (0, 4772)	0.13727613164596256
  (0, 4449)	0.6510591890903679
  (0, 4412)	0.07473838331697687
  (0, 3575)	0.06230190293115674
  (0, 1490)	0.12361014704073833
  (0, 628)	0.158491376463526
