In [3]:
import os
import glob
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
def load_descriptions(path, encoding="utf-8"):
    """Load every ``*.txt`` description file in a directory into a structured array.

    Each file becomes one row: the file's lines are stripped of surrounding
    whitespace and joined with single spaces to form the sentence, and the
    file name without its extension is used as the label.

    Args:
        path: Directory that contains the ``*.txt`` description files.
        encoding: Text encoding used to read the files.

    Returns:
        A 1-D numpy structured array with object fields ``'sentence'`` and
        ``'label'``, ordered by file path. Empty (shape ``(0,)``) when the
        directory holds no ``*.txt`` files.
    """
    rows = []
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # keeps row order (and everything derived from it) reproducible.
    for filename in sorted(glob.glob(os.path.join(path, "*.txt"))):
        label = os.path.splitext(os.path.basename(filename))[0]
        with open(filename, "r", encoding=encoding) as description_file:
            sentence = " ".join(line.strip() for line in description_file)
        rows.append((sentence, label))

    dt = np.dtype([('sentence', object), ('label', object)])
    return np.array(rows, dtype=dt)

In [5]:
train_descriptions_path = "data/descriptions_train/"
# The held-out split must be read from its own directory; loading the training
# directory twice would make every downstream "test" result a copy of train.
# NOTE(review): assumes the test descriptions live in data/descriptions_test/ — confirm.
test_descriptions_path = "data/descriptions_test/"

train_data = load_descriptions(train_descriptions_path)
test_data = load_descriptions(test_descriptions_path)

In [None]:
# Sanity check: number of rows in each split, then the first ten training rows.
print(train_data.shape, test_data.shape)
print(train_data[:10])

In [None]:
# Shared text-cleaning resources. The NLTK 'stopwords' and 'wordnet' corpora
# must already be installed locally (e.g. via nltk.download) for these to load.
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))
wnl = nltk.stem.WordNetLemmatizer()

# Translation table that deletes every character of string.punctuation in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _clean_sentence(sentence):
    """Lower-case, strip punctuation, drop stop words and lemmatize one sentence."""
    text = sentence.lower().translate(_PUNCTUATION_TABLE)
    # split() with no argument splits on any whitespace run, so doubled spaces
    # and tabs neither create empty tokens nor glue a stop word to its neighbour.
    kept = (w for w in text.split() if w not in stop_words)
    return ' '.join(wnl.lemmatize(w) for w in kept)


def preprocess(data):
    """Normalize the ``'sentence'`` field of a structured array.

    Every sentence is lower-cased, stripped of ``string.punctuation``
    characters, filtered against the English stop-word set and lemmatized
    token by token. Tokens in the result are separated by single spaces.

    Args:
        data: Array-like supporting ``data['sentence']`` read and assignment,
            whose sentences are ``str`` objects.

    Returns:
        The same ``data`` object; its ``'sentence'`` field is overwritten
        in place, so the caller's array is modified as well.
    """
    data['sentence'] = [_clean_sentence(s) for s in data['sentence']]
    return data

In [151]:
# Clean both splits with the same pipeline (train first, then test).
train_data, test_data = map(preprocess, (train_data, test_data))

[nltk_data] Downloading package stopwords to /Users/walid/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/walid/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [154]:
# Re-inspect the splits after preprocessing: row counts and the first ten training rows.
print(train_data.shape, test_data.shape)
print(train_data[:10])

<1x28 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [None]:
# Vocabulary: every distinct whitespace-separated token in the training sentences.
word_dict = {
    token
    for sentence in train_data['sentence']
    for token in sentence.split()
}

In [None]:
# Vocabulary size, then a preview of ten entries. Set iteration order for
# strings changes between interpreter runs, so sort to keep the preview stable.
print(len(word_dict))
sorted(word_dict)[:10]

In [None]:
def vectorize(train_data, test_data, vocabulary=None):
    """Turn the sentences of both splits into bag-of-words count matrices.

    Args:
        train_data: Structured array with a ``'sentence'`` field (training split).
        test_data: Structured array with a ``'sentence'`` field (test split).
        vocabulary: Iterable of terms that defines the matrix columns.
            Defaults to the module-level ``word_dict``.

    Returns:
        A tuple ``(X_train, X_test)`` of count matrices whose columns follow
        the sorted vocabulary.
    """
    if vocabulary is None:
        vocabulary = word_dict

    # Tokenize with str.split (any whitespace run), the same rule used to build
    # the vocabulary, so every vocabulary term can actually be matched.
    # Sorting fixes the column order explicitly.
    counter = CountVectorizer(vocabulary=sorted(vocabulary),
                              tokenizer=str.split)

    train_sentences = train_data['sentence'].tolist()
    test_sentences = test_data['sentence'].tolist()

    X_train = counter.fit_transform(train_sentences)
    # The test split is only transformed; the vectorizer is never fitted on it.
    X_test = counter.transform(test_sentences)

    return X_train, X_test

In [None]:
# Build the count matrices for both splits and report their shapes.
X_train, X_test = vectorize(train_data=train_data, test_data=test_data)
print(X_train.shape, X_test.shape)

In [None]:
# Show the first training row next to its vectorized counterpart.
print(train_data[0], X_train[0], sep="\n")