In [151]:
import os
import glob
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer



def load_data(path):
    data = []
    for filename in glob.glob(os.path.join(path, "*.txt")):
        with open(filename, "r") as description_file:
            label = int(filename[len(path):-4])
            sentence = " ".join(line.strip() for line in description_file)
            newrow = (sentence,label) 
            data.append(newrow)

    dt = np.dtype([('sentence', object), ('label', int)])
    return(np.array(data,dtype=dt))

            
nltk.download('stopwords')
stopwords.words('english')
nltk.download('wordnet')


def preprocess(data):
    stop_words = set(stopwords.words('english'))
    data['sentence'] = list(map(lambda x:x.lower(), data['sentence']))
    punctuation = set(string.punctuation)
    data['sentence'] = list(map(lambda x:''.join(ch for ch in x if ch not in punctuation), data['sentence']))
    data['sentence'] = list(map(lambda x:' '.join(w for w in x.split(' ') if w not in stop_words), data['sentence']))
    wnl = nltk.stem.WordNetLemmatizer()
    data['sentence'] = list(map(lambda x:' '.join(wnl.lemmatize(w) for w in x.split(' ')), data['sentence']))
    return(data)
    

def vectorize(train_data, test_data):
    word_dict = set()
    for s in train_data['sentence']:
      words = s[0].split()
      for w in words:
        word_dict.add(w)

    # generate feature matrix
    CntVec = CountVectorizer(vocabulary = word_dict, 
                 tokenizer = lambda str: str.split(" "))
    tr_d = [item for sublist in train_data['sentence'].tolist() for item in sublist] 
    te_d = [item for sublist in test_data['sentence'].tolist() for item in sublist] 
    X_train = CntVec.fit_transform(tr_d)
    X_test = CntVec.fit_transform(te_d)
    return(X_train, X_test)
    
path = "/Users/walid/Documents/Applied_Machine_Learning/Final/Data/descriptions_train/"
train_data = load_data(path)
path = "/Users/walid/Documents/Applied_Machine_Learning/Final/Data/descriptions_test/"
test_data = load_data(path)


train_data = preprocess(train_data)
test_data =  preprocess(test_data)

X_train, X_test = vectorize(train_data, test_data)

[nltk_data] Downloading package stopwords to /Users/walid/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/walid/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<1x28 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>