In [3]:
import pandas as pd
import math
import copy
import numpy as np 
import itertools
import more_itertools as mit
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
import string
import re

## Corpus Preprocessing

In [1]:
def preprocess_corpus(corpus):
    """Stem every document in ``corpus['TEXT']`` with the Porter stemmer.

    Each document is split with ``word_tokenize``, every token is stemmed,
    and the stems are glued back into one string in which *each* stem is
    followed by a single space (so a non-empty result ends with a space).

    Parameters
    ----------
    corpus : pandas.DataFrame
        Frame with a ``'TEXT'`` column holding one string per row.

    Returns
    -------
    pandas.DataFrame
        The same ``corpus`` object. Its ``'TEXT'`` column is overwritten
        in place, so calling this twice stems already-stemmed text.
    """
    stemmer = PorterStemmer()

    def stem_sentence(sentence):
        # Append the separator after every stem (not between stems) to keep
        # the trailing-space output format this function has always produced.
        return "".join(stemmer.stem(word) + " " for word in word_tokenize(sentence))

    # Column-wise apply instead of row-wise DataFrame.apply(axis=1): on an
    # empty frame the row-wise form returns a DataFrame, which cannot be
    # assigned to a single column.
    corpus['TEXT'] = corpus['TEXT'].apply(stem_sentence)

    return corpus

 ## Query Preprocessing

In [6]:
def preprocess_queries(corpus, queries):
    """Normalise query text and keep only tokens found in the corpus.

    Every entry of ``queries['TEXT']`` goes through, in this order:
    punctuation removal, digit removal, lower-casing, tokenisation,
    English stop-word removal, Porter stemming, and finally a filter that
    drops every token absent from the corpus vocabulary.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Frame with a ``'TEXT'`` column of strings. The vocabulary is the
        set of tokens ``word_tokenize`` yields from that column exactly as
        given; no stemming or case-folding is applied to it here.
        NOTE(review): query tokens are stemmed, so this presumably expects
        the output of ``preprocess_corpus`` -- confirm with the caller.
    queries : pandas.DataFrame
        Frame with a ``'TEXT'`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The same ``queries`` object. Its ``'TEXT'`` column is overwritten
        in place with one list of tokens per row (token order preserved).
    """
    stop = set(stopwords.words('english'))
    ps = PorterStemmer()

    # One translation table strips all punctuation in a single pass.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    digit_runs = re.compile('[0-9]+')

    def corpus_vocab(corpus):
        # A set gives O(1) membership checks; the vocabulary is only ever
        # used for membership, so ordering does not matter.
        vocab = set()
        for text in corpus['TEXT']:
            vocab.update(word_tokenize(text))
        return vocab

    v = corpus_vocab(corpus)

    def clean_query(text):
        text = text.translate(strip_punctuation)  # remove punctuation
        text = digit_runs.sub('', text)           # remove numbers
        text = text.lower()                       # lower case
        tokens = word_tokenize(text)              # tokenize
        tokens = [w for w in tokens if w not in stop]  # stop words
        tokens = [ps.stem(w) for w in tokens]     # stemming
        return [t for t in tokens if t in v]      # keep corpus vocabulary only

    # Single column-wise pass. Series.apply also works on an empty frame,
    # where row-wise DataFrame.apply(axis=1) would return a DataFrame.
    queries['TEXT'] = queries['TEXT'].apply(clean_query)

    return queries

## Test: output of corpus and query preprocessing

In [7]:
# Smoke test, currently disabled: every statement below is commented out.
# Uncommented, it reads two tab-separated files into frames with columns
# ID and TEXT, runs both preprocessing functions, and previews the result.
# The corpus must be preprocessed first because preprocess_queries takes
# the corpus frame as its first argument.
# NOTE(review): the table stored under this cell has list-valued TEXT, so it
# looks like the queries.head() preview from an earlier run -- confirm.
# load corpus 
#corpus = pd.read_csv('nfcorpus/dev.docs', sep='\t', names=['ID', 'TEXT'])

# load queries
#queries = pd.read_csv('nfcorpus/dev.all.queries', sep='\t', names=['ID', 'TEXT'])

#corpus = preprocess_corpus(corpus)
#corpus.head()
#queries = preprocess_queries(corpus,queries)
#queries.head()

Unnamed: 0,ID,TEXT
0,PLAIN-1,"[deep, fri, food, may, caus, cancer, latest, s..."
1,PLAIN-1007,"[ddt, persist, organ, pollut, industri, toxin,..."
2,PLAIN-101,"[treat, multipl, sclerosi, diet, multipl, scle..."
3,PLAIN-1017,"[detoxif, cancer, raw, food, heart, health, he..."
4,PLAIN-1027,"[dietari, guidelin, heart, diseas, cardiovascu..."
