In [1]:

import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt') 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import os
import string
import copy
import pickle

[nltk_data] Downloading package stopwords to /Users/zeez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/zeez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
title = "20_newsgroups"
os.chdir("/Users/zeez/Desktop/20_newsgroups")

In [3]:
paths = []
for (dirpath, dirnames, filenames) in os.walk(str(os.getcwd())+'/'+title+'/'):
    for i in filenames:
        paths.append(str(dirpath)+str("/")+i)

In [4]:
print(dirpath)


/Users/zeez/Desktop/20_newsgroups/20_newsgroups/alt.atheism


In [5]:
#Removing stop words 
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words:
            new_text = new_text + " " + w
    return np.char.strip(new_text) 

#Removing punctuation
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, " ", " ")
    data = np.char.replace(data, ',', '')
    return data 

#Convert to lowercase
def convert_lower_case(data):
    return np.char.lower(data)

#Stemming
def stemming(data):
    stemmer= PorterStemmer()

    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return np.char.strip(new_text) 

#Converting numbers to its equivalent words 
def convert_numbers(data):
    data = np.char.replace(data, "0", " zero ")
    data = np.char.replace(data, "1", " one ")
    data = np.char.replace(data, "2", " two ")
    data = np.char.replace(data, "3", " three ")
    data = np.char.replace(data, "4", " four ")
    data = np.char.replace(data, "5", " five ")
    data = np.char.replace(data, "6", " six ")
    data = np.char.replace(data, "7", " seven ")
    data = np.char.replace(data, "8", " eight ")
    data = np.char.replace(data, "9", " nine ")
    return data 

#Removing header 
def remove_header(data):
    try:
        ind = data.index('\n\n')
        data = data[ind:]
    except:
        print("No Header")
    return data 

#Removing apostrophe 
def remove_apostrophe(data):
    return np.char.replace(data, "'", "") 

#Removing single characters 
def remove_single_characters(data):
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if len(w) > 1:
            new_text = new_text + " " + w
    return np.char.strip(new_text)

In [6]:

def preprocess(data, query):
    if not query:
        data = remove_header(data) 
        data = convert_lower_case(data)
        data = convert_numbers(data)
        data = remove_punctuation(data)
        data = remove_stop_words(data)
        data = remove_apostrophe(data)
        data = remove_single_characters(data)
        data = stemming(data) 
    return data

In [7]:
paths[0]

'/Users/zeez/Desktop/20_newsgroups/20_newsgroups/alt.atheism/51120'

In [8]:
doc = 0
postings = pd.DataFrame()

for path in paths:
    file = open(path, 'r', encoding='cp1250')
    text = file.read().strip()
    file.close()
    preprocessed_text = preprocess(text, False)
    
    #Genrate matrex posting list
    if doc%100 == 0:
        print(doc)
    tokens = word_tokenize(str(preprocessed_text))
    for token in tokens:
        if token in postings:
            p = postings[token][0]
            p.add(doc)
            postings[token][0] = p 
        else:
            postings.insert(value=[{doc}], loc=0, column=token)
    doc += 1 

#Save the output:
postings.to_pickle(title + "_unigram_postings")

0


In [9]:
postings


Unnamed: 0,purdu,ecn,smullin,scott,prose,unclear,accus,worst,misinterpret,mere,...,ra,recent,write,philosoph,becom,king,edu,unh,kepler,dmn
0,{18},{18},{18},{18},{18},{18},{18},{18},{18},{18},...,{0},"{0, 18, 6}","{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",{0},"{0, 1}","{0, 18}","{0, 4, 7, 10, 11, 14, 15, 17, 18}",{0},{0},{0}


In [10]:
def get_word_postings(word):
    preprocessed_word=str(preprocess(word,True))
    print(preprocessed_word)
    print("frequency:", len(postings[preprocessed_word][0]))
    print("postings list:",postings[preprocessed_word][0])

get_word_postings("nine")
    

nine
frequency: 5
postings list: {5, 7, 8, 12, 15}


In [11]:
def mylistdir(directory):
    """A specialized version of os.listdir() that ignores files that
    start with a leading period."""
    filelist = os.listdir(directory)
    return [x for x in filelist
            if not (x.startswith('.'))]

In [12]:
s1 = postings['one'][0]
s2 = postings['nine'][0]
s3 = postings['exam'][0]
print(s1)
print(s2)
print(s3)

print('one AND nine AND exam = ', s1 & s2 & s3)

{0, 5, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18}
{5, 7, 8, 12, 15}
{14}
one AND nine AND exam =  set()
