In [1]:
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

import os

import re
import itertools

import pandas as pd

from tqdm import tqdm

In [2]:
path = 'C:/Users/VENNELA/Downloads/CNN_with_textdata/docs'

files = os.listdir(path)

In [3]:
def filter_person(chunk):
    txt=[]
    for i in chunk:
        if (type(i)==Tree):
            if(i.label()=='GPE'):
                m=[]
                for j in i:
                    m.append(j[0])
                txt.append('_'.join(m))
        else:
            txt.append(i[0])
    return ' '.join(txt)

In [4]:
def preprocess(input_text):
        
    text = input_text
    
    # 1. Email
    emails = re.findall('([a-zA-Z0-9\.\-_]*@[a-zA-Z0-9\._-]*)', text)
    email_domains = re.findall('[a-zA-Z0-9\.\-_]*@([a-zA-Z0-9\._-]*)', text)
    domain = []
    for j in email_domains:
        domain.append(j.split('.'))
       
    sub_domain = [[sub for sub in ele if len(sub) > 2 and sub != 'com'] for ele in domain] 

    email_ext = list(itertools.chain(*sub_domain))
    email = " ".join(email_ext)
    
    # 2. Replace all the emails by space in the original text
    text = re.sub(r'([a-zA-Z0-9\.\-_]*@[a-zA-Z0-9\._-]*)','', text)
    
    # 3. Subject
    subject = re.findall(r'Subject:.*',text)
    subject = subject[0].split(": ")[-1]
    subject = re.sub('[^\w\s]|\r|\t|\n','', subject)
    subject = re.sub('\s\s',' ', subject).strip()
        
    # 4. Replace Subject by space in the original text 
    text = re.sub(r'Subject:.*',"",text)
    
    # 5. Delete all the sentences where sentence starts with "Write to:" or "From:"   
    text = re.sub(r'Write to:.*|From:.*',"",text)
    
    # 6. Delete all the tags like "< anyword >"
    text = re.sub(r'<.*>',"",text)
    
    # 7. Delete all the data which are present in the brackets. 
    text = re.sub(r'\(.*\)', "", text)
    
    # 8. Remove all the "-", newlines('\n'), tabs('\t'), "\".
    text = re.sub(r'[\-|\n|\t|\\|\s]', " ", text)
    
    # 9. Remove all the words which ends with ":".
    text = re.sub(r'\w*:', "", text)
    
    # 10. Decontractions, replace words like below to full words. 
    # CODE Reference - https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"shouldn\'t", "should not", text)
    text = re.sub(r"wouldn\'t", "would not", text)
    text = re.sub(r"isn\'t", "is not", text)
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"n\'t", "not", text)
    text = re.sub(r"it\'s", "it is", text)
    text = re.sub(r"let\'s", "let us", text)
    text = re.sub(r"\'ve", "have", text)
    text = re.sub(r"\'m", "am", text)
    text = re.sub(r"\'re", "are", text)
    text = re.sub(r"\'ve", "have", text)
    text = re.sub(r"\'ll", "will", text)
    text = re.sub(r"'cause", "because", text)
    text = re.sub(r"ma'am", "madam",text)
    text = re.sub(r"\'d", "had", text)
    text = re.sub(r"shan't", "shall not", text)
        
    # 11. Chunking
    
    chunks = ne_chunk(pos_tag(word_tokenize(text)), binary = True)
    text = filter_person(chunks)    
    
    # 12. Delete all the special characters
    text = re.sub(r'[><,!\*\.\?;=\()/\$`\'\[|\]]',"",text)
    
    # 13. Delete all the digits.
    text = re.sub(r'[0-9]', "", text)
    
    # 14. Remove the _ from '_word_', '_word', 'word_' words. 
#     text = re.sub(r'_([a-zA-z]+)_', r'\1', text) # replace _word_ to word
    text = re.sub(r'(_?)([a-zA-z]+)(_?)', r'\2', text) # replace _word to word
    
    text = re.sub(r"\s+"," ",text)
    
    # 15. Remove the words which are <= 2 in length after spliting words by "_".
    # CODE Reference - https://stackoverflow.com/questions/24332025/remove-words-of-length-less-than-4-from-string
    text = re.sub(r'\s_|_\s|\"', '', text)
    text = re.sub(r'\b\w{1,2}\b', '', text)
    
    # 16. Convert all words into lower case and remove words which are >= 15 in length.
    text = re.sub(r'\b\w{15,}\b', '', text)
    text = text.lower()
    
    # 17. Replace all the words except "A-Za-z_" with space. 
    text = re.sub(r'[^A-Za-z_\s]*', "", text)
    
    # 18. return preprocessed email, subject, text.
    return email, subject, text

In [5]:
classes = []
p_emails = []
p_subject = []
p_text = []

for file in tqdm(files):
    with open('docs/' +file, encoding = "utf8", errors = 'ignore') as document:
        classes.append(file.split('_')[0])
        email, subject, text = preprocess(document.read())
        p_emails.append(email)
        p_subject.append(subject)
        p_text.append(text)
len(classes), len(p_emails), len(p_subject), len(p_text)

100%|████████████████████████████████████████████████████████████████████████████| 18828/18828 [20:34<00:00, 15.25it/s]


(18828, 18828, 18828, 18828)

In [6]:
documents_name = os.listdir('./docs')

data = []
final_data = []

for doc in documents_name:
    with open('./docs/' +str(doc), 'r', encoding = 'utf8', errors = 'replace')as file:
        file_content = file.read()
        data.append(file_content)
        
final_data.append(data)

In [7]:
data = pd.DataFrame({"text": final_data[0], 
                     "class": classes, 
                     "preprocessed_email" : p_emails, 
                     "preprocessed_subject" : p_subject,
                     "preprocessed_text": p_text})

In [8]:
data

Unnamed: 0,text,class,preprocessed_email,preprocessed_subject,preprocessed_text
0,From: mathew <mathew@mantis.co.uk>\nSubject: A...,alt.atheism,mantis netcom mantis,Atheist Resources,archive alt atheism archive resources last de...
1,From: mathew <mathew@mantis.co.uk>\nSubject: A...,alt.atheism,mantis mantis mantis,Introduction to Atheism,alt atheism archive introduction last april p...
2,From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...,alt.atheism,dbstu1 tu-bs mimsy umd edu umd edu,Gospel Dating,article well has quite different not necessa...
3,From: mathew <mathew@mantis.co.uk>\nSubject: R...,alt.atheism,mantis kepler unh edu,university violating separation of churchstate,recently have been ordered and none have resi...
4,From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...,alt.atheism,Watson Ibm Com harder ccr-p ida org harder ccr...,socmotss et al Princeton axes matching funds f...,article however hate economic terrorism and ...
...,...,...,...,...,...
18823,From: sbuckley@fraser.sfu.ca (Stephen Buckley)...,talk.religion.misc,fraser sfu magnus acs ohio-state edu,Religion and marriage,wasnot sure this was the right newsgroup p...
18824,From: bakerj@gtephx.UUCP (Jon Baker)\nSubject:...,talk.religion.misc,gtephx UUCP ifi uio ifi uio ncratl AtlantaGA N...,How do you know what happened,article article probably not but then dono...
18825,From: pharvey@quack.kfu.com (Paul Harvey)\nSub...,talk.religion.misc,quack kfu emx utexas edu emx utexas edu,Why did they behave as they did Wacoreading su...,article you would like understand better th...
18826,From: <KEVXU@CUNYVM.BITNET>\nSubject: Re: Info...,talk.religion.misc,CUNYVM BITNET digi lonestar org digi lonestar org,Info about New Age,article the danger anti cult groups that wh...


_The output of the input text in 'alt.atheism_49960' document after preprocessing._

In [9]:
print(data['preprocessed_email'][0])

mantis netcom mantis


In [10]:
print(data['preprocessed_subject'][0])

Atheist Resources


In [11]:
print(data['preprocessed_text'][0])

archive  alt atheism archive resources last december  usa freedom from religion foundation darwin fish bumper stickers and assorted other atheist paraphernalia are available from the  the evolution designs evolution designs sell the darwin fish    fish symbol like the ones christians stick  their cars but with feet and the word darwin written inside the deluxe moulded  plastic fish  postpaid  the  people  the area can get from try mailing for net people who   directly the price  per fish press aap publish various atheist books critiques  the lists  contradictions and   one such book the   ball and  foote  isbn  edition bible contradictions absurdities atrocities immoralities contains the based  the king james version  the bible  prometheus books sell books including  holy horrors  alternate address prometheus books glenn drive  african humanism  organization promoting black secular humanism and uncovering the history  black freethought they publish  quarterly newsletter  press associat

In [12]:
data.to_pickle("./preprocessed_data")

In [13]:
# https://www.codegrepper.com/code-examples/python/read+a+text+file+in+python+and+dump+the+contents

my_dataframe = pd.read_pickle("preprocessed_data")

In [14]:
my_dataframe.shape

(18828, 5)

In [15]:
my_dataframe

Unnamed: 0,text,class,preprocessed_email,preprocessed_subject,preprocessed_text
0,From: mathew <mathew@mantis.co.uk>\nSubject: A...,alt.atheism,mantis netcom mantis,Atheist Resources,archive alt atheism archive resources last de...
1,From: mathew <mathew@mantis.co.uk>\nSubject: A...,alt.atheism,mantis mantis mantis,Introduction to Atheism,alt atheism archive introduction last april p...
2,From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...,alt.atheism,dbstu1 tu-bs mimsy umd edu umd edu,Gospel Dating,article well has quite different not necessa...
3,From: mathew <mathew@mantis.co.uk>\nSubject: R...,alt.atheism,mantis kepler unh edu,university violating separation of churchstate,recently have been ordered and none have resi...
4,From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...,alt.atheism,Watson Ibm Com harder ccr-p ida org harder ccr...,socmotss et al Princeton axes matching funds f...,article however hate economic terrorism and ...
...,...,...,...,...,...
18823,From: sbuckley@fraser.sfu.ca (Stephen Buckley)...,talk.religion.misc,fraser sfu magnus acs ohio-state edu,Religion and marriage,wasnot sure this was the right newsgroup p...
18824,From: bakerj@gtephx.UUCP (Jon Baker)\nSubject:...,talk.religion.misc,gtephx UUCP ifi uio ifi uio ncratl AtlantaGA N...,How do you know what happened,article article probably not but then dono...
18825,From: pharvey@quack.kfu.com (Paul Harvey)\nSub...,talk.religion.misc,quack kfu emx utexas edu emx utexas edu,Why did they behave as they did Wacoreading su...,article you would like understand better th...
18826,From: <KEVXU@CUNYVM.BITNET>\nSubject: Re: Info...,talk.religion.misc,CUNYVM BITNET digi lonestar org digi lonestar org,Info about New Age,article the danger anti cult groups that wh...


In [16]:
# CODE Reference - https://datascientyst.com/combine-multiple-columns-into-single-one-in-pandas/

cols = ['preprocessed_email', 'preprocessed_subject', 'preprocessed_text']

my_dataframe['preprocessed_data'] = my_dataframe[cols].agg(lambda x: ' '.join(x.values), axis=1).T

In [17]:
my_dataframe.shape

(18828, 6)

In [18]:
my_dataframe.head()

Unnamed: 0,text,class,preprocessed_email,preprocessed_subject,preprocessed_text,preprocessed_data
0,From: mathew <mathew@mantis.co.uk>\nSubject: A...,alt.atheism,mantis netcom mantis,Atheist Resources,archive alt atheism archive resources last de...,mantis netcom mantis Atheist Resources archive...
1,From: mathew <mathew@mantis.co.uk>\nSubject: A...,alt.atheism,mantis mantis mantis,Introduction to Atheism,alt atheism archive introduction last april p...,mantis mantis mantis Introduction to Atheism ...
2,From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...,alt.atheism,dbstu1 tu-bs mimsy umd edu umd edu,Gospel Dating,article well has quite different not necessa...,dbstu1 tu-bs mimsy umd edu umd edu Gospel Dati...
3,From: mathew <mathew@mantis.co.uk>\nSubject: R...,alt.atheism,mantis kepler unh edu,university violating separation of churchstate,recently have been ordered and none have resi...,mantis kepler unh edu university violating sep...
4,From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...,alt.atheism,Watson Ibm Com harder ccr-p ida org harder ccr...,socmotss et al Princeton axes matching funds f...,article however hate economic terrorism and ...,Watson Ibm Com harder ccr-p ida org harder ccr...
