In [131]:
# Import and clean the data
# Dataset: DUC2001
# Import the libraries
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import re

In [139]:

def _normalise_duc_text(raw_document, text_tokenizer, stop_words, stemmer, lemmatizer):
    '''
    Turn the raw contents of one DUC file into a cleaned, space-joined token string.

    Steps: drop quote characters, keep only what the tokenizer captures between
    <TEXT> and </TEXT>, lower-case, split on whitespace, remove stop words,
    stem, lemmatize, and remove stop words once more.
    '''
    for quote_char in ('\'', '\"', '`'):
        raw_document = raw_document.replace(quote_char, '')
    body = " ".join(text_tokenizer.tokenize(raw_document))
    tokens = [w for w in body.lower().split() if w not in stop_words]
    # Stem first, then lemmatize the stem (same order as the two separate passes).
    tokens = [lemmatizer.lemmatize(stemmer.stem(w)) for w in tokens]
    # Stemming/lemmatizing can map a word onto a stop word, so filter again.
    tokens = [w for w in tokens if w not in stop_words]
    return " ".join(tokens)


def _read_duc_summary(summary_path):
    '''
    Return the second line of a summary file, with the newline and the
    'Abstract:' label removed.
    '''
    with open(summary_path, 'r') as summary_reader:
        summary_reader.readline()  # the first line is skipped
        summary = summary_reader.readline()
    return summary.replace('\n', '').replace('Abstract:', '')


def read_DUC_document(path: str, verbose: bool = True) -> pd.DataFrame:
    '''
    Read and clean every DUC document found under a corpus folder.

    Parameters
    ----------
    path : str
        Corpus root. Documents are read from `<path>/documents` and each
        summary from `<path>/Summaries/<lower-cased document name>.txt`.
    verbose : bool, default True
        When True, print one line per document being cleaned.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns 'Document' (file name),
        'Summary' (second line of the summary file, '' when it could not be
        read) and 'Text' (cleaned document text).

    Notes
    -----
    Only unreadable summary files (OSError / UnicodeError) are counted as
    exceptions; any other error is raised instead of being silently swallowed.
    '''
    documents_dir = os.path.join(path, 'documents')
    summaries_dir = os.path.join(path, 'Summaries')

    # Sorted so the row order is reproducible (os.listdir order is arbitrary);
    # sub-directories such as .ipynb_checkpoints are skipped because they
    # cannot be opened as documents.
    files = sorted(
        name for name in os.listdir(documents_dir)
        if os.path.isfile(os.path.join(documents_dir, name))
    )

    # Loop-invariant NLP resources are built once.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    ps = PorterStemmer()
    text_tokenizer = RegexpTokenizer('<TEXT>(.*?)</TEXT>')

    exception_count = 0
    records = []
    for name in files:
        if verbose:
            print('cleaning document: ', name)
        # Read and close the file before doing the token processing.
        with open(os.path.join(documents_dir, name), 'r') as f:
            raw_document = f.read()
        text = _normalise_duc_text(raw_document, text_tokenizer, stop_words, ps, lemmatizer)
        try:
            summary = _read_duc_summary(os.path.join(summaries_dir, name.lower() + '.txt'))
        except (OSError, UnicodeError):
            # Missing or unreadable summary: keep the document with an empty summary.
            exception_count += 1
            summary = ''
        records.append((name, summary, text))
    print('Number of exceptions: ', exception_count)
    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=['Document', 'Summary', 'Text'])



In [140]:
# Root folder of the DUC2001 corpus; the loader reads its `documents` and `Summaries` sub-folders.
DUC_ROOT = './DUC2001/'
duc_data = read_DUC_document(path=DUC_ROOT)

cleaning document:  AP830325-0143
cleaning document:  AP880217-0175
cleaning document:  AP880318-0051
cleaning document:  AP880330-0119
cleaning document:  AP880331-0140
cleaning document:  AP880409-0015
cleaning document:  AP880419-0131
cleaning document:  AP880510-0178
cleaning document:  AP880517-0226
cleaning document:  AP880520-0264
cleaning document:  AP880601-0040
cleaning document:  AP880613-0161
cleaning document:  AP880623-0135
cleaning document:  AP880629-0159
cleaning document:  AP880630-0295
cleaning document:  AP880705-0006
cleaning document:  AP880705-0018
cleaning document:  AP880705-0109
cleaning document:  AP880714-0142
cleaning document:  AP880801-0195
cleaning document:  AP880811-0299
cleaning document:  AP880816-0234
cleaning document:  AP880901-0052
cleaning document:  AP880902-0062
cleaning document:  AP880903-0092
cleaning document:  AP880913-0129
cleaning document:  AP880913-0204
cleaning document:  AP880914-0027
cleaning document:  AP880914-0079
cleaning docum

Let's inspect the resulting DataFrame: each row holds a document name, its summary, and its cleaned text.

In [122]:
# Preview the first rows of the loaded frame (Document, Summary and Text columns).
duc_data.head()

Unnamed: 0,Document,Summary,Text
0,AP830325-0143,,million gallon crude oil spill tanker ran agro...
1,AP880217-0175,Some 40 members of Congress have joined with t...,coalit member congress announc wednesday plan ...
2,AP880318-0051,"Multitudes of native peoples, tourists and sci...","thousand peol prayed, cheered, danced, beat dr..."
3,AP880330-0119,Population experts say that little would chang...,"two side tri forc chang 1990 censu get way, re..."
4,AP880331-0140,The unofficial tornado season runs from April ...,rumbl spring thunderstorm announc begin unoffi...


In [130]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import string
# Compiled once: any run of characters that are not lowercase ASCII letters.
_NON_LOWER_ALPHA = re.compile(r'[^a-z]+')
# Filled on first use so the stop-word corpus is loaded once, not once per row.
_CLEAN_TEXT_CACHE = {}


def _clean_text_resources():
    '''
    Return (stop_words, stemmer, lemmatizer), creating them on the first call.
    '''
    if not _CLEAN_TEXT_CACHE:
        _CLEAN_TEXT_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_CACHE['stemmer'] = PorterStemmer()
        _CLEAN_TEXT_CACHE['lemmatizer'] = WordNetLemmatizer()
    return (
        _CLEAN_TEXT_CACHE['stop_words'],
        _CLEAN_TEXT_CACHE['stemmer'],
        _CLEAN_TEXT_CACHE['lemmatizer'],
    )


def clean_text(sentence):
    '''
    Normalise a piece of text into a space-joined string of stemmed tokens.

    The text is lower-cased, every run of non a-z characters becomes a single
    space, the result is tokenized with `word_tokenize`, stop words and tokens
    shorter than three characters are dropped, and each remaining token is
    stemmed and then lemmatized.

    Parameters
    ----------
    sentence : str
        Text to clean.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' when nothing survives).
    '''
    stop_words, stemmer, lemmatizer = _clean_text_resources()

    # Only a-z and spaces remain after this step, so no separate punctuation
    # filter is needed afterwards.
    sentence = _NON_LOWER_ALPHA.sub(' ', sentence.lower()).strip()

    # Tokenize
    word_list = word_tokenize(sentence)

    # Remove stop words and very small words (length < 3).
    word_list = [
        word for word in word_list
        if word not in stop_words and len(word) > 2
    ]

    # Stem first, then lemmatize the stem.
    word_list = [lemmatizer.lemmatize(stemmer.stem(word)) for word in word_list]

    # list to sentence
    return ' '.join(word_list)



100%|██████████| 309/309 [00:06<00:00, 49.29it/s]


In [141]:
# Register `progress_apply` on pandas objects so the cleaning pass reports progress.
tqdm.pandas()

# Clean every document body; `str(...)` converts each cell to a string before cleaning.
duc_data['Text'] = duc_data['Text'].progress_apply(lambda doc: clean_text(str(doc)))

100%|██████████| 309/309 [00:07<00:00, 43.00it/s]


In [142]:
# Clean every summary the same way; `str(...)` converts each cell to a string before cleaning.
duc_data['Summary'] = duc_data['Summary'].progress_apply(lambda doc: clean_text(str(doc)))

100%|██████████| 309/309 [00:01<00:00, 224.49it/s]
