In [3]:
import json
import os
import pickle
import sys
import time                                 
from utils import add_special_tokens    
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import text_hammer as th
from nltk.stem import WordNetLemmatizer

In [4]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Function for Preprocessing

In [5]:
from nltk.corpus import wordnet
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def stemlem(s):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    #l=s.split()
    lem=[]
    t = str(s).lower()
    t=th.remove_emails(t)
    t = th.remove_html_tags(t)
    t = th.remove_urls(t)
    t=lemmatizer.lemmatize(t,get_wordnet_pos(t))
    #s1=" ".join(lem)
    return t
        
        

# Generating the Preprocessed Data and Storing it in Json file Format

In [6]:
#tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
dm_single_close_quote = '\u2019' # unicode
dm_double_close_quote = '\u201d'
# acceptable ways to end a sentence
END_TOKENS = ['.', '!', '?', '...', "'", "`", '"',
              dm_single_close_quote, dm_double_close_quote, ")"]


def fix_missing_period(line):
    """Adds a period to a line that is missing a period"""
    if "@highlight" in line:
        return line
    if line == "":
        return line
    if line[-1] in END_TOKENS:
        return line
    return line + " ."


def get_art_abs(lines):
    """ return as list of sentences"""

    # truncated trailing spaces, and normalize spaces
    lines = [' '.join(line.strip().split()) for line in lines]
    lines = [fix_missing_period(line) for line in lines]

    # Separate out article and abstract sentences
    article_lines = []
    highlights = []
    next_is_highlight = False
    for idx, line in enumerate(lines):
        if line == "":
            continue # empty line
        elif line.startswith("@highlight"):
            next_is_highlight = True
        elif next_is_highlight:
            highlights.append(line)
        else:
            article_lines.append(line)
    #print("Article:",article_lines)
    #print("Abstract:",highlights)
    return ' '.join(article_lines), ' '.join(highlights)


def write_json(i,article, abstract):
    """ Saves a json file."""
    #file = "C:/Users/ASUS/Downloads/gpt2_1024_data/"+str(i)+".json"
    js_example = {}
    js_example['id'] = i
    js_example['article'] = article
    js_example['abstract'] = abstract
    with open(str(i)+".json", 'w') as f:
        json.dump(js_example, f, ensure_ascii=False)
def main(file_names, directory):
    """ Reads txt files, extract articles and summaries, tokenize them and save as json files
        Args:
            file_names: list, all the articles with total no of tokens less than 1024
            directory: string, directory where files in file_names is stored
    """
    tokenizer = add_special_tokens()
    print("Execution Started...")
    train_ids = []
    file_id_map = {}
    i = 0
    for file in file_names[0:2000]:
        file = os.path.join(os.getcwd(),directory,file)
       
        with open(file,'r',encoding='utf-8') as f:
            s=f.read()
            print("Data Before Preprocessing:")
            print()
            print(s)
            s1=stemlem(s)
            lines = s1.split('\n\n')
            print("Data After Preprocessing:")
            print()
            print(lines)
        article, abstract = get_art_abs(lines)
        print("Separating the preprocessed data into Article and Abstract")
        print()
        print("Article:",article)
        print()
        print("Abstract:",abstract)
        article, abstract = tokenizer.encode(article), tokenizer.encode(abstract)
        if len(article)>0 and len(abstract)>0 and (len(article)+len(abstract))<=1023:
            train_ids.append(i)
            write_json(i,article,abstract)
            file_id_map[i] = os.path.basename(file).replace('.story', '')
            #print("ID ",i," ",file_id_map)
            i += 1
            if i%100==0:
                print(i, " files written")


    x,y = int(len(train_ids)*0.8), int(len(train_ids)*0.9)
    valid_ids = train_ids[x:y]
    test_ids = train_ids[y:]
    train_ids = train_ids[:x]
    with open("ids.json",'w') as f:
        js = {}
        js['train_ids'] = train_ids
        js['valid_ids'] = valid_ids
        js['test_ids'] = test_ids
        json.dump(js,f)

    # file_id_map maps the json file ids to actual cnn/dm file names ending with ".story"
    print("saving file_id_map...")

    with open("file_id_map.pickle", 'wb') as f:
        pickle.dump(file_id_map,f)
    print("file_id_map saved.")


if __name__ == '__main__':
    start = time.time()
    with open("C:/Users/ASUS/Downloads/DN/DN_file_size.pickle",'rb') as f:
        file_sizes = pickle.load(f)
    #print(file_sizes)
    file_names = [file for file,size in file_sizes.items() if size<=1023] #only consider files with total no of tokens less than 1024
    #if sys.argv[1].startswith("cnn"):
    directory = 'C:/Users/ASUS/Downloads/Dataset for NLP/dm_stories_tokenized/dm_stories_tokenized'
    #os.mkdir('C:/Users/ASUS/Downloads/DN/gpt2_1024_data')
    os.chdir('C:/Users/ASUS/Downloads/DN/gpt2_1024_data')
    """"else:
        directory = "dm_stories_tokenized"
        os.chdir('./DM/')"""
    main(file_names, directory)
    print("total_time_taken: ", (time.time()-start)/60, " minutes")

Execution Started...
Data Before Preprocessing:

Nicolas Dalby defends his welterweight title against Mohsen Bahari on Saturday - and you can watch it live with Mail Online .

As part of our exciting new partnership with Cage Warriors , we will bring you full coverage from London .

Watch all the action from 6pm .



@highlight

Nicolas Dalby takes on Mohsen Bahari at the Copper Box Arena in London

@highlight

Dalby 's welterweight title will be on the line at the Olympic venue

@highlight

Pannie Kianzad vs Eeva Siiskonen heads a packed undercard
Data After Preprocessing:

['nicolas dalby defends his welterweight title against mohsen bahari on saturday - and you can watch it live with mail online .', 'as part of our exciting new partnership with cage warriors , we will bring you full coverage from london .', 'watch all the action from 6pm .', '', '@highlight', 'nicolas dalby takes on mohsen bahari at the copper box arena in london', '@highlight', "dalby 's welterweight title will be 