In [3]:
def fn_preprocess_file(file_path, encoding='cp1252'):
    """Read one story file and return its cleaned news text and highlights.

    The file is split at the first line containing ``@highlight``: every line
    before it belongs to the news text, and every later line that is not
    itself an ``@highlight`` marker belongs to the highlights. Each line is
    reduced to letters and digits separated by single spaces; lines that are
    longer than one character after cleaning are terminated with ``'. '`` and
    concatenated.

    Args:
        file_path: Path of the story file to read.
        encoding: Text encoding used to open the file. Bytes that cannot be
            decoded are replaced instead of raising.

    Returns:
        A ``(news, highlights)`` tuple of strings. Either string is empty
        when the corresponding part of the file has no usable line.
    """
    import re

    # Compiled once per call instead of once per line. Raw strings avoid the
    # invalid-escape warning that a plain '\s' literal triggers.
    special_chars = re.compile(r'[^a-zA-Z0-9\n]')
    whitespace_run = re.compile(r'\s+')

    def fn_process_text(text):
        """Replace special characters with spaces and collapse whitespace."""
        text = special_chars.sub(' ', text)
        return whitespace_run.sub(' ', text).strip()

    def fn_simplify_txt(listO_sentences):
        """Clean every line and keep those longer than one character."""
        cleaned = (fn_process_text(line) for line in listO_sentences)
        return [line + '. ' for line in cleaned if len(line) > 1]

    def fn_consolidate_txt(listO_sentences):
        """Concatenate the already terminated sentences into one string."""
        return ''.join(listO_sentences)

    def fn_segregate_news_highlights(file_path, encoding='cp1252'):
        """Split the raw lines of the file into news lines and highlight lines."""
        listO_news, listO_highlights = [], []
        in_highlights = False

        with open(file_path, encoding=encoding, errors='replace') as f:
            for line in f:
                is_marker = '@highlight' in line
                if in_highlights:
                    # Marker lines only separate highlights; they carry no text.
                    if not is_marker:
                        listO_highlights.append(line)
                elif is_marker:
                    # The first marker ends the news part for good.
                    in_highlights = True
                else:
                    listO_news.append(line)

        return listO_news, listO_highlights

    listO_news, listO_highlights = fn_segregate_news_highlights(file_path, encoding)

    news = fn_consolidate_txt(fn_simplify_txt(listO_news))
    highlights = fn_consolidate_txt(fn_simplify_txt(listO_highlights))

    return news, highlights


def fn_parse_folder(folder_path):
    """Preprocess every ``*.story`` file of a folder while showing a progress bar.

    Args:
        folder_path: Folder that directly contains the ``.story`` files. A
            trailing path separator is accepted but not required.

    Returns:
        A ``(dictO_news, dictO_highlights)`` tuple of dicts. Both are keyed
        by the position of the file in the glob listing and hold the news
        text and the highlights returned by ``fn_preprocess_file``. Both are
        empty when the folder contains no ``.story`` file.
    """
    import glob
    import os
    from progressbar import ProgressBar

    # Backslashes are turned into forward slashes so every path has the
    # same shape regardless of how the platform reports it.
    listO_files = [
        found.replace('\\', '/')
        for found in glob.glob(os.path.join(folder_path, '*.story'))
    ]

    dictO_news, dictO_highlights = {}, {}
    if not listO_files:
        # Nothing to process: skip the progress bar entirely.
        return dictO_news, dictO_highlights

    pbar = ProgressBar(max_value=len(listO_files))
    for idx, path in pbar(enumerate(listO_files)):

        news, highlights = fn_preprocess_file(path)
        dictO_news[idx], dictO_highlights[idx] = news, highlights
        pbar.update(idx)

    return dictO_news, dictO_highlights

In [5]:
# Folder that holds the .story files to process.
folder_path = 'C:/Users/wocsa/Documents/Python_Scripts/CNN_NEWS_PROJECT/stories/'

# Preprocess every story in the folder; both dicts are keyed by file index.
dictO_news, dictO_highlights = fn_parse_folder(folder_path)

100% (92579 of 92579) |##################| Elapsed Time: 0:03:41 Time:  0:03:41


In [6]:
# Compare the number of entries in the two result dicts.
len(dictO_news), len(dictO_highlights)

(92579, 92579)

In [7]:
# Display the cleaned news text stored under index 22.
dictO_news[22]

'CNN Criminals who file fraudulent tax returns by stealing people s identities could rake in an estimated 26 billion over the next five years because the IRS cannot keep up with the amount of the fraud Treasury Inspector General J Russell George said Tuesday. Our analysis found that although the IRS detects and prevents a large number of fraudulent refunds based on false income documents there is much fraud that it does not detect said George s prepared testimony before a joint hearing of the House Ways and Means Subcommittees on Oversight and Social Security. George s report is the first detailed analysis of the tax refund fraud problem which could affect any legitimate taxpayer His projection of 26 billion is larger than any other estimate of identity theft tax fraud. In a statement issued following George s testimony the IRS said it believes that the five year estimate is far too high. The estimate was based on 2010 figures which took place before the IRS instituted major changes wi

In [8]:
# Display the cleaned highlights stored under index 22.
dictO_highlights[22]

'The Treasury s estimate is the first detailed analysis of the ongoing problem. With budget cuts the IRS cannot deal with the fraud according to inspector general. IRS says it stopped the issuance of 1 3 billion in potentially fraudulent tax returns. '

In [9]:
# Single test file, given as a relative path.
file_path = 'hehe.txt'

# Run the preprocessing on that one file only.
news, highlights = fn_preprocess_file(file_path)

In [10]:
# Display the news text returned by the last fn_preprocess_file call.
news

'CNN For the second time during his papacy Pope Francis has announced a new group of bishops and archbishops set to become cardinals and they come from all over the world. Pope Francis said Sunday that he would hold a meeting of cardinals on February 14 during which I will name 15 new Cardinals who coming from 13 countries from every continent manifest the indissoluble links between the Church of Rome and the particular Churches present in the world according to Vatican Radio. New cardinals are always important because they set the tone in the church and also elect the next pope CNN Senior Vatican Analyst John L Allen said They are sometimes referred to as the princes of the Catholic Church. The new cardinals come from countries such as Ethiopia New Zealand and Myanmar. This is a pope who very much wants to reach out to people on the margins and you clearly see that in this set Allen said You re talking about cardinals from typically overlooked places like Cape Verde the Pacific island

In [4]:
# Display the highlights returned by the last fn_preprocess_file call.
highlights

'The 15 new cardinals will be installed on February 14. They come from countries such as Myanmar and Tonga. No Americans made the list this time or the previous time in Francis papacy. '

In [83]:
# Folder that holds the trial .story files.
folder_path = 'C:/Users/wocsa/Documents/Python_Scripts/CNN_NEWS_PROJECT/trial/'

# Collect every .story path with backslashes turned into forward slashes.
# The substitution happens inside the loop, so the cell also works when no
# file matches (the former stand-alone `pattern.sub("/", file)` call threw
# its result away and failed on an unbound `file` in that case).
pattern = re.compile(r"\\")
listO_files = []
for file in glob.glob(folder_path + '*.story'):
    listO_files.append(pattern.sub("/", file))

# Display the collected paths.
listO_files

['C:/Users/wocsa/Documents/Python_Scripts/CNN_NEWS_PROJECT/trial/000c835555db62e319854d9f8912061cdca1893e.story',
 'C:/Users/wocsa/Documents/Python_Scripts/CNN_NEWS_PROJECT/trial/000ca3fc9d877f8d4bb2ebd1d6858c69be571fd8.story',
 'C:/Users/wocsa/Documents/Python_Scripts/CNN_NEWS_PROJECT/trial/000cd1ee0098c4d510a03ddc97d11764448ebac2.story',
 'C:/Users/wocsa/Documents/Python_Scripts/CNN_NEWS_PROJECT/trial/000e009f6b1d954d827c9a550f3f24a5474ee82b.story',
 'C:/Users/wocsa/Documents/Python_Scripts/CNN_NEWS_PROJECT/trial/00a2aef1e18d125960da51e167a3d22ed8416c09.story',
 'C:/Users/wocsa/Documents/Python_Scripts/CNN_NEWS_PROJECT/trial/00a340f3a884fcbdad7c0399782d9ca9d2d68ef7.story',
 'C:/Users/wocsa/Documents/Python_Scripts/CNN_NEWS_PROJECT/trial/00a39c134080b6f215a81c15d46c3ac7cc7bdcf3.story',
 'C:/Users/wocsa/Documents/Python_Scripts/CNN_NEWS_PROJECT/trial/00a51d5454f2ef7dbf4c53471223a27fb9c20681.story',
 'C:/Users/wocsa/Documents/Python_Scripts/CNN_NEWS_PROJECT/trial/00a57bef588a7e7efd0438f

In [97]:
# Turn any backslash left in the collected paths into a forward slash.
# The former stand-alone `pattern.sub("/", file)` call was dropped: its
# result was discarded and it depended on a leftover `file` variable.
pattern = re.compile(r"\\")
listO_files = [pattern.sub("/", file) for file in listO_files]

# Preprocess each file and store both results under the file's index.
dictO_news, dictO_highlights = {}, {}
for idx, path in enumerate(listO_files):

    news, highlights = fn_preprocess_file(path)
    dictO_news[idx], dictO_highlights[idx] = news, highlights

In [98]:
# Display the cleaned news text stored under index 3.
dictO_news[3]

'CNN Kyle White now has two pieces of metal to wear one a bracelet inscribed with the names of his six comrades killed in an ambush in Afghanistan the other a Medal of Honor given to him for his valor that ensured that death toll wasn t higher. Speaking minutes after President Barack Obama gave him the highest military honor White insisted the two emblems are equally significant They both represent his family on that day six years ago the seven others who like him survived as well as those who did not. The former Army sergeant said Tuesday he owes it to these men whom he calls my heroes to live his life well even now that he s left the military and with honor. Though I am still uncomfortable with hearing my name and the word hero in the same sentence I am now ready for the challenge of proudly wearing this piece of blue fabric and carved metal with the same reverence that I wear the bracelet And I vow to live up to the responsibility of doing so White said. Not long before Obama recall

In [99]:
# Display the cleaned highlights stored under index 3.
dictO_highlights[3]

'NEW Kyle White Without this team there would be no Medal of Honor. NEW He vows to live up to the responsibility of having the top military award. NEW Obama calls White a soldier who embodies the courage of his generation. The Army vet then 20 braved enemy fire to save his wounded comrades in Afghanistan. '

In [101]:
def fn_parse_folder(folder_path):
    """Preprocess every ``*.story`` file found directly inside a folder.

    Args:
        folder_path: Folder that directly contains the ``.story`` files. A
            trailing path separator is accepted but not required.

    Returns:
        A ``(dictO_news, dictO_highlights)`` tuple of dicts. Both are keyed
        by the position of the file in the glob listing and hold the news
        text and the highlights returned by ``fn_preprocess_file``. Both are
        empty when the folder contains no ``.story`` file.
    """
    import glob
    import os

    # Backslashes are turned into forward slashes so every path has the
    # same shape regardless of how the platform reports it.
    listO_files = [
        found.replace('\\', '/')
        for found in glob.glob(os.path.join(folder_path, '*.story'))
    ]

    dictO_news, dictO_highlights = {}, {}
    for idx, path in enumerate(listO_files):
        dictO_news[idx], dictO_highlights[idx] = fn_preprocess_file(path)

    return dictO_news, dictO_highlights

In [102]:
# Folder that holds the trial .story files.
folder_path = 'C:/Users/wocsa/Documents/Python_Scripts/CNN_NEWS_PROJECT/trial/'

# Preprocess every story in the folder; both dicts are keyed by file index.
dictO_news, dictO_highlights = fn_parse_folder(folder_path)

In [103]:
# Display the cleaned highlights stored under index 3.
dictO_highlights[3]

'NEW Kyle White Without this team there would be no Medal of Honor. NEW He vows to live up to the responsibility of having the top military award. NEW Obama calls White a soldier who embodies the courage of his generation. The Army vet then 20 braved enemy fire to save his wounded comrades in Afghanistan. '