In [26]:
import PyPDF2
import os
from constants import *
import string
import json
import re
from difflib import SequenceMatcher
from spacy_langdetect import LanguageDetector
from spacy.language import Language
import spacy

### Article Cleaning

Note: the sample cleanedArticles and rawArticles folders contain only the first few article folders and the first 10 valid articles within each of those folders (I didn't want to clog the repo, but I wanted to show the output).


In [27]:
def isValidOrder(text: str):
    """Check that "abstract" comes before "introduction" in an article.

    The search is case-sensitive and looks for the lowercase keywords, so
    callers should pass already-lowercased text.

    Args:
        text: The article text to inspect.

    Returns:
        True when the first "introduction" is preceded by an "abstract" and is
        not preceded by "references"; False in every other case (including
        when "introduction" does not occur at all).
    """
    # A single find() per keyword replaces re-scanning a growing prefix of the
    # text for every character position, and also sees keywords that end on
    # the very last character of the text.
    introductionIndex = text.find('introduction')
    if introductionIndex == -1:
        return False

    # references showing up before "introduction" means the layout is not the
    # abstract -> introduction -> references order the cleaning step relies on
    referencesIndex = text.find('references')
    if referencesIndex != -1 and referencesIndex < introductionIndex:
        return False

    abstractIndex = text.find('abstract')
    return abstractIndex != -1 and abstractIndex < introductionIndex

In [None]:
def isEnglish(sentence: str):
    """Report whether the language detector classifies ``sentence`` as English.

    Args:
        sentence: The text to classify.

    Returns:
        True when the detected language code is 'en', otherwise False.
    """
    # Build the pipeline once and keep it on the function object: loading the
    # model and attaching the detector on every call is needlessly expensive.
    nlp = getattr(isEnglish, '_nlp', None)
    if nlp is None:
        # NOTE(review): the 'en' shortcut name and passing a component instance
        # to add_pipe look like the spaCy v2 API - confirm against the
        # installed spaCy version before relying on this cell.
        nlp = spacy.load('en')
        nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
        isEnglish._nlp = nlp
    doc = nlp(sentence)
    return doc._.language["language"] == 'en'

In [28]:

rootdir = 'articles'
# im only taking the first 10 valid articles for each folder here so I can show input and output on github
maxArticlesPerFolder = 10
# maps article file name -> full raw text of every article accepted below
validArticles = {}
for subdir, dirs, files in os.walk(rootdir):
    # Folder name relative to rootdir, used only for the progress message.
    # The root itself is reported as an empty name.
    folderName = os.path.relpath(subdir, rootdir)
    if folderName == os.curdir:
        folderName = ''
    print('reading folder: ' + folderName)
    counter = 0
    for file in files:
        if counter == maxArticlesPerFolder:
            break
        # Build the path from the directory os.walk reported, so nested folders
        # and either path separator resolve correctly.
        with open(os.path.join(subdir, file), 'r', encoding='cp1252', errors='ignore') as f:
            text = " ".join(f.readlines())
        loweredText = text.lower()
        # only look for articles with "abstract", "introduction", and "references" to make cleaning possible
        if all(keyword in loweredText for keyword in ('abstract', 'introduction', 'references')):
            # need to make sure they show up in the right order
            if isValidOrder(loweredText):
                validArticles[file] = text
                counter += 1

reading folder: 
reading folder: 0704
reading folder: 0705
reading folder: 0706
reading folder: 0707
reading folder: 0708
reading folder: 0709
reading folder: 0710
reading folder: 0711
reading folder: 0712
reading folder: 0801
reading folder: 0802
reading folder: 0803
reading folder: 0804
reading folder: 0805
reading folder: 0806
reading folder: 0807
reading folder: 0808
reading folder: 0809
reading folder: 0810
reading folder: 0811
reading folder: 0812
reading folder: 0901
reading folder: 0902
reading folder: 0903


In [29]:
def cleanArticle(text: str):
    """Extract and clean the body of an article.

    Keeps the text between the first "introduction" and the last "references"
    (matched case-insensitively), removes standalone numbers and parenthesised
    passages, and collapses runs of spaces.

    Args:
        text: The raw article text.

    Returns:
        The cleaned body text. If "introduction" is missing the body starts at
        the beginning of ``text``; if "references" is missing it runs to the end.
    """
    loweredText = text.lower()

    # only read from the article's introduction section to its references section
    introductionIndex = loweredText.find('introduction')
    if introductionIndex == -1:
        startingIndex = 0
    else:
        startingIndex = introductionIndex + len('introduction')
    endingIndex = loweredText.rfind('references')
    if endingIndex == -1:
        endingIndex = len(text)
    text = text[startingIndex:endingIndex]

    # get rid of numbers, keep punctuation. A whole run of whitespace-separated
    # numbers is matched at once; matching one number at a time consumes the
    # whitespace the next number needs and leaves every other number behind.
    text = re.sub(r"(?:^|\s)\d+(?:\s\d+)*(?:\s|$)", " ", text)

    # \( and \) target literal parentheses; .*? lazily matches whatever is
    # between them, so each parenthesised passage is removed separately.
    text = re.sub(r"\(.*?\)", "", text)

    # remove extra spaces
    return re.sub(' +', ' ', text)

In [30]:
    # folder prefixes whose output directories have already been ensured
    articleIdentifiers = set()
    for articleName in validArticles:
        # the first four characters of the file name are used as the folder
        # name, so the output mirrors the input articles folder structure
        folderName = articleName[0:4]
        rawFolder = os.path.join('rawArticles', folderName)
        cleanedFolder = os.path.join('cleanedArticles', folderName)

        # make the folder paths if they don't already exist; each one is
        # created independently, so a half-created layout is repaired
        if folderName not in articleIdentifiers:
            os.makedirs(cleanedFolder, exist_ok=True)
            os.makedirs(rawFolder, exist_ok=True)
            articleIdentifiers.add(folderName)

        # writing raw articles to files (only doing this here because im taking the first 10 valid articles)
        text = validArticles[articleName]
        # NOTE(review): no encoding is given here, so the platform default is
        # used, while the cleaned file below is written as utf-8 - confirm
        # which encoding downstream readers expect.
        with open(os.path.join(rawFolder, articleName), "w") as rawFile:
            rawFile.write(text)

        # writing cleaned articles to files
        cleanedText = cleanArticle(text)
        with open(os.path.join(cleanedFolder, articleName), "w", encoding='utf-8') as cleanedFile:
            cleanedFile.write(cleanedText)
