# Clean Data

Scrub text, remove stopwords, and delete duplicate files.  
Generates a new text_files_clean directory.

-  v0.1 2016-10-19 split from new_dfr_project v.05 2016-10-19
-  v0.2 2016-10-24 added metadata_dfrb.ipynb
-  v0.3 2016-10-26 revised corpus_compare.py and added duplicate deleter step
-  v0.4 2017-10-30 switch to Python 3
-  v0.5 2017-10-31 merge metadata_dfrb.ipynb into notebook
-  v0.6 2017-10-31 relocate all caches
-  v0.7 2017-10-31 global settings file


In [None]:
## INFO
## Notebook metadata. __version__ tracks the latest entry in the version
## list at the top of this notebook (currently v0.7).

__author__    = 'Jeremy Douglass'
__copyright__ = 'copyright 2016, The WE1S Project'
__license__   = 'GPL'
__version__   = '0.7'
__email__     = 'jeremydouglass@gmail.com'


In [None]:
## IMPORT


In [None]:
## SETTINGS

## project directory
## %pwd is an IPython magic that returns the current working directory;
## it is saved here so that later cells can %cd back to it.
project_dir    = %pwd
print(project_dir)

## import global project settings from the settings module
## NOTE(review): names used by later cells (scrub_dir, dedup_dir,
## metadata_file, ...) are not defined in this notebook -- presumably they
## come from this star import; confirm against the settings module.
from settings import *


## SCRUB TEXT

In [None]:
## SCRUB TEXT

# scrubs files from caches/text_files/
# and puts output in caches/text_files_clean/
# ... as per setting variables in scripts/scrub/config.py

# make sure both directories exist (-p: no error if already present)
!mkdir -p {scrub_dir}
!mkdir -p {text_files_clean_dir}

# run the script named by `scrub` from inside scrub_dir, then return to
# the project directory saved in project_dir
%cd  $scrub_dir
%run $scrub
%cd  $project_dir


In [None]:
## --------------
## DOCKERFILE ADD
## --------------
# ImportError: No module named ftfy
# Fix: add to Dockerfile:
#   pip install ftfy
#   pip2 install ftfy
#   pip3 install ftfy

## DE-DUPLICATE

In [None]:
## DE-DUPLICATE

## For help on script options:
## %run scripts/deduplicate/corpus_compare.py -h 

## delete previous results
!rm -f {dedup_dir}/{dedup_output}.csv
!rm -f {dedup_dir}/{dedup_output}.log
!rm -f {dedup_output}.log

!mkdir -p {text_files_clean_dir}
%run {dedup_dir}/{dedup} -i {text_files_clean_dir}/ -o {dedup_dir}/{dedup_name}.csv -l {dedup_dir}/{dedup_name}.log

## --------------
## FOR DockerFile
## --------------
## relies on sklearn
## need to pip install or pip2 install or conda install scikit-learn?


In [None]:
## MINOR BUG
## ---
## corpus_compare.py prints its output multiple times if it is re-run in the notebook.
## This has to do with setting the parser root handler -- if it is set, warnings replicate; if it is not set, there is no notebook output.
## Output appears as an error message in a notebook and interrupts batch flow.
## This also interrupts %run in a group notebook.
##
## ...fixed the double-notebook output by setting root logger to info and `logger.propagate = 0`
## Tried removing all logger handlers at end of script to no effect.
## However, .log text file is still written to twice, and notebook output is still pink/warning.
## Tried moving all logger console calls from .debug to .info -- notebook output still pink.

## in the future, could also try:
## %capture [--no-stderr] [--no-stdout] [--no-display] [output]

## DELETE DUPLICATES

In [None]:
## DELETE DUPLICATES
## Read the report <dedup_dir>/<dedup_name>.csv and delete every file whose
## path is listed in the sixth column (index 5) of each data row.
import os
import csv

## os.path.join gives the same path as before for a relative dedup_dir and
## still yields a valid path if dedup_dir is absolute
dedup_csv = os.path.join(project_dir, dedup_dir, dedup_name + '.csv')

## newline='' is how the csv module expects its files to be opened
with open(dedup_csv, 'r', newline='') as fin:
    cfin = csv.reader(fin)
    ## skip header; the default avoids StopIteration on an empty file
    next(cfin, None)
    for row in cfin:
        ## blank or short rows have no path column to act on
        if len(row) < 6:
            continue
        duplicate = row[5]
        if os.path.isfile(duplicate):
            print('Deleting: ' + duplicate)
            os.remove(duplicate)
        else:
            print('Missing:  ' + duplicate)

print('\n-----\nDuplicates deleted from:', dedup_dir + '/' + dedup_name + '.csv')


## RE-ORDER METADATA

In [None]:
## RE-ORDER METADATA
## Copy the metadata CSV to a new file with its columns in a new order.
import csv

csv_in  = metadata_file
csv_out = metadata_file_reorder

## re-order column names

## infieldnames provides names for the original column order
infieldnames = 'id', 'journaltitle', 'pubdate', 'title', 'pagerange', 'author', 'volume', 'issue'
## outfieldnames re-orders that name list into a new column order
outfieldnames = 'id', 'title', 'author', 'journaltitle', 'volume', 'issue', 'pubdate', 'pagerange'

## create reordered metadata file
## The output is opened with 'w' (not 'a') so that re-running this cell
## replaces the file instead of appending a second header and duplicate rows.
## newline='' is how the csv module expects its files to be opened.

with open(csv_in, 'r', newline='') as infile, open(csv_out, 'w', newline='') as outfile:
    ## input dict needs a list for column renaming
    reader = csv.DictReader(infile, fieldnames=infieldnames)
    ## skip outdated header row (no error if the input file is empty)
    next(reader, None)

    ## output dict needs a reordered list for new column ordering
    writer = csv.DictWriter(outfile, fieldnames=outfieldnames)
    ## write automatic header
    writer.writeheader()

    ## write each row to new file with remapped column order
    writer.writerows(reader)

print('\n-----\nReordered metadata to:', csv_out)

## NEXT

In [None]:
## NEXT
## Generate a link to the next notebook in the workflow

from IPython.display import display, HTML

## the <a> element is closed explicitly so the markup is well-formed and the
## link does not extend over whatever is rendered after it
browser_link_html = HTML('<p>The data in ~/caches/text_files_clean/ is clean.</p><h2><a href="3_make_topic_model.ipynb" target="_blank">Next: Make the topic model.</a></h2>')
display(browser_link_html)


----------