In [None]:
## INFO

__author__    = 'Jeremy Douglass'
__copyright__ = 'copyright 2017, The WE1S Project'
__license__   = 'GPL'
__version__   = '0.6'
__email__     = 'jeremydouglass@gmail.com'


In [None]:
## IMPORT

import csv
import glob
import os
import shutil


In [None]:
## Choose the data folder and the zip archive(s) to import from it.
## Each entry of datafile_list is a path relative to jsondatadir.

jsondatadir = '/home/jovyan/work/write/data/'
datafile_list = [
    '2017-01-humanities/team1_humanities_month.zip',
    '158208_koreaherald_humanities.zip',
]

In [None]:
## SETTINGS

## project directory: the notebook's current working directory
project_dir = os.getcwd()
print(project_dir)

## pull every global project setting from settings.py into this namespace
## NOTE(review): a star import can silently rebind names assigned above
## (including project_dir) if settings.py defines them -- confirm it does not.
from settings import *

## RUN

In [None]:
for datafile in datafile_list:
    datapath = jsondatadir + datafile
    !mkdir -p caches/json
    !unzip -j -o -u "{datapath}" -d caches/json

In [None]:
## DEFINE METADATA STRINGCLEANER

import string
import unidecode

def string_cleaner(unistr):
    """Return an unaccented copy of unistr holding printable characters only.

    The text is first passed through unidecode.unidecode; every character of
    that result which is not a member of string.printable is then discarded.
    """
    unaccented_text = unidecode.unidecode(unistr)
    kept_chars = [ch for ch in unaccented_text if ch in string.printable]
    return ''.join(kept_chars)

In [None]:
## CREATE METADATA FROM JSON FILES

import json

## Delete old metadata files
!rm -fr {metadata_dir}
!mkdir -p {metadata_dir}

json_directory = 'caches/json/'

## id, publication, pubdate, title, articlebody, author, docUrl, wordcount

## idx       ->  id
## pub       ->  publication
## pub_date  ->  pubdate
## title     ->  title
## content   ->  articlebody

##           ->  author
##           ->  docUrl
## length    ->  wordcount

csv.field_size_limit(100000000)

with open('caches/json-metadata.csv', 'w') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',')
    csvwriter.writerow(['id'] + ['publication'] + ['pubdate'] + ['title'] + ['articlebody'] + ['author'] + ['docUrl'] + ['wordcount'])
    for idx, filename in enumerate(os.listdir(json_directory)):
        if filename.endswith(".json"):
            print(filename)
            with open(os.path.join(json_directory, filename)) as f:
                j = json.loads(f.read())
                print(idx)
                print(j['pub'])
                csvwriter.writerow([idx] + [j['pub']] + [j['pub_date']] + [j['title']] + [string_cleaner(j['content'])])

                metadata_csv_files = ['caches/json-metadata.csv']

metadata_csv_files = ['caches/json-metadata.csv']

In [None]:
## Copy metadata file list
metadata_out = project_dir + '/' + metadata_dir + '/'
for f in metadata_csv_files:
    shutil.copy(f, metadata_out)
!ls -1 {metadata_out}

In [None]:
## CHECK METADATA
## Visual sanity check: prints a banner and the interpolated value of
## metadata_dir, then lists that directory one entry per line (ls -1).
## metadata_dir is not assigned in the cells visible here; presumably it is
## provided by the star import of settings -- verify there.

!echo CHECK METADATA
!echo
!echo {metadata_dir} :
!echo
!ls -1 {metadata_dir}

In [None]:
## MERGE METADATA

## Delete old merged metadata
print(metadata_dir, ': \n')

!rm -f {metadata_file}

with open(metadata_file, 'w') as fout:
    # copy header from first file
    headerfile = open(metadata_csv_files[0])
    fout.write(headerfile.readline())
    ## copy bodies
    wout = csv.writer(fout)
    for filename in metadata_csv_files:
        print('Processing', filename)
        with open(filename, 'r') as fin:
            win = csv.reader(fin)
            next(win) # skip header
            wout.writerows(win)


In [None]:
## CHECK MERGE
## Visual sanity check: prints a banner and the interpolated value of
## metadata_file, then shows the first two lines of that file (head -n 2).
## metadata_file is not assigned in the cells visible here; presumably it is
## provided by the star import of settings -- verify there.

!echo CHECK MERGE
!echo
!echo {metadata_file} :
!echo
!head -n 2 {metadata_file}

In [None]:
## COPY TEXT
## Replaced by the metadata csv export
# for file in glob.glob(text_file_directory+r'/*.txt'):
#    shutil.copy(file, project_directory+'/text_files/')

In [None]:
## EXPORT ARTICLE BODIES TO TEXT FILES

## Source CSV fields:
## id, publication, pubdate, title, articlebody, author, docUrl, wordcount

## Delete old text files
!rm -fr {text_files_dir}
!mkdir -p {text_files_dir}

## Export
with open(metadata_file, 'r') as infile:
    reader = csv.DictReader(infile)
    ## skip header row
    # next(reader, None)
    for row in reader:
        with open(project_dir+'/' + text_files_dir + '/'+ row['id'] + '_.txt', 'w') as outfile:
            # writer = csv.DictWriter(outfile, fieldnames=outfieldnames)
            outfile.write(row['articlebody'])

In [None]:
## CHECK TEXT FILES
## Visual sanity check: prints a banner and the interpolated value of
## text_files_dir, then lists that directory one entry per line (ls -1).
## text_files_dir is not assigned in the cells visible here; presumably it is
## provided by the star import of settings -- verify there.

!echo CHECK TEXT FILES
!echo
!echo {text_files_dir} :
!echo
!ls -1 {text_files_dir}

## NEXT

In [None]:
## NEXT
## Generate a link to the next notebook in the workflow

from IPython.display import display, HTML

## The <a> element is closed before </h2> so the rendered markup is well formed
browser_link_html = HTML('<p>The data is imported into ~/metadata/ and ~/text_files/.</p><h2><a href="2_clean_data.ipynb" target="_blank">Next: Clean Data.</a></h2>')
display(browser_link_html)


----------