In [29]:
def read_column_structured(f):
    """Read tab-delimited, column-structured text into a dictionary of records.

    The first non-blank line is the header row of field names. Every later
    non-blank line is one record, turned into a dict that maps each header
    name to the field in the same column, and stored under the value of its
    'UT' column.

    Arguments:
    f -- An open text file handle, or any other iterable of text lines

    Returns:
    A dict mapping each record's 'UT' value to that record's dict. If two
    rows share a 'UT' value, the later row replaces the earlier one.

    Raises:
    ValueError -- the header row has no 'UT' column
    IndexError -- a data row has too few fields to reach the 'UT' column
    """
    unique_id = 'UT'

    corpus = {}
    headers = None
    id_index = None

    for line in f:
        # Strip both line-ending characters so that a CRLF-terminated line
        # does not leave a stray '\r' glued to the last field or header.
        line = line.rstrip('\r\n')

        # Blank lines (e.g. a trailing empty line at the end of an export)
        # carry no record and would otherwise fail the 'UT' lookup below.
        if not line.strip():
            continue

        fields = line.split('\t')

        if headers is None:
            if unique_id not in fields:
                raise ValueError(
                    "Header row has no '%s' column: %r" % (unique_id, fields))
            headers = fields
            id_index = fields.index(unique_id)
            continue

        article_id = fields[id_index]
        # zip() stops at the shorter of the two lists, so extra trailing
        # fields without a matching header are ignored.
        corpus[article_id] = dict(zip(headers, fields))

    return corpus

In [3]:
def read_row_structured(f):
    """Read row-structured (one tagged field per line) text into a dictionary.

    Each line starts with a two-character tag followed by that field's
    content. A line whose first two characters are spaces continues the
    field of the most recent tagged line. When a tag occurs more than once
    within a record, its contents are joined with "; ". A line tagged 'ER'
    closes the current record, which is stored under the content of its
    'UT' field.

    Arguments:
    f -- An open text file handle, or any other iterable of text lines

    Returns:
    A dict mapping each record's 'UT' content to a dict of tag -> content
    for that record (including the 'ER' tag itself, with empty content).
    Lines after the last 'ER' are not part of any stored record. A record
    that reaches 'ER' without a 'UT' field is dropped, because there is no
    key to store it under.
    """
    end_of_record = 'ER'
    unique_id = 'UT'

    corpus = {}
    record = {}
    article_id = None   # 'UT' content of the record being built, if seen
    lasttag = None      # tag that a continuation line should extend

    for line in f:
        line = line.rstrip()

        # Blank separator lines carry no field; without this guard they
        # would be stored under an empty-string tag.
        if not line:
            continue

        tag = line[0:2]
        # Slice the tag off by position. str.lstrip() takes a *set* of
        # characters, so lstrip(tag + " ") would also eat leading content
        # characters that happen to match the tag ("AU Adams" -> "dams").
        content = line[2:].strip()

        if tag == "  ":
            if lasttag is None:
                # Continuation line with nothing to continue; ignore it.
                continue
            tag = lasttag
        lasttag = tag

        if tag in record:
            record[tag] = record[tag] + "; " + content
        else:
            record[tag] = content

        if tag == unique_id:
            article_id = content

        if tag == end_of_record:
            if article_id is not None:
                corpus[article_id] = record
            # Start the next record from a clean slate so that an id or tag
            # from this record can never leak into the next one.
            record = {}
            article_id = None
            lasttag = None

    return corpus

In [38]:
import json
import os
from os import walk


def parse_files(inputfd, outputfile, parsedir=True, fileformat="column",
                encoding="utf-16"):
    """Parse a file, or every file directly inside a directory, into one JSON file.

    Each input file is read with the reader matching `fileformat`, the
    resulting records are merged into a single dictionary (a record from a
    later file replaces an earlier record with the same key), and the merged
    dictionary is written to `outputfile` as JSON. Progress is printed for
    every file.

    Keyword arguments:
    inputfd -- The file name or directory
    outputfile -- The desired output path
    parsedir -- Parse a directory, otherwise parse a file (default True).
                Only files directly inside the directory are read;
                subdirectories are not descended into.
    fileformat -- The file format. Supported formats include column (default) and row
    encoding -- Text encoding used to open the input files (default utf-16)

    Raises:
    ValueError -- `fileformat` is not a supported format. This is checked
                  before any file is opened or written.
    """
    supported_formats = ("column", "row")
    if fileformat not in supported_formats:
        raise ValueError(
            "Format not understood: %r (expected one of %s)"
            % (fileformat, ", ".join(supported_formats)))

    # Find all files in directory
    if parsedir:
        # Only the first entry yielded by walk() is used, i.e. the top-level
        # directory. walk() yields nothing for a path it cannot list, in
        # which case there are simply no files to parse.
        dirpath, _, filenames = next(walk(inputfd), (inputfd, [], []))
        # Sort so the merge order (and therefore which duplicate record wins)
        # does not depend on the order the file system lists the files in.
        allpaths = [os.path.join(dirpath, name) for name in sorted(filenames)]
    else:
        allpaths = [inputfd]

    if fileformat == "column":
        reader = read_column_structured
    else:
        reader = read_row_structured

    # Process each file
    corpus = {}
    for inputfile in allpaths:
        print("Opening file ", inputfile)

        with open(inputfile, 'r', encoding=encoding) as f:
            newcorpus = reader(f)

        corpus.update(newcorpus)
        print("The total number of records is now:", len(corpus))

    with open(outputfile, 'w') as g:
        json.dump(corpus, g)

In [145]:
# Exercise parse_files on every combination of directory / single-file input
# and column / row format. Each case is:
#   (banner to print, input path, output path, parse a directory?, file format)
cases = [
    # a full directory
    ("Trying a full directory",
     "Data/nano/articles/column",
     "Output/nano/nano_articles_column_JSON1.txt",
     True, "column"),
    # a single file
    ("Trying a single file",
     "Data/nano/articles/column/article_nano_column1.txt",
     "Output/nano/nano_articles_column_JSON2.txt",
     False, "column"),
    # a full directory of row formatted files
    ("Trying a full directory of row formatted files",
     "Data/nano/patents/row",
     "Output/nano/nano_patents_row_JSON1.txt",
     True, "row"),
    # a single row formatted file
    ("Trying a single row-formatted file",
     "Data/nano/patents/patent_nano_row1.txt",
     "Output/nano/nano_patents_row_JSON2.txt",
     False, "row"),
]

for banner, inputdir, outputfile, parsdir, fileformat in cases:
    print(banner)
    parse_files(inputdir, outputfile, parsdir, fileformat)
    print()



Trying a full directory
Opening file  Data/nano/articles/column/article_nano_column1.txt
Opening file  Data/nano/articles/column/article_nano_column2.txt
Opening file  Data/nano/articles/column/article_nano_column3.txt
Opening file  Data/nano/articles/column/article_nano_column4.txt
Opening file  Data/nano/articles/column/article_nano_column5.txt

Trying a single file
Opening file  Data/nano/articles/column/article_nano_column1.txt

Trying a full directory of row formatted files
Opening file  Data/nano/patents/row/patent_nano_row1.txt
Opening file  Data/nano/patents/row/patent_nano_row2.txt
Opening file  Data/nano/patents/row/patent_nano_row3.txt
Opening file  Data/nano/patents/row/patent_nano_row4.txt
Opening file  Data/nano/patents/row/patent_nano_row5.txt

Trying a single row-formatted file
Opening file  Data/nano/patents/patent_nano_row1.txt



In [43]:
# Merge every file in the full data set directory into a single JSON file,
# using the default settings (directory input, column format).
inputdir = "Data/nano/fullset"
outputfile = "Output/nano/full_nano_JSON.txt"

print("Trying a full directory")
parse_files(inputfd=inputdir, outputfile=outputfile)
print()

Trying a full directory
Opening file  Data/nano/fullset/nano1.txt
The total number of records is now: 500
Opening file  Data/nano/fullset/nano10.txt
The total number of records is now: 1000
Opening file  Data/nano/fullset/nano11.txt
The total number of records is now: 1500
Opening file  Data/nano/fullset/nano12.txt
The total number of records is now: 2000
Opening file  Data/nano/fullset/nano13.txt
The total number of records is now: 2500
Opening file  Data/nano/fullset/nano14.txt
The total number of records is now: 3000
Opening file  Data/nano/fullset/nano15.txt
The total number of records is now: 3500
Opening file  Data/nano/fullset/nano16.txt
The total number of records is now: 4000
Opening file  Data/nano/fullset/nano17.txt
The total number of records is now: 4500
Opening file  Data/nano/fullset/nano18.txt
The total number of records is now: 4946
Opening file  Data/nano/fullset/nano19.txt
The total number of records is now: 5446
Opening file  Data/nano/fullset/nano2.txt
The total nu

In [27]:
# Parse just one file of the full data set (default column format).
parsdir = False
inputdir = "Data/nano/fullset/nano1.txt"
outputfile = "Output/nano/nano_articles_column_JSON1.txt"

print("Trying a single file")
parse_files(inputdir, outputfile, parsedir=parsdir)
print()

Trying a single file
Opening file  Data/nano/fullset/nano1.txt
['PT', 'AU', 'BA', 'BE', 'GP', 'AF', 'BF', 'CA', 'TI', 'SO', 'SE', 'BS', 'LA', 'DT', 'CT', 'CY', 'CL', 'SP', 'HO', 'DE', 'ID', 'AB', 'C1', 'RP', 'EM', 'RI', 'OI', 'FU', 'FX', 'CR', 'NR', 'TC', 'Z9', 'PU', 'PI', 'PA', 'SN', 'EI', 'BN', 'J9', 'JI', 'PD', 'PY', 'VL', 'IS', 'PN', 'SU', 'SI', 'MA', 'BP', 'EP', 'AR', 'DI', 'D2', 'PG', 'WC', 'SC', 'GA', 'UT', 'PM']

