In [3]:
import numpy   as np
import pandas  as pd

import warnings
warnings.filterwarnings('ignore')

# For no-print display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"  #default 'last_expr'

## Highlights
<a href='#e1'>Handle JSON files from the Corona Virus Open Research Dataset</a>

# <a id='0'>Data Formats</a>
-- How to READ various formats of Data into a Pandas DataFrame (DF)? <br>
-- How to WRITE the DataFrame into a specific data format?
- <a href='#1'>Text Files</a>    
    - <a href='#11'> CSV
    - <a href='#12'> JSON, XML, HTML
- <a href='#2'>Binary Files</a>  
    - Pickle, HDF5, Excel    
- <a href='#3'>Using Web APIs</a>
- <a href='#4'>From Databases</a>
    
Note: this notebook is created based on Chapter 6 of Wes McKinney's "Python for Data Analysis" 2nd edition. I simplified and clarified some of the examples that I found difficult when I learned them, and hope this could help others.<br>

## <a id='1'> Text Files

### <a id='11'> CSV
Common problems :
- a dataset with no headers
- irregular delimiters
- irrelevant comment lines
- reading a BIG file in chunks

#### Name the columns of a dataset that has no header

In [5]:
filename = 'examples/ex2.csv'
!cat $filename
labels = ['a', 'b', 'c', 'd', 'message']

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [6]:
# The file has no header row, so every line is data and the column
# names come from `labels`.
pd.read_csv(
    filename,
    names=labels,
    header=None,
)

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


#### Read multi-space separated text using regular expression

In [7]:
# Text file whose fields are separated by a varying number of spaces;
# print its raw contents before parsing.
filename = 'examples/ex3.txt'
!cat $filename

            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491


In [8]:
# Fields are separated by a variable amount of whitespace, so split on the
# regular expression \s+. A raw string keeps the backslash literal instead of
# relying on an invalid string escape sequence.
pd.read_csv(filename, sep=r'\s+')

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


#### Skip comment lines in the file

In [9]:
# CSV whose header and data rows are interleaved with '#' comment lines;
# print it raw first.
filename = 'examples/ex4.csv'
!cat $filename

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [10]:
# Lines 0, 2 and 3 of the file (0-based) are the '#' comment lines shown above;
# skip them by position so that line 1 becomes the header.
comment_rows = [0, 2, 3]
pd.read_csv(filename, skiprows=comment_rows)

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


#### Read Large Text Files in Chunks (Big Data)

In [17]:
filename = 'examples/ex6.csv'
# Peek at the file without loading all of it: nrows=3 limits the read
# to the first three data rows.
pd.read_csv(filename, nrows=3)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G


##### Count the occurrences of each category in column 'key', but read only one chunk of data at a time

In [19]:
# Running tally of how often each 'key' value occurs across all chunks.
# Give the empty Series an explicit float dtype: the default dtype of an
# empty Series is ambiguous and differs between pandas versions.
key_counts = pd.Series(dtype='float64')

# chunksize=1000 makes read_csv return an iterator that yields DataFrames of
# up to 1000 rows, so the whole file is never held in memory at once.
generator_chunks = pd.read_csv(filename, chunksize=1000)
for chunk in generator_chunks:
    # fill_value=0 counts a key missing from either side as zero, which
    # avoids NA values in the running total.
    key_counts = key_counts.add(chunk['key'].value_counts(), fill_value=0)

# Five most frequent keys.
key_counts.sort_values(ascending=False).head(5)

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
dtype: float64

#### Write Data to CSV Format

In [20]:
# Load a small sample frame (it contains missing values, see the output)
# that the next cell writes back out with to_csv.
df = pd.read_csv('examples/ex5.csv')
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [22]:
file = 'examples/out.csv'
# index=False leaves the row index out of the file; header=True writes the column names.
df.to_csv(file, index=False, header=True)
# Show the written file to confirm its layout.
!cat $file

something,a,b,c,d,message
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


### <a id='12'> JSON, HTML, XML
More flexible than CSV, supporting hierarchical, nested data.
- <a href='#0'> Back to TOC

#### JSON 
##### Read JSON (into a dataframe)
Two typical formats: list-type (a JSON array of records) and line-type (one record per line); in both, each record becomes a single row of data.

In [1]:
!cat examples/example.json

[{"a": 1, "b": 2, "c": 3},
 {"a": 4, "b": 5, "c": 6},
 {"a": 7, "b": 8, "c": 9}]


In [7]:
# List-type JSON (an array of objects): each object becomes one row.
pd.read_json('examples/example.json')

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [8]:
!cat examples/example1.json  # no commas, no brackets, one dict per line

{"a": 1, "b": 2, "c": 3}
{"a": 4, "b": 5, "c": 6}
{"a": 7, "b": 8, "c": "a" }


In [9]:
# Line-type JSON: lines=True makes read_json parse one JSON object per line.
pd.read_json('examples/example1.json', lines=True)

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,a


#### <a id='e1'>Example: Handle JSON files from Corona Virus Open Research Dataset
- **Background** <br>
COVID-19 Open Research Dataset is prepared by a coalition of leading research groups assisted by the White House, in response to the Pandemic in 2020. There are over 29,000 full-text research articles, saved in JSON files. Each file contains a single article in a dictionary format. The first challenge is to import these texts into a dataframe, ready for further text mining.
    
    
<a href='#0'> Back to TOC

In [12]:
!pwd
!ls examples/covid19/

/Users/William/Downloads/Backup/3-21-2019/ML/ML_ipynb/Py_Data_Analysis-2nd-edition
[1m[31m0a32446730827ad8152c6a61e4738e4e0b231412.json[m[m
[1m[31m0acc1f9a1c333a9a6b2dbba4a252d7576f024783.json[m[m
[1m[31m0d7964c665ff0107c674bc6dab8f252cab08038e.json[m[m
[1m[31m0da2ec30d7dfdef624833a36890f0297f19d09ec.json[m[m
[1m[31m0e2ba405f636e06821999876cd82c00e26404b43.json[m[m


In [13]:
import os
# os.listdir returns entries in arbitrary order; sort them so that
# filenames[0] refers to the same article on every run and platform.
filenames = sorted(os.listdir('./examples/covid19'))
filenames

['0a32446730827ad8152c6a61e4738e4e0b231412.json',
 '0acc1f9a1c333a9a6b2dbba4a252d7576f024783.json',
 '0d7964c665ff0107c674bc6dab8f252cab08038e.json',
 '0da2ec30d7dfdef624833a36890f0297f19d09ec.json',
 '0e2ba405f636e06821999876cd82c00e26404b43.json']

In [19]:
!head -10 examples/covid19/0a32446730827ad8152c6a61e4738e4e0b231412.json

{
    "paper_id": "0a32446730827ad8152c6a61e4738e4e0b231412",
    "metadata": {
        "title": "Artesunate interacts with Vitamin D receptor to reverse mouse model of sepsis-induced immunosuppression via enhancing autophagy Short running title: Artesunate reverses sepsis induced immunosuppression",
        "authors": [
            {
                "first": "Shenglan",
                "middle": [],
                "last": "Shang",
                "suffix": "",


We can't use `read_json()` here: each file holds a single nested record that spans multiple lines, whereas `read_json(..., lines=True)` expects each record to sit on a single line. We parse the files with the `json` module instead.

In [25]:
import json
# Parse the first article into a plain Python dict. The context manager closes
# the file handle once parsing is done, and the explicit UTF-8 encoding avoids
# depending on the platform's default text encoding.
with open('./examples/covid19/' + filenames[0], encoding='utf-8') as fh:
    j = json.load(fh)
j.keys()

dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])

In [30]:
# Inspect which fields the nested 'metadata' dict provides.
j['metadata'].keys()

dict_keys(['title', 'authors'])

Our next goal is to gather the following text sections together — 'title', 'abstract', and 'body_text' — for further analysis.

In [36]:
# First attempt: read the title and the first abstract paragraph of every article.
# This raises IndexError (see the output below) for an article whose
# 'abstract' list is empty.
for file in os.listdir('./examples/covid19/'):
    file_path = './examples/covid19/' + file
    # Close each file as soon as it has been parsed instead of leaking the handle.
    with open(file_path, encoding='utf-8') as fh:
        j = json.load(fh)

    title = j['metadata']['title']
    abstract = j['abstract'][0]['text']

IndexError: list index out of range

Handle the exception when abstract is empty.

In [38]:
# Collect [title, abstract, fulltext] for every article, then build one DataFrame.
docs = []
# Sort the listing: os.listdir order is arbitrary, and a stable order keeps
# the resulting row order reproducible.
for file in sorted(os.listdir('./examples/covid19/')):
    file_path = './examples/covid19/' + file
    # The context manager closes each file right after it is parsed.
    with open(file_path, encoding='utf-8') as fh:
        j = json.load(fh)

    title = j['metadata']['title']
    try:
        abstract = j['abstract'][0]['text']
    except IndexError:
        # The 'abstract' list is empty for some articles.
        abstract = ""

    # body_text is a list of entries, each carrying its text under 'text';
    # join them in one pass, every entry followed by a blank line.
    fulltext = ''.join(text['text'] + '\n\n' for text in j['body_text'])
    docs.append([title, abstract, fulltext])
df = pd.DataFrame(docs, columns=['title', 'abstract','fulltext'])
df.head()

Unnamed: 0,title,abstract,fulltext
0,Artesunate interacts with Vitamin D receptor t...,,Sepsis is a leading cause of death worldwide (...
1,SKEMPI 2.0: An updated benchmark of changes in...,Motivation: Understanding the relationship bet...,Protein-protein interactions are central to al...
2,A transmissible RNA pathway in honey bees,"Honey bees are eusocial insects, living in a c...","In eukaryotes, sequence-specific gene silencin..."
3,In trans variant calling reveals enrichment fo...,Compound heterozygotes occur when different mu...,Using the premise that effective variants are ...
4,Identification of a Nidovirales Orf1a N7-guani...,Members of the Nidovirales order have (+)RNA g...,"regarding their RNA capping pathway, it only s..."


##### Write JSON
Two typical formats: 
- by-row: a list of dictionaries.
- by-column: a dictionary of lists.

In [20]:
# Reload the small list-type example so there is a frame to write back out.
df = pd.read_json('examples/example.json')

file = 'examples/test.json'
# orient='records' writes one dict per row.
df.to_json(file, orient='records') # row format [{row1},     {row2},    ...], smaller in size
!cat $file
print()  # line break between the two dumps
# With no orient given, the frame is written column by column, each column
# mapping index label -> value (see the second line of output).
df.to_json(file)                  # col format  {col1:{idx:val, ..}, col2:{idx:val, ..}, ...}
!cat $file

[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]
{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}

#### HTML
Extract all the tables from an HTML file or a URL

In [8]:
# read_html returns every table it finds in the document as a list of DataFrames.
tables = pd.read_html('examples/fdic_failed_bank_list.html') # to a list of DFs
type(tables), len(tables)

# This document holds a single table (length 1 above), so take the first entry.
df = tables[0]
df.head(3)

(list, 1)

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Allied Bank,Mulberry,AR,91,Today's Bank,"September 23, 2016","November 17, 2016"
1,The Woodbury Banking Company,Woodbury,GA,11297,United Bank,"August 19, 2016","November 17, 2016"
2,First CornerStone Bank,King of Prussia,PA,35312,First-Citizens Bank & Trust Company,"May 6, 2016","September 6, 2016"


In [9]:
url  = 'https://www.w3schools.com/tags/tag_table.asp'
tables = pd.read_html(url, header=0)  # need to specify the first line as the header
# NOTE(review): the table is picked by position, which depends on the live
# page's current layout — confirm index 1 is still the attribute table.
df = tables[1]
df.head(3)

Unnamed: 0,Attribute,Value,Description
0,align,left center right,Not supported in HTML5. Specifies the alignme...
1,bgcolor,"rgb(x,x,x) #xxxxxx colorname",Not supported in HTML5. Specifies the backgro...
2,border,10,Not supported in HTML5. Specifies whether or ...


#### XML: ft. lxml

In [12]:
from lxml import objectify

file = 'examples/Performance_MNR.xml'
# Open in binary mode so the XML parser does the decoding itself, and use a
# context manager so the file handle is closed once the tree is built.
with open(file, 'rb') as fh:
    root = objectify.parse(fh).getroot()

# Each INDICATOR element holds one row of data; its child elements are the
# columns (tag -> Python value).
data = [
    {col.tag: col.pyval for col in row.getchildren()}
    for row in root.INDICATOR
]
df = pd.DataFrame(data)
df.head(3)

Unnamed: 0,AGENCY_NAME,CATEGORY,DECIMAL_PLACES,DESCRIPTION,DESIRED_CHANGE,FREQUENCY,INDICATOR_NAME,INDICATOR_SEQ,INDICATOR_UNIT,MONTHLY_ACTUAL,MONTHLY_TARGET,PARENT_SEQ,PERIOD_MONTH,PERIOD_YEAR,YTD_ACTUAL,YTD_TARGET
0,Metro-North Railroad,Service Indicators,1,Percent of commuter trains that arrive at thei...,U,M,On-Time Performance (West of Hudson),28445,%,96.9,95,,1,2008,96.9,95
1,Metro-North Railroad,Service Indicators,1,Percent of commuter trains that arrive at thei...,U,M,On-Time Performance (West of Hudson),28445,%,95.0,95,,2,2008,96.0,95
2,Metro-North Railroad,Service Indicators,1,Percent of commuter trains that arrive at thei...,U,M,On-Time Performance (West of Hudson),28445,%,96.9,95,,3,2008,96.3,95


## <a id='2'> Binary Files
- <a href='#0'> Back to TOC

### Pickle Format
Not limited to dataframes, but any Pandas objects

In [14]:
## pickle: short-term storage, error-prone if pickle versions differ
df = pd.read_csv('examples/ex1.csv')

# Round-trip: serialise the frame to disk, then load it straight back.
# NOTE: only unpickle files you trust — loading a pickle can execute arbitrary code.
df.to_pickle('examples/frame_pickle')
pd.read_pickle('examples/frame_pickle')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### HDF5 Format
HDF: hierarchical data format <br>
Q. How to choose: HDF vs Text files vs Databases?<br>
A. Text files can be large and slow to read. Databases are good for frequent writing but not for frequent reading. Therefore, when you deal with large data (roughly memory-sized) and need to read part or all of it frequently, HDF5 may satisfy your needs.

In [16]:
# Fix the seed so the sample frame is identical on every run.
np.random.seed(42)
# Draw order matters for reproducibility: normal samples first, then uniform.
normal_draws = np.random.randn(100)
uniform_draws = np.random.rand(100)
df = pd.DataFrame({'a': normal_draws, 'b': uniform_draws})
df.head(3)

Unnamed: 0,a,b
0,0.496714,0.417411
1,-0.138264,0.222108
2,0.647689,0.119865


#### Task: save df into a HDF file 'mydata.h5', with name 'obj1', and in a format that supports query

In [17]:
# Store df under key 'obj1'. format='table' is the layout that supports
# querying; data_columns=True makes the columns usable in a `where` clause.
df.to_hdf('mydata.h5',   key='obj1', format='table', data_columns=True)

# query: read back only the rows that satisfy the `where` expression
pd.read_hdf('mydata.h5', key='obj1', where="index<3 & a>0.5") # Note: only viable if data_columns=True above

Unnamed: 0,a,b
2,0.647689,0.119865


#### Reopen the store, and check the contents

In [56]:
# Open the store read-only (it is only inspected here) inside a context
# manager, so it is closed even if listing the keys fails.
with pd.HDFStore('mydata.h5', mode='r') as h5_store:
    store_keys = h5_store.keys()
# Display the stored object names, e.g. ['/obj1'].
store_keys

['/obj1']

In [48]:
import os
# Clean up the demo HDF5 file. The existence check lets the cell be re-run
# (or run out of order) without raising FileNotFoundError.
if os.path.exists('mydata.h5'):
    os.remove('mydata.h5')

### Excel Format

In [57]:
# read
file = 'examples/ex1.xlsx'
df = pd.read_excel(file)

# write
# `file` is rebound to the output path here; later cells that use `file`
# therefore operate on ex2.xlsx, not on the input workbook.
file = 'examples/ex2.xlsx'
df.to_excel(file)

#### Write multiple sheets into the same file

In [None]:
# Writing several sheets to one workbook requires a shared ExcelWriter object.
df1 = df.copy()

# Default mode 'w' creates (or overwrites) the workbook.
with pd.ExcelWriter(file) as fh:
    df.to_excel(fh, sheet_name='Sheet1')
    df1.to_excel(fh, sheet_name='Sheet2')

# Append to an existing file (only available for pandas 0.24 and above).
# Append mode is only supported by the openpyxl engine, so request it
# explicitly instead of relying on whichever default engine is installed.
df2 = df.copy()
with pd.ExcelWriter(file, mode='a', engine='openpyxl') as fh:
    df2.to_excel(fh, sheet_name='Sheet3')

In [64]:
import os
# Clean up the workbook written above. The existence check keeps the cell
# re-runnable when the file has already been removed.
if os.path.exists(file):
    os.remove(file)

### <a id='3'> Using Web APIs
HTTP requests and JSON parsing
- <a href='#0'> Back to TOC

#### Task: retrieve the last 30 issues of Pandas on Github

In [60]:
import requests
# Using Github API's HTTP method: api.github.com
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'

# Interact with the url's API and get a response object that contains the data feed.
# per_page=30 states the page size the task asks for; the timeout stops a
# stalled connection from hanging the notebook.
resp = requests.get(url, params={'per_page': 30}, timeout=10)
# Fail loudly on an HTTP error (e.g. rate limiting) instead of trying to
# parse an error payload as a list of issues.
resp.raise_for_status()
resp

# Parse the JSON content of the response into a list of dicts (one per issue).
data = resp.json()
data[0].keys()

# Keep only the fields of interest and turn them into a DataFrame.
issues = pd.DataFrame(data, columns=['number', 'title', 'labels', 'state'])
issues.head()

<Response [200]>

list

dict_keys(['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'pull_request', 'body'])

Unnamed: 0,number,title,labels,state
0,32538,ENH: IntegerArray.astype(dt64),[],open
1,32537,CLN: avoid values_from_object in reshape.merge,[],open
2,32536,TST: separate out pd.crosstab tests from test_...,[],open
3,32535,BUG: retain tz in to_records,[],open
4,32534,CLN: remove unused in pd._testing,[],open


### <a id='4'> Retrieve data from a Database
- <a href='#0'> Back to TOC

#### SQLite3: create a database and insert data

In [18]:
import os
import sqlite3

db_path = 'mydata.sqlite'

# Start from a clean slate so the cell can be re-run: CREATE TABLE would fail
# if the table already existed. Pure Python keeps this portable across platforms.
if os.path.exists(db_path):
    os.remove(db_path)

query = """
CREATE TABLE test(
    a VARCHAR(20),
    b VARCHAR(20),
    c REAL,
    d INTEGER
);
"""

# list of tuples as data rows
data = [('Atlanta',     'Georgia',    1.25, 6),
        ('Tallahassee', 'Florida',    2.6,  3),
        ('Sacramento',  'California', 1.7,  5)]
# '?' placeholders let sqlite3 bind the values safely.
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"

con = sqlite3.connect(db_path)
try:
    # Used as a context manager, the connection commits on success and rolls
    # back on error; it does NOT close the connection, hence the finally.
    with con:
        con.execute(query)
        con.executemany(stmt, data)
finally:
    con.close()

#### ft. SQLAlchemy: access db
SQLAlchemy makes it look too easy compared to SQLite3's API!

In [19]:
import sqlalchemy as sqla

query = 'select * from test'

# SQLite3: run the query by hand, then take the column names from the cursor
# description. The connection is closed once the rows have been fetched.
con = sqlite3.connect('mydata.sqlite')
try:
    cursor = con.execute(query)
    rows = cursor.fetchall()
    col_names = [x[0] for x in cursor.description]
finally:
    con.close()
pd.DataFrame(rows, columns=col_names) # query results to DataFrame

# SQLAlchemy: much easier to use than SQLite3 — pandas runs the query and
# builds the DataFrame in one call. A separate name avoids rebinding `con`
# to a different kind of object.
engine = sqla.create_engine('sqlite:///mydata.sqlite')
pd.read_sql(query, engine)

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


In [62]:
!pwd
!ls mydata.sqlite
!rm mydata.sqlite

/Users/William/Downloads/Backup/3-21-2019/ML/ML_ipynb/Py_Data_Analysis-2nd-edition
mydata.sqlite
