# Data Loading, Storage, and File Formats
Reading data and making it accessible - data loading.

## 6.1 Reading and Writing data in Text Format


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Use `names` to assign the column names explicitly
# df = pd.read_csv('./datasets/ex1.csv', names=['a', 'b', 'c', 'd', 'message'])

# header=None does not treat any row as a header and uses integers as column labels
# df = pd.read_csv('./datasets/ex1.csv', header=None)

# Select one of the columns as the index label
df=pd.read_csv('./datasets/ex1.csv', index_col="message")

# Hierarchical index: pass several columns to index_col
# df = pd.read_csv("./datasets/csv_mindex.csv", index_col=["key1", "key2"])

# If a table does not have a fixed delimiter (for example a txt file whose fields
# are separated by one or more whitespace characters), it can still be read with
# read_csv by passing a regular expression as the separator
# result = pd.read_csv("examples/ex3.txt", sep=r"\s+")

# If there are rows in the csv file that need to be skipped
# pd.read_csv("./datasets/ex1.csv", skiprows=[0, 2, 3])





In [None]:
# keep_default_na=False - don't use the default NA value list; only the values
# given in na_values are converted to NaN
#pd.read_csv("some-file", keep_default_na=False, na_values=["NA"])

# For the "message" column, "foo" and "na" will be converted to NaN; for the
# "something" column, "two" will be converted to NaN
sentinels = {"message": ["foo", "na"], "something":["two"]}
#pd.read_csv("some-example", na_values=sentinels, keep_default_na=False)



### Some pandas.read_csv function arguments
| Argument | Description |
|-|-| 
| path | String indicating file system location |
| sep or delimiter | character sequence used to split fields in each row |
| header | Row number to use as column names, default is 0 |
| index_col | Column numbers or names to use as the row index |
| names | list of column names for result |
| skiprows | number of rows at beginning of file to ignore or list of row numbers to skip |
| na_values | Sequence of values to replace with NA. |
| keep_default_na | whether to use the default NA value list or not |
| comment | characters to split comments off the end of lines |
| nrows | Number of rows to read from beginning of file |



In [None]:
# Accumulator for the (currently disabled) per-chunk aggregation below
tot = pd.Series([], dtype="int64")

# Passing chunksize makes read_csv return a reader that yields DataFrames of up
# to 1000 rows each. Using the reader as a context manager guarantees that the
# underlying file handle is closed even if the loop is interrupted.
with pd.read_csv("./datasets/ex1.csv", chunksize=1000) as chunker:
    for p in chunker:
        # Example aggregation across chunks (disabled):
        # tot = tot.add(p["a"].value_counts(), fill_value=0)
        print(p)

### Writing data to text format

In [None]:
# Build a 4x5 frame holding the integers 0..19 with labelled rows and columns
df = pd.DataFrame(
    np.arange(20).reshape(4, 5),
    columns=["u", "v", "w", "x", "y"],
    index=["a", "b", "c", "d"],
)

# Keep only the values below 5 (everything else becomes missing), then write
# the result with "|" as field separator and 'ehhhh' in place of missing values
below_five = df[df < 5]
below_five.to_csv("./datasets/out.csv", sep="|", na_rep='ehhhh')


# df.to_csv("./datasets/out.csv", na_rep='ehhhh')


### Working with Other Delimited Formats


In [None]:
import csv 


# newline="" lets the csv module handle line endings itself, which the csv
# documentation requires so that quoted fields with embedded newlines parse
# correctly
with open("./datasets/ex7.csv", newline="") as f:
    lines = list(csv.reader(f))

# The first row holds the column names, the remaining rows hold the values
header, values = lines[0], lines[1:]

# Create a dictionary of data columns: zip(*values) transposes the rows into
# columns, and each column is paired with its header name
data_dict = dict(zip(header, zip(*values)))

In [None]:
# Display the mapping of header name to column values
data_dict

In [None]:
lst = [("a", 1), ("b", 2)]

# zip(*lst) "unzips" the list of pairs into one tuple per position
unzipped = list(zip(*lst))

# Zipping the two iterables together again gives back the list of pairs
rezipped = list(zip(("a", "b"), (1, 2)))

# Only the last expression of a cell is displayed, so show both results together
unzipped, rezipped

In [None]:
class my_dialect(csv.Dialect):
    """CSV dialect using ";" as the delimiter and '"' as the quote character."""

    # Field separator and the character used to wrap fields
    delimiter = ";"
    quotechar = '"'
    # Quoting mode: csv.QUOTE_ALL
    quoting = csv.QUOTE_ALL
    # Rows are terminated by a bare newline
    lineterminator = "\n"

# Read the file with the custom dialect. The context manager closes the file
# handle once the rows have been read, and newline="" is how the csv module
# expects its input files to be opened.
with open("./datasets/ex7.csv", newline="") as f:
    reader = csv.reader(f, dialect=my_dialect)
    ex7_rows = list(reader)

# Display the parsed rows
ex7_rows

In [None]:
# Write a delimited file manually. newline="" stops Python from translating the
# dialect's line terminator, so the output is the same on every platform.
with open("mydata.csv", "w", newline="") as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(("one", "two", "three"))
    writer.writerow(("1", "2", 3))

### JSON Data


In [None]:
import json

obj = """
{"name":"Wes", "pet":null }
"""

# Convert the JSON string to a Python dictionary. The value is printed because
# only the last expression of a cell is displayed automatically.
result = json.loads(obj)
print(result)

# Convert the Python object back to a JSON string
asjson = json.dumps(result)
print(asjson)

# Serialize the DataFrame to JSON and write it to standard output.
# sys has to be imported for sys.stdout to be available.
import sys

df.to_json(sys.stdout)

### XML and HTML: Web Scraping

`pandas.read_html` by default searches for and attempts to parse all tabular data contained within `<table>` tags.
The result is a list of DataFrame objects.

#### Parsing XML with lxml.objectify


In [None]:
from lxml import objectify

path = "datasets/mta_perf/Performance_MNR.xml"

# Parse the XML file with lxml.objectify
with open(path) as f:
    parsed = objectify.parse(f)

# Get the root element of the data set
root = parsed.getroot()

root.tag

# Tags that are left out of the resulting records
skip_fields = ["PARENT_SEQ", "INDICATOR_SEQ"]

# Build one dictionary (tag name -> data value) per INDICATOR element
data = []
for elt in root.INDICATOR:
    el_data = {
        child.tag: child.pyval
        for child in elt.getchildren()
        if child.tag not in skip_fields
    }
    data.append(el_data)

# Convert the list of records to a DataFrame
perf = pd.DataFrame(data)

# pandas also offers read_xml to load the file in a single call
pd.read_xml("./datasets/mta_perf/Performance_MNR.xml")


## 6.2 Binary Data Formats
Store / serialize data in binary format by using Python's pickle module

pandas objects all have a `to_pickle` method that writes the data to disk in pickle format.

### Using HDF5 Format
HDF5 is intended to store large quantities of scientific array data. 
HDF - Hierarchical Data Format

HDF5 is NOT a database, it is best suited for write-once, read-many datasets.
 

In [None]:
# One column "a" holding 100 draws from np.random.standard_normal
samples = np.random.standard_normal(100)
frame = pd.DataFrame({"a": samples})

# Open the HDF5 store
store = pd.HDFStore("datasets/mydata.h5")

# Store the whole frame and, under a separate key, its "a" column
store.put("obj1", frame)
store.put("obj1_col", frame["a"])

# Read the stored frame back
store['obj1']

In [None]:

# HDFStore supports "fixed" and "table" storage schemas;
# here the frame is stored under "obj2" using the "table" format

store.put("obj2", frame, format="table")

In [None]:

# Query "obj2" for the rows whose index is between 10 and 15 (inclusive)
store.select("obj2", where=["index >= 10 and index <= 15"])

In [None]:
# Close the HDF5 store
store.close()

## 6.3 Interacting with Web APIs
 

In [None]:
import requests
# Call the GitHub API over HTTPS rather than plain HTTP
url = "https://api.github.com/repos/pandas-dev/pandas/issues"

# requests applies no timeout by default, so pass one explicitly to keep the
# cell from hanging indefinitely on a stalled connection
resp = requests.get(url, timeout=10)

# Raise an exception if the server answered with an error status
resp.raise_for_status()

In [None]:
# Decode the JSON body of the response
data = resp.json()

# Title of the first returned item
first_issue = data[0]
first_issue["title"]

# Build a DataFrame restricted to the fields of interest
issue_columns = ["number", "title", "labels", "state"]
issues = pd.DataFrame(data, columns=issue_columns)

## 6.4 Interacting with Databases


In [None]:
# Create a SQLite3 data base 

import sqlite3

# IF NOT EXISTS keeps this cell re-runnable: mydata.sqlite persists on disk, so
# a plain CREATE TABLE fails with "table test already exists" on a second run
query = """
CREATE TABLE IF NOT EXISTS test
(a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER)
"""

# Open (and create, if missing) the SQLite database file
con = sqlite3.connect('mydata.sqlite')

con.execute(query)

In [None]:
data = [
    ("Atlana", "Georgia", 1.25, 6),
    ("Tallahassee", "Florida", 2.6, 3),
]

stmt = "INSERT INTO test VALUES(?,?,?,?)"

# Using the connection as a context manager commits the inserts on success and
# rolls them back on error. Without a commit the rows stay in an open
# transaction and are not visible to other connections to mydata.sqlite.
with con:
    con.executemany(stmt, data)

cursor = con.execute("SELECT * FROM test")

In [None]:
# Fetch all rows of the SELECT result from the cursor
rows = cursor.fetchall()

In [None]:
# Display the rows fetched from the cursor
rows

In [None]:
# Show the column metadata of the query executed on the cursor
cursor.description

In [None]:
import sqlalchemy as sqla

# Create a SQLAlchemy engine pointing at the mydata.sqlite file
db = sqla.create_engine("sqlite:///mydata.sqlite")

In [None]:
# Ask the engine's dialect for the connect arguments it derives from the engine URL
# NOTE(review): args and kwargs are not used by any visible cell - confirm they are needed
args, kwargs = db.dialect.create_connect_args(db.url)


In [None]:
# Run the query through the SQLAlchemy engine with pandas.read_sql
pd.read_sql("SELECT * FROM test", db)