In [5]:
import numpy as np
import pandas as pd
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc("figure", figsize=(10, 6))
pd.options.display.max_colwidth = 75 #[yw]col width not exceeding 75chars
pd.options.display.max_columns = 20
np.set_printoptions(precision=4, suppress=True)

## 6.1 Reading and Writing Data in Text Format

Table 6.1: Text and binary data loading functions in pandas

Function|	Description
|:--------------|:--------------------------------------------------|
read_csv|	Load delimited data from a file, URL, or file-like object; use comma as default delimiter
read_fwf|	Read data in fixed-width column format (i.e., no delimiters)
read_clipboard	|Variation of read_csv that reads data from the clipboard; useful for converting tables from web pages
read_excel|	Read tabular data from an Excel XLS or XLSX file
read_hdf|	Read HDF5 files written by pandas
read_html|	Read all tables found in the given HTML document
read_json	|Read data from a JSON (JavaScript Object Notation) string representation, file, URL, or file-like object
read_feather|	Read the Feather binary file format
read_orc|	Read the Apache ORC binary file format
read_parquet	|Read the Apache Parquet binary file format
read_pickle	|Read an object stored by pandas using the Python pickle format
read_sas|	Read a SAS dataset stored in one of the SAS system's custom storage formats
read_spss|	Read a data file created by SPSS
read_sql|	Read the results of a SQL query (using SQLAlchemy)
read_sql_table	|Read a whole SQL table (using SQLAlchemy); equivalent to using a query that selects everything in that table using read_sql
read_stata|	Read a dataset from Stata file format
read_xml|	Read a table of data from an XML file

Indexing

Can treat one or more columns as the returned DataFrame, and whether to get column names from the file, arguments you provide, or not at all.

Type inference and data conversion

Includes the user-defined value conversions and custom list of missing value markers.

Date and time parsing

Includes a combining capability, including combining date and time information spread over multiple columns into a single column in the result.

Iterating

Support for iterating over chunks of very large files.

Unclean data issues

Includes skipping rows or a footer, comments, or other minor things like numeric data with thousands separated by commas.

In [3]:
#!cat examples/ex1.csv # Use type on Windows
!type examples\ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [6]:
df = pd.read_csv("examples/ex1.csv")
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [4]:
!cat examples/ex2.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [8]:
pd.read_csv("examples/ex2.csv")

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


In [7]:
pd.read_csv("examples/ex2.csv", header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [9]:

pd.read_csv("examples/ex2.csv", names=["a", "b", "c", "d", "message"])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [10]:
names = ["a", "b", "c", "d", "message"]
pd.read_csv("examples/ex2.csv", names=names, index_col="message")

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


Alternatively

In [12]:
pd.read_csv("examples/ex2.csv", names=names, index_col=4)

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [9]:
!cat examples/csv_mindex.csv

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [13]:
# Hierarchical (Multi-level) index
parsed = pd.read_csv("examples/csv_mindex.csv",
                     index_col=["key1", "key2"])
parsed

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [14]:
parsed.index

MultiIndex([('one', 'a'),
            ('one', 'b'),
            ('one', 'c'),
            ('one', 'd'),
            ('two', 'a'),
            ('two', 'b'),
            ('two', 'c'),
            ('two', 'd')],
           names=['key1', 'key2'])

In [12]:
!cat examples/ex3.txt

            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491


In [16]:
result = pd.read_csv("examples/ex3.txt", sep="\s+") 
#regex: \s: space, +: 1 or more
result # note pandas infer the row index and col names

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [14]:
!cat examples/ex4.csv

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [17]:
pd.read_csv("examples/ex4.csv")#, skiprows=[0, 2, 3])

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,# hey!
a,b,c,d,message
# just wanted to make things more difficult for you,,,,
# who reads CSV files with computers,anyway?,,,
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [18]:
pd.read_csv("examples/ex4.csv", skiprows=[0, 2, 3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [16]:
!cat examples/ex5.csv

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

Note there is a missing value in the second line

In [2]:
import pandas as pd
import numpy as np

In [19]:

result = pd.read_csv("examples/ex5.csv")
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [20]:
pd.isna(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


The `na_values` option accepts a sequence of strings to add to the default list of strings recognized as missing:

In [21]:
result = pd.read_csv("examples/ex5.csv", na_values=["NULL"])
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


`pandas.read_csv` has a list of many default `NA` value representations, but these defaults can be disabled with the `keep_default_na` option:

In [22]:
result2 = pd.read_csv("examples/ex5.csv", keep_default_na=False)
result2 # read NA as a string

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [23]:
result2.isna()

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False


In [24]:
result2.info() #note col "c" dtype is "object"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   something  3 non-null      object
 1   a          3 non-null      int64 
 2   b          3 non-null      int64 
 3   c          3 non-null      object
 4   d          3 non-null      int64 
 5   message    3 non-null      object
dtypes: int64(3), object(3)
memory usage: 276.0+ bytes


In [25]:
result3 = pd.read_csv("examples/ex5.csv", keep_default_na=False,
                      na_values=["NA"])
result3


Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [26]:
result3.isna()

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,False,False,False
2,False,False,False,False,False,False


In [27]:
result3.info() # note col "c" is object dtype

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   something  3 non-null      object
 1   a          3 non-null      int64 
 2   b          3 non-null      int64 
 3   c          3 non-null      object
 4   d          3 non-null      int64 
 5   message    2 non-null      object
dtypes: int64(3), object(3)
memory usage: 276.0+ bytes


Different NA sentinels can be specified for each column in a dictionary:

In [28]:
!cat examples/ex5.csv

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

In [28]:
sentinels = {"message": ["foo", "NA"], "something": ["two"]}
pd.read_csv("examples/ex5.csv", na_values=sentinels,
            keep_default_na=False)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


Table 6.2: Some pandas.read_csv function arguments

Argument|	Description
|:-------------|:------------------------------------------------|
path	|String indicating filesystem location, URL, or file-like object.
sep or delimiter	|Character sequence or regular expression to use to split fields in each row.
header|	Row number to use as column names; defaults to 0 (first row), but should be None if there is no header row.
index_col|	Column numbers or names to use as the row index in the result; can be a single name/number or a list of them for a hierarchical index.
names	|List of column names for result.
skiprows|	Number of rows at beginning of file to ignore or list of row numbers (starting from 0) to skip.
na_values|	Sequence of values to replace with NA. They are added to the default list unless keep_default_na=False is passed.
keep_default_na	|Whether to use the default NA value list or not (True by default).
comment	|Character(s) to split comments off the end of lines.
parse_dates	|Attempt to parse data to datetime; False by default. If True, will attempt to parse all columns. Otherwise, can specify a list of column numbers or names to parse. If element of list is tuple or list, will combine multiple columns together and parse to date (e.g., if date/time split across two columns).
keep_date_col	|If joining columns to parse date, keep the joined columns; False by default.
converters|	Dictionary containing column number or name mapping to functions (e.g., {"foo": f} would apply the function f to all values in the "foo" column).
dayfirst|	When parsing potentially ambiguous dates, treat as international format (e.g., 7/6/2012 -> June 7, 2012); False by default.
date_parser	|Function to use to parse dates.
nrows	|Number of rows to read from beginning of file (not counting the header).
iterator|	Return a TextFileReader object for reading the file piecemeal. This object can also be used with the with statement.
chunksize|	For iteration, size of file chunks.
skip_footer	|Number of lines to ignore at end of file.
verbose	|Print various parsing information, like the time spent in each stage of the file conversion and memory use information.
encoding|	Text encoding (e.g., "utf-8 for UTF-8 encoded text). Defaults to "utf-8" if None.
squeeze	|If the parsed data contains only one column, return a Series.
thousands|	Separator for thousands (e.g., "," or "."); default is None.
decimal|	Decimal separator in numbers (e.g., "." or ","); default is ".".
engine|	CSV parsing and conversion engine to use; can be one of "c", "python", or "pyarrow". The default is "c", though the newer "pyarrow" engine can parse some files much faster. The "python" engine is slower but supports some features that the other engines do not.

### Reading Text Files in Pieces

In [29]:
pd.options.display.max_rows = 10

In [30]:
result = pd.read_csv("examples/ex6.csv")
result

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
...,...,...,...,...,...
9995,2.311896,-0.417070,-1.409599,-0.515821,L
9996,-0.479893,-0.650419,0.745152,-0.646038,E
9997,0.523331,0.787112,0.486066,1.093156,K
9998,-0.362559,0.598894,-1.843201,0.887292,G


In [31]:
pd.read_csv("examples/ex6.csv", nrows=5) #read only 5 rows

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


To read a file in pieces, specify a chunksize as a number of rows:

In [32]:
chunker = pd.read_csv("examples/ex6.csv", chunksize=1000)
type(chunker)

pandas.io.parsers.readers.TextFileReader

In [33]:
result["key"].value_counts()

key
E    368
X    364
L    346
O    343
Q    340
    ... 
5    157
2    152
0    151
9    150
1    146
Name: count, Length: 36, dtype: int64

In [34]:
np.unique(result.key)

array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C',
       'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
       'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'], dtype=object)

In [35]:
chunker = pd.read_csv("examples/ex6.csv", chunksize=1000)

tot = pd.Series([], dtype='int64')
k=0
for piece in chunker:
    if k ==0:
        print(piece["key"].value_counts())
    tot = tot.add(piece["key"].value_counts(), fill_value=0)
    if k==0: print(tot)
    k +=1

tot = tot.sort_values(ascending=False)

key
S    48
O    44
F    40
H    39
Q    39
     ..
1    13
2    11
9    11
0     9
5     9
Name: count, Length: 36, dtype: int64
key
S    48.0
O    44.0
F    40.0
H    39.0
Q    39.0
     ... 
1    13.0
2    11.0
9    11.0
0     9.0
5     9.0
Length: 36, dtype: float64


In [36]:
tot[:10]

key
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

In [36]:
data = pd.read_csv("examples/ex5.csv")
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


### Writing Data to Text Format

In [37]:
data.to_csv("examples/out.csv")


In [6]:
!cat examples/out.csv
# note missing values appears as empty space. 

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [38]:
import sys
data.to_csv(sys.stdout, sep="|") #to console

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [39]:
data.to_csv(sys.stdout, na_rep="NULL") # use "NULL" for missing values

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [40]:
data.to_csv(sys.stdout, index=False, header=False)
#disable row and col labels

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


You can also write only a subset of the columns, and in an order of your choosing:

In [41]:
# output a subset of columns
data.to_csv(sys.stdout, index=False, columns=["a", "b", "c"])

a,b,c
1,2,3.0
5,6,
9,10,11.0


### Working with Other Delimited Formats

In [11]:
!cat examples/ex7.csv

"a","b","c"
"1","2","3"
"1","2","3"


For any file with a single-character delimiter, you can use Python’s built-in `csv` module. To use it, pass any open file or file-like object to `csv.reader`:

In [42]:
import csv
f = open("examples/ex7.csv")
reader = csv.reader(f)

Iterating through the reader like a file yields lists of values with any quote characters removed:

In [43]:
for line in reader:
    print(line)
f.close()

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']


In [44]:
with open("examples/ex7.csv") as f:
    lines = list(csv.reader(f))
lines

[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3']]

In [46]:
header, values = lines[0], lines[1:]
header

['a', 'b', 'c']

In [47]:
values

[['1', '2', '3'], ['1', '2', '3']]

In [48]:
zip(*values)

<zip at 0x29093830f00>

`zip(*values)`: The zip function takes in multiple iterables and returns an iterator of tuples, where each tuple contains the corresponding elements from each iterable. The `*` operator is used to unpack the elements of the values list, effectively passing each internal list as a separate argument to the zip function.

Result: The result is an iterator of tuples, where the first tuple contains the first elements of each of the lists, the second tuple contains the second elements of each of the lists, and so on. This essentially transposes the rows and columns of the original data structure.

For example, consider the following list of lists:

```{python}
values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
```
*values = (1,2,,3), (4,5,6), (7,8,9)

zip(*values) = (1,4,7), (2,5, 8), (3,6, 9)
Using `zip(*values)` on this list would give you:

```
[(1, 4, 7), (2, 5, 8), (3, 6, 9)]
```

In [6]:
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

### Specify CSV delimiters
CSV files come in many different flavors. To define a new format with a different delimiter, string quoting convention, or line terminator, we could define a simple subclass of csv.Dialect:
```
class my_dialect(csv.Dialect):
    lineterminator = "\n"
    delimiter = ";"
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL

reader = csv.reader(f, dialect=my_dialect)
```
We could also give individual CSV dialect parameters as keywords to csv.reader without having to define a subclass:
```
reader = csv.reader(f, delimiter="|")
```
Table 6.3: CSV dialect options
Argument	|Description
|:------------|:-----------------------------------------------------------------|
delimiter	|One-character string to separate fields; defaults to ",".
lineterminator|	Line terminator for writing; defaults to "\r\n". Reader ignores this and recognizes cross-platform line terminators.
quotechar|	Quote character for fields with special characters (like a delimiter); default is '"'.
quoting	|Quoting convention. Options include csv.QUOTE_ALL (quote all fields), csv.QUOTE_MINIMAL (only fields with special characters like the delimiter), csv.QUOTE_NONNUMERIC, and csv.QUOTE_NONE (no quoting). See Python’s documentation for full details. Defaults to QUOTE_MINIMAL.
skipinitialspace|	Ignore whitespace after each delimiter; default is False.
doublequote|	How to handle quoting character inside a field; if True, it is doubled (see online documentation for full detail and behavior).
escapechar|	String to escape the delimiter if quoting is set to csv.QUOTE_NONE; disabled by default.

To write delimited files manually, you can use csv.writer. It accepts an open, writable file object and the same dialect and format options as csv.reader:
```
with open("mydata.csv", "w") as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(("one", "two", "three"))
    writer.writerow(("1", "2", "3"))
    writer.writerow(("4", "5", "6"))
    writer.writerow(("7", "8", "9"))
```

### JSON  Data
JSON (short for JavaScript Object Notation) has become one of the standard formats for sending data by HTTP request between web browsers and other applications. It is a much more free-form data format than a tabular text form like CSV. Here is an example:

In [49]:
obj = """
{"name": "Wes",
 "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]},
              {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}]
}
"""

JSON is very nearly valid Python code with the exception of its null value null and some other nuances (such as disallowing trailing commas at the end of lists). The basic types are objects (dictionaries), arrays (lists), strings, numbers, Booleans, and nulls. All of the keys in an object must be strings. There are several Python libraries for reading and writing JSON data. 

In [50]:
import json
result = json.loads(obj)
result

{'name': 'Wes',
 'cities_lived': ['Akron', 'Nashville', 'New York', 'San Francisco'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 34, 'hobbies': ['guitars', 'soccer']},
  {'name': 'Katie', 'age': 42, 'hobbies': ['diving', 'art']}]}

```
import json
with open('file.json','r') as f:
    obj = json.load(f)

print(obj)    
```

In [51]:
asjson = json.dumps(result) #covert Python obj to JSON
asjson

'{"name": "Wes", "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"], "pet": null, "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]}, {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}]}'

In [11]:
import pandas as pd

In [12]:
siblings = pd.DataFrame(result["siblings"], columns=["name", "age"])
siblings

Unnamed: 0,name,age
0,Scott,34
1,Katie,42


In [4]:
# !cat examples/example.json
!type "examples\example.json" #Windows
# Note it is a JSON list using [ ]

[{"a": 1, "b": 2, "c": 3},
 {"a": 4, "b": 5, "c": 6},
 {"a": 7, "b": 8, "c": 9}]


In [6]:
import pandas as pd

In [7]:
data = pd.read_json("examples/example.json") #assumes each obj is a row
data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [8]:
import sys
data.to_json(sys.stdout) # by col (default)

{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}

In [9]:

data.to_json(sys.stdout, orient="records") # by row

[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]

In [1]:
import pandas as pd

### XML and HTML: Web Scraping

In [None]:
#!conda install lxml beautifulsoup4 html5lib

In [2]:
tables = pd.read_html("examples/fdic_failed_bank_list.html")
len(tables)


1

In [3]:
tables

[                             Bank Name             City  ST   CERT  \
 0                          Allied Bank         Mulberry  AR     91   
 1         The Woodbury Banking Company         Woodbury  GA  11297   
 2               First CornerStone Bank  King of Prussia  PA  35312   
 3                   Trust Company Bank          Memphis  TN   9956   
 4           North Milwaukee State Bank        Milwaukee  WI  20364   
 ..                                 ...              ...  ..    ...   
 542                 Superior Bank, FSB         Hinsdale  IL  32646   
 543                Malta National Bank            Malta  OH   6629   
 544    First Alliance Bank & Trust Co.       Manchester  NH  34264   
 545  National State Bank of Metropolis       Metropolis  IL   3815   
 546                   Bank of Honolulu         Honolulu  HI  21029   
 
                    Acquiring Institution        Closing Date  \
 0                           Today's Bank  September 23, 2016   
 1              

In [4]:
failures = tables[0]


In [5]:
failures.head()

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Allied Bank,Mulberry,AR,91,Today's Bank,"September 23, 2016","November 17, 2016"
1,The Woodbury Banking Company,Woodbury,GA,11297,United Bank,"August 19, 2016","November 17, 2016"
2,First CornerStone Bank,King of Prussia,PA,35312,First-Citizens Bank & Trust Company,"May 6, 2016","September 6, 2016"
3,Trust Company Bank,Memphis,TN,9956,The Bank of Fayette County,"April 29, 2016","September 6, 2016"
4,North Milwaukee State Bank,Milwaukee,WI,20364,First-Citizens Bank & Trust Company,"March 11, 2016","June 16, 2016"


In [6]:
failures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 547 entries, 0 to 546
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Bank Name              547 non-null    object
 1   City                   547 non-null    object
 2   ST                     547 non-null    object
 3   CERT                   547 non-null    int64 
 4   Acquiring Institution  547 non-null    object
 5   Closing Date           547 non-null    object
 6   Updated Date           547 non-null    object
dtypes: int64(1), object(6)
memory usage: 30.0+ KB


In [8]:
close_timestamps = pd.to_datetime(failures["Closing Date"])
close_timestamps


0     2016-09-23
1     2016-08-19
2     2016-05-06
3     2016-04-29
4     2016-03-11
         ...    
542   2001-07-27
543   2001-05-03
544   2001-02-02
545   2000-12-14
546   2000-10-13
Name: Closing Date, Length: 547, dtype: datetime64[ns]

In [9]:
close_timestamps.dt.year.value_counts()

Closing Date
2010    157
2009    140
2011     92
2012     51
2008     25
2013     24
2014     18
2002     11
2015      8
2016      5
2004      4
2001      4
2007      3
2003      3
2000      2
Name: count, dtype: int64

### Parsing XML with lxml.objectify

In [11]:
from lxml import objectify

path = "datasets/mta_perf/Performance_MNR.xml"
with open(path) as f:
    parsed = objectify.parse(f)
root = parsed.getroot()
root

<Element PERFORMANCE at 0x24ef14489c0>

`root.INDICATOR` returns a generator yielding each <INDICATOR> XML element. For each record, we can populate a dictionary of tag names (like YTD_ACTUAL) to data values (excluding a few tags) by running the following code:

In [12]:
data = []

skip_fields = ["PARENT_SEQ", "INDICATOR_SEQ",
               "DESIRED_CHANGE", "DECIMAL_PLACES"]

for elt in root.INDICATOR:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)

In [13]:
perf = pd.DataFrame(data)
perf.head()

Unnamed: 0,AGENCY_NAME,INDICATOR_NAME,DESCRIPTION,PERIOD_YEAR,PERIOD_MONTH,CATEGORY,FREQUENCY,INDICATOR_UNIT,YTD_TARGET,YTD_ACTUAL,MONTHLY_TARGET,MONTHLY_ACTUAL
0,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,1,Service Indicators,M,%,95.0,96.9,95.0,96.9
1,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,2,Service Indicators,M,%,95.0,96.0,95.0,95.0
2,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,3,Service Indicators,M,%,95.0,96.3,95.0,96.9
3,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,4,Service Indicators,M,%,95.0,96.8,95.0,98.3
4,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,5,Service Indicators,M,%,95.0,96.6,95.0,95.8


In [14]:
perf2 = pd.read_xml(path)
perf2.head()

Unnamed: 0,INDICATOR_SEQ,PARENT_SEQ,AGENCY_NAME,INDICATOR_NAME,DESCRIPTION,PERIOD_YEAR,PERIOD_MONTH,CATEGORY,FREQUENCY,DESIRED_CHANGE,INDICATOR_UNIT,DECIMAL_PLACES,YTD_TARGET,YTD_ACTUAL,MONTHLY_TARGET,MONTHLY_ACTUAL
0,28445,,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,1,Service Indicators,M,U,%,1,95.0,96.9,95.0,96.9
1,28445,,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,2,Service Indicators,M,U,%,1,95.0,96.0,95.0,95.0
2,28445,,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,3,Service Indicators,M,U,%,1,95.0,96.3,95.0,96.9
3,28445,,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,4,Service Indicators,M,U,%,1,95.0,96.8,95.0,98.3
4,28445,,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,5,Service Indicators,M,U,%,1,95.0,96.6,95.0,95.8


## Binary Data Formats

One simple way to store (or serialize) data in binary format is using Python’s built-in `pickle` module. pandas objects all have a `to_pickle` method that writes the data to disk in pickle format:

In [15]:
frame = pd.read_csv("examples/ex1.csv")
frame


Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [16]:
frame.to_pickle("examples/frame_pickle")

Pickle files are in general readable only in Python. You can read any "pickled" object stored in a file by using the built-in pickle directly, or even more conveniently using pandas.read_pickle:

In [17]:
pd.read_pickle("examples/frame_pickle")

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [20]:
#!rm examples/frame_pickle
!del examples\frame_pickle # Windows

pandas has built-in support for several other open source binary data formats, such as `HDF5`, `ORC`, and `Apache Parquet`. For example, if you install the pyarrow package (`conda install pyarrow`), then you can read Parquet files with `pandas.read_parquet`:

In [22]:
fec = pd.read_parquet('datasets/fec/fec.parquet')
fec

Unnamed: 0,cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num
0,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,366010290,RETIRED,RETIRED,250.0,20-JUN-11,,,,SA17A,736166
1,C00410118,P20002978,"Bachmann, Michelle","HARVEY, WILLIAM",MOBILE,AL,366010290,RETIRED,RETIRED,50.0,23-JUN-11,,,,SA17A,736166
2,C00410118,P20002978,"Bachmann, Michelle","SMITH, LANIER",LANETT,AL,368633403,INFORMATION REQUESTED,INFORMATION REQUESTED,250.0,05-JUL-11,,,,SA17A,749073
3,C00410118,P20002978,"Bachmann, Michelle","BLEVINS, DARONDA",PIGGOTT,AR,724548253,NONE,RETIRED,250.0,01-AUG-11,,,,SA17A,749073
4,C00410118,P20002978,"Bachmann, Michelle","WARDENBURG, HAROLD",HOT SPRINGS NATION,AR,719016467,NONE,RETIRED,300.0,20-JUN-11,,,,SA17A,736166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001726,C00500587,P20003281,"Perry, Rick","GORMAN, CHRIS D. MR.",INFO REQUESTED,XX,99999,INFORMATION REQUESTED PER BEST EFFORTS,INFORMATION REQUESTED PER BEST EFFORTS,5000.0,29-SEP-11,REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM...,,REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM...,SA17A,751678
1001727,C00500587,P20003281,"Perry, Rick","DUFFY, DAVID A. MR.",INFO REQUESTED,XX,99999,DUFFY EQUIPMENT COMPANY INC.,BUSINESS OWNER,2500.0,30-SEP-11,,,,SA17A,751678
1001728,C00500587,P20003281,"Perry, Rick","GRANE, BRYAN F. MR.",INFO REQUESTED,XX,99999,INFORMATION REQUESTED PER BEST EFFORTS,INFORMATION REQUESTED PER BEST EFFORTS,500.0,29-SEP-11,,,,SA17A,751678
1001729,C00500587,P20003281,"Perry, Rick","TOLBERT, DARYL MR.",INFO REQUESTED,XX,99999,T.A.C.C.,LONGWALL MAINTENANCE FOREMAN,500.0,30-SEP-11,,,,SA17A,751678


### Reading Microsoft Excel Files

### Install 
!conda install openpyxl

In [24]:
xlsx = pd.ExcelFile("examples/ex1.xlsx")

In [25]:
xlsx.sheet_names

['Sheet1']

In [26]:
xlsx

<pandas.io.excel._base.ExcelFile at 0x24ef162cc90>

In [27]:
xlsx.parse(sheet_name="Sheet1")

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [28]:
xlsx.parse(sheet_name="Sheet1", index_col=0)

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [29]:
frame = pd.read_excel("examples/ex1.xlsx", sheet_name="Sheet1")
frame

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


To write pandas data to Excel format, one way is to  first create an ExcelWriter, then write data to it using the pandas object's `to_excel` method:

In [30]:
writer = pd.ExcelWriter("examples/ex2.xlsx")
frame.to_excel(writer, "Sheet1")
writer.close()

In [31]:
frame.to_excel("examples/ex2.xlsx")

In [33]:
#!rm examples/ex2.xlsx
!del examples\ex2.xlsx

### Using HDF5 Format
HDF5 is a respected file format intended for storing large quantities of scientific array data. It is available as a C library, and it has interfaces available in many other languages, including Java, Julia, MATLAB, and Python. The “HDF” in HDF5 stands for hierarchical data format. Each HDF5 file can store multiple datasets and supporting metadata. Compared with simpler formats, HDF5 supports on-the-fly compression with a variety of compression modes, enabling data with repeated patterns to be stored more efficiently. HDF5 can be a good choice for working with datasets that don't fit into memory, as you can efficiently read and write small sections of much larger arrays.

To get started with HDF5 and pandas, you must first install PyTables by installing the tables package with conda:
```
conda install pytables
```

Note that the PyTables package is called "tables" in PyPI, so if you install with pip you will have to run `pip install tables`.

In [35]:
# !rm -f examples/mydata.h5
!del /f examples\mydata.h5

Could Not Find e:\OneDrive - Auburn University Montgomery\teaching\AUM\STAT 1010 Introduction to Data Science\Lectures\lecturenotes\examples\mydata.h5


In [37]:
import numpy as np

In [38]:
frame = pd.DataFrame({"a": np.random.standard_normal(100)})
frame

Unnamed: 0,a
0,-0.325638
1,0.537169
2,-1.822306
3,-2.523819
4,0.311732
...,...
95,-0.100347
96,1.291238
97,0.101749
98,1.450877


In [44]:

store = pd.HDFStore("examples/mydata.h5")
store["obj1"] = frame
store["obj1_col"] = frame["a"]
store

<class 'pandas.io.pytables.HDFStore'>
File path: examples/mydata.h5

In [45]:
store["obj1"]

Unnamed: 0,a
0,-0.325638
1,0.537169
2,-1.822306
3,-2.523819
4,0.311732
...,...
95,-0.100347
96,1.291238
97,0.101749
98,1.450877


HDFStore supports two storage schemas, "fixed" and "table" (the default is "fixed"). The latter is generally slower, but it supports query operations using a special syntax. 

The `put` is an explicit version of the `store["obj2"] = frame` method but allows us to set other options like the storage format.

In [46]:
store.put("obj2", frame, format="table")
store.select("obj2", where=["index >= 10 and index <= 15"])


Unnamed: 0,a
10,-1.032875
11,-0.077081
12,-0.747192
13,-0.920171
14,-0.036376
15,-0.063467


In [47]:
store["obj2"]

Unnamed: 0,a
0,-0.325638
1,0.537169
2,-1.822306
3,-2.523819
4,0.311732
...,...
95,-0.100347
96,1.291238
97,0.101749
98,1.450877


In [48]:
store["obj1"]

Unnamed: 0,a
0,-0.325638
1,0.537169
2,-1.822306
3,-2.523819
4,0.311732
...,...
95,-0.100347
96,1.291238
97,0.101749
98,1.450877


In [49]:
store.close()

In [50]:
frame.to_hdf("examples/mydata.h5", "obj3", format="table")
pd.read_hdf("examples/mydata.h5", "obj3", where=["index < 5"])

Unnamed: 0,a
0,-0.325638
1,0.537169
2,-1.822306
3,-2.523819
4,0.311732


In [51]:
import os
os.remove("examples/mydata.h5")

HDF5 is not a database. It is best suited for write-once, read-many datasets. While data can be added to a file at any time, if multiple writers do so simultaneously, the file can become corrupted.

## 6.3 Interacting with Web APIs
 Install in the venv 
 
 ```
 conda install requests
 ```

Example: To find the last 30 GitHub issues for pandas on GitHub, we can make a GET HTTP request using the add-on requests library:

<font color='red'>Note</font>

:::{.callout-note}
Note that there are five types of callouts, including: 
`note`, `tip`, `warning`, `caution`, and `important`.


In [52]:
import requests
url = "https://api.github.com/repos/pandas-dev/pandas/issues"
resp = requests.get(url)
resp.raise_for_status()
resp


<Response [200]>

It's a good practice to always call raise_for_status after using requests.get to check for HTTP errors.

The response object’s json method will return a Python object containing the parsed JSON data as a dictionary or list (depending on what JSON is returned):

In [54]:
data = resp.json()
data

[{'url': 'https://api.github.com/repos/pandas-dev/pandas/issues/55607',
  'repository_url': 'https://api.github.com/repos/pandas-dev/pandas',
  'labels_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/55607/labels{/name}',
  'comments_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/55607/comments',
  'events_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/55607/events',
  'html_url': 'https://github.com/pandas-dev/pandas/pull/55607',
  'id': 1954538147,
  'node_id': 'PR_kwDOAA0YD85dZUhv',
  'number': 55607,
  'title': 'REF: initialize Series._name for class instead of after _from_mgr',
  'user': {'login': 'jorisvandenbossche',
   'id': 1020496,
   'node_id': 'MDQ6VXNlcjEwMjA0OTY=',
   'avatar_url': 'https://avatars.githubusercontent.com/u/1020496?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/jorisvandenbossche',
   'html_url': 'https://github.com/jorisvandenbossche',
   'followers_url': 'https://api.github.com/users/jorisvande

In [55]:
data[0]["title"]

'REF: initialize Series._name for class instead of after _from_mgr'

In [56]:
issues = pd.DataFrame(data, columns=["number", "title",
                                     "labels", "state"])
issues

Unnamed: 0,number,title,labels,state
0,55607,REF: initialize Series._name for class instead...,[],open
1,55606,BUG: regression in read_parquet that raises a ...,"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
2,55605,BUG: hash_array does not produce deterministic...,"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
3,55604,BUG: using iloc/loc to set a nullable int type...,"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
4,55603,TST: move misplaced tests,[],open
5,55601,BUG(?) `Index.__getitem__` with 0D numpy array...,"[{'id': 2822098, 'node_id': 'MDU6TGFiZWwyODIyM...",open
6,55598,ENH: Allow rank to return int64 for numpy type...,"[{'id': 76812, 'node_id': 'MDU6TGFiZWw3NjgxMg=...",open
7,55594,BLD: Allow building with NumPy nightlies and u...,"[{'id': 129350, 'node_id': 'MDU6TGFiZWwxMjkzNT...",open
8,55592,ENH: functions filter_columns and filter_rows ...,[],open
9,55591,DEPR: fix stacklevel for DataFrame(mgr) deprec...,"[{'id': 49094459, 'node_id': 'MDU6TGFiZWw0OTA5...",open


## Interacting with Databases

In [None]:
import sqlite3 
# create a SQLite3 database

query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""

con = sqlite3.connect("mydata.sqlite") # establish a connetion to the database in the file mydata.sqlite (or create it)
con.execute(query) #execute single SQL command
con.commit() #saves the changes made to the database

In [59]:
data = [("Atlanta", "Georgia", 1.25, 6),
        ("Tallahassee", "Florida", 2.6, 3),
        ("Sacramento", "California", 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)" # ? means data will be provided later

con.executemany(stmt, data) #execute the same SQL command multiple times for each tuple in the list
con.commit()

Most Python SQL drivers return a list of tuples when selecting data from a table:

In [60]:
cursor = con.execute("SELECT * FROM test")
rows = cursor.fetchall()
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

You can pass the list of tuples to the DataFrame constructor, but you also need the column names, contained in the cursor’s description attribute. Note that for SQLite3, the cursor description only provides column names (the other fields, which are part of Python's Database API specification, are None), but for some other database drivers, more column information is provided:

In [61]:
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

In [62]:

pd.DataFrame(rows, columns=[x[0] for x in cursor.description])

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


This is quite a bit of munging that you’d rather not repeat each time you query the database. The SQLAlchemy project is a popular Python SQL toolkit that abstracts away many of the common differences between SQL databases. pandas has a `read_sql` function that enables you to read data easily from a general SQLAlchemy connection. You can install SQLAlchemy with conda like so:
```
conda install sqlalchemy
```

In [63]:
import sqlalchemy as sqla
db = sqla.create_engine("sqlite:///mydata.sqlite")
pd.read_sql("SELECT * FROM test", db)

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


In [68]:
# !rm mydata.sqlite
!del /F mydata.sqlite


e:\OneDrive - Auburn University Montgomery\teaching\AUM\STAT 1010 Introduction to Data Science\Lectures\lecturenotes\mydata.sqlite


The process cannot access the file because it is being used by another process.
