# Chapter 5 : Retrieving, Processing, and Storing Data

# Writing CSV files

In [1]:
import numpy as np
import pandas as pd

np.random.seed(42)

a = np.random.randn(3, 4)
a[2][2] = np.nan
print(a)
np.savetxt('np.csv', a, fmt='%.2f', delimiter=',', header=" #1, #2,  #3,  #4")
df = pd.DataFrame(a)
print(df)
df.to_csv('pd.csv', float_format='%.2f', na_rep="NAN!")

[[ 0.49671415 -0.1382643   0.64768854  1.52302986]
 [-0.23415337 -0.23413696  1.57921282  0.76743473]
 [-0.46947439  0.54256004         nan -0.46572975]]
          0         1         2         3
0  0.496714 -0.138264  0.647689  1.523030
1 -0.234153 -0.234137  1.579213  0.767435
2 -0.469474  0.542560       NaN -0.465730


# Comparing binary .npy format and pickle format

In [2]:
import numpy as np
import pandas as pd
from tempfile import NamedTemporaryFile
from os.path import getsize

np.random.seed(42)
a = np.random.randn(365, 4)

tmpf = NamedTemporaryFile()
np.savetxt(tmpf, a, delimiter=',')
print("Size CSV file", getsize(tmpf.name))

tmpf = NamedTemporaryFile()
np.save(tmpf, a)
tmpf.seek(0)
loaded = np.load(tmpf)
print("Shape", loaded.shape)
print("Size .npy file", getsize(tmpf.name))

df = pd.DataFrame(a)
df.to_pickle(tmpf.name)
print("Size pickled dataframe", getsize(tmpf.name))
print("DF from pickle\n", pd.read_pickle(tmpf.name))


Size CSV file 36693
Shape (365, 4)
Size .npy file 11808
Size pickled dataframe 12254
DF from pickle
             0         1         2         3
0    0.496714 -0.138264  0.647689  1.523030
1   -0.234153 -0.234137  1.579213  0.767435
2   -0.469474  0.542560 -0.463418 -0.465730
3    0.241962 -1.913280 -1.724918 -0.562288
4   -1.012831  0.314247 -0.908024 -1.412304
5    1.465649 -0.225776  0.067528 -1.424748
6   -0.544383  0.110923 -1.150994  0.375698
7   -0.600639 -0.291694 -0.601707  1.852278
8   -0.013497 -1.057711  0.822545 -1.220844
9    0.208864 -1.959670 -1.328186  0.196861
10   0.738467  0.171368 -0.115648 -0.301104
11  -1.478522 -0.719844 -0.460639  1.057122
12   0.343618 -1.763040  0.324084 -0.385082
13  -0.676922  0.611676  1.031000  0.931280
14  -0.839218 -0.309212  0.331263  0.975545
15  -0.479174 -0.185659 -1.106335 -1.196207
16   0.812526  1.356240 -0.072010  1.003533
17   0.361636 -0.645120  0.361396  1.538037
18  -0.035826  1.564644 -2.619745  0.821903
19   0.087047 -0.29

In [3]:
import numpy as np
import tables
from tempfile import NamedTemporaryFile
from os.path import getsize

np.random.seed(42)
a = np.random.randn(365, 4)

tmpf = NamedTemporaryFile()
h5file = tables.open_file(tmpf.name, mode='w', title="NumPy Array")
root = h5file.root
h5file.create_array(root, "array", a)
h5file.close()

h5file = tables.open_file(tmpf.name, "r")
print(getsize(tmpf.name))

for node in h5file.root:
   b = node.read()
   print(type(b), b.shape)

h5file.close()

13728
<class 'numpy.ndarray'> (365, 4)


# Reading and writing DataFrames to HDF5

In [5]:
import numpy as np
import pandas as pd
from tempfile import NamedTemporaryFile

np.random.seed(42)
a = np.random.randn(365, 4)

tmpf = NamedTemporaryFile()
store = pd.io.pytables.HDFStore(tmpf.name)
print(store)

df = pd.DataFrame(a)
store['df'] = df
print(store)

print("Get", store.get('df').shape)
print("Lookup", store['df'].shape)
print( "Dotted", store.df.shape)

del store['df']
print("After del\n", store)

print("Before close", store.is_open)
store.close()
print("After close", store.is_open)

df.to_hdf('test.h5', 'data', format='table')
print(pd.read_hdf('test.h5', 'data', where=['index>363']))

<class 'pandas.io.pytables.HDFStore'>
File path: /tmp/tmp39ozrpzo

<class 'pandas.io.pytables.HDFStore'>
File path: /tmp/tmp39ozrpzo

Get (365, 4)
Lookup (365, 4)
Dotted (365, 4)
After del
 <class 'pandas.io.pytables.HDFStore'>
File path: /tmp/tmp39ozrpzo

Before close True
After close False
            0         1         2         3
364  0.753342  0.381158  1.289753  0.673181


# Reading and writing to Excel

In [6]:
import numpy as np
import pandas as pd
from tempfile import NamedTemporaryFile

np.random.seed(42)
a = np.random.randn(365, 4)

tmpf = NamedTemporaryFile(suffix='.xlsx')
df = pd.DataFrame(a)
print(tmpf.name)
df.to_excel(tmpf.name, sheet_name='Random Data')
print("Means\n", pd.read_excel(tmpf.name, 'Random Data').mean())

/tmp/tmp6eb111c4.xlsx
Means
 0    0.037860
1    0.024483
2    0.059836
3    0.058417
dtype: float64


# Using REST and JSON

In [7]:
import json

json_str = '{"country":"Netherlands","dma_code":"0","timezone":"Europe\/Amsterdam","area_code":"0","ip":"46.19.37.108","asn":"AS196752","continent_code":"EU","isp":"Tilaa V.O.F.","longitude":5.75,"latitude":52.5,"country_code":"NL","country_code3":"NLD"}'

data = json.loads(json_str)
print("Country", data["country"])
data["country"] = "Brazil"
print(json.dumps(data))

Country Netherlands
{"country": "Brazil", "dma_code": "0", "timezone": "Europe/Amsterdam", "area_code": "0", "ip": "46.19.37.108", "asn": "AS196752", "continent_code": "EU", "isp": "Tilaa V.O.F.", "longitude": 5.75, "latitude": 52.5, "country_code": "NL", "country_code3": "NLD"}


In [8]:
import pandas as pd

json_str = '{"country":"Netherlands","dma_code":"0","timezone":"Europe\/Amsterdam","area_code":"0","ip":"46.19.37.108","asn":"AS196752","continent_code":"EU","isp":"Tilaa V.O.F.","longitude":5.75,"latitude":52.5,"country_code":"NL","country_code3":"NLD"}'

data = pd.read_json(json_str, typ='series')
print("Series\n", data)

data["country"] = "Brazil"
print("New Series\n", data.to_json())

Series
 area_code                        0
asn                       AS196752
continent_code                  EU
country                Netherlands
country_code                    NL
country_code3                  NLD
dma_code                         0
ip                    46.19.37.108
isp                   Tilaa V.O.F.
latitude                      52.5
longitude                     5.75
timezone          Europe/Amsterdam
dtype: object
New Series
 {"area_code":"0","asn":"AS196752","continent_code":"EU","country":"Brazil","country_code":"NL","country_code3":"NLD","dma_code":"0","ip":"46.19.37.108","isp":"Tilaa V.O.F.","latitude":52.5,"longitude":5.75,"timezone":"Europe\/Amsterdam"}


# Parsing RSS and Atom Feeds

In [10]:
import feedparser as fp

rss = fp.parse("http://www.packtpub.com/rss.xml")

print("# Entries", len(rss.entries))

# Entries 10


In [11]:
for i, entry in enumerate(rss.entries):
   if "Java" in entry.summary:
      print(i, entry.title)
      print(entry.summary)

6 FastTrack to OOP - Classes and Interfaces
<p>In this article by <strong>Mohamed Sanaulla</strong> and <strong>Nick Samoylov</strong>, the authors of <a href="https://www.packtpub.com/application-development/java-9-cookbook">Java 9 Cookbook</a>, we will cover the following recipe:</p>
<ul>
<li>Implementing object-oriented design using classes</li>
</ul>
<p style="margin-left: 40px; margin-right: 40px;" align="center"><em>(For more resources related to this topic, see <a href="https://www.packtpub.com/application-development/java-9-cookbook">here</a>.)</em></p>
<p><span style="background-color: transparent;"><strong>Implementing object-oriented design using classes</strong> </span></p>
<p><span style="background-color: transparent;">In this recipe, you will learn about the first two OOD concepts--object/class and encapsulation. </span></p>
<p><span style="background-color: transparent;"><strong>Getting ready</strong> </span></p>
<p><a href="http://www.packtpub.com/books/content/fasttra

# Parsing HTML with Beautiful Soup

In [12]:
from bs4 import BeautifulSoup
import re

soup = BeautifulSoup(open('loremIpsum.html'),"lxml")

print("First div\n", soup.div)
print("First div class", soup.div['class'])

print("First dfn text", soup.dl.dt.dfn.text)

for link in soup.find_all('a'):
   print("Link text", link.string, "URL", link.get('href'))

# Omitting find_all
for i, div in enumerate(soup('div')):
   print(i, div.contents)


#Div with id=official
official_div = soup.find_all("div", id="official")
print("Official Version", official_div[0].contents[2].strip())

print("# elements with class", len(soup.find_all(class_=True)))

tile_class = soup.find_all("div", class_="tile")
print("# Tile classes", len(tile_class))

print("# Divs with class containing tile", len(soup.find_all("div", class_=re.compile("tile"))))

print("Using CSS selector\n", soup.select('div.notile'))
print("Selecting ordered list list items\n", soup.select("ol > li")[:2])
print("Second list item in ordered list", soup.select("ol > li:nth-of-type(2)"))

print("Searching for text string", soup.find_all(text=re.compile("2014")))

First div
 <div class="tile">
<h4>Development</h4>
     0.10.1 - July 2014<br/>
</div>
First div class ['tile']
First dfn text Quare attende, quaeso.
Link text loripsum.net URL http://loripsum.net/
Link text Poterat autem inpune; URL http://loripsum.net/
Link text Is es profecto tu. URL http://loripsum.net/
0 ['\n', <h4>Development</h4>, '\n     0.10.1 - July 2014', <br/>, '\n']
1 ['\n', <h4>Official Release</h4>, '\n     0.10.0 June 2014', <br/>, '\n']
2 ['\n', <h4>Previous Release</h4>, '\n     0.09.1 June 2013', <br/>, '\n']
Official Version 0.10.0 June 2014
# elements with class 3
# Tile classes 2
# Divs with class containing tile 3
Using CSS selector
 [<div class="notile">
<h4>Previous Release</h4>
     0.09.1 June 2013<br/>
</div>]
Selecting ordered list list items
 [<li>Cur id non ita fit?</li>, <li>In qua si nihil est praeter rationem, sit in una virtute finis bonorum;</li>]
Second list item in ordered list [<li>In qua si nihil est praeter rationem, sit in una virtute finis bon