# DSC350 - Week 8 - Exercise 8.2

We begin the exercises this week by importing the necessary libraries and files.

In [1]:
import numpy as np
import pandas as pd
from tempfile import NamedTemporaryFile
from os.path import getsize
import json
from bs4 import BeautifulSoup
import re
import warnings
warnings.filterwarnings('ignore')

## Chapter 5 : Retrieving, Processing, and Storing Data

## Writing CSV files

**1. Generate a 3X4 NumPy array after seeing the random generator.**
 - a) Save the array as a CSV named "np.csv".
 - b) View the np field with the cat command (doesn't need to be included in code, just so you can verify the file looks correct).
 - c) Create a DataFrame from the file and print the results.
 - d) Write the DataFrame to a CSV file.

In [2]:
# Use NumPy random generator
np.random.seed(42)

# Generate a 3X4 array
a = np.random.randn(3, 4)
a[2][2] = np.nan
print(a)
# Save array as CSV with defined file name
np.savetxt('np.csv', a, fmt='%.2f', delimiter=',', header=" #1, #2,  #3,  #4")
# Create dataframe from file and display results
df = pd.DataFrame(a)
print(df)
# Write newly created dataframe to CSV file
df.to_csv('pd.csv', float_format='%.2f', na_rep="NAN!")

[[ 0.49671415 -0.1382643   0.64768854  1.52302986]
 [-0.23415337 -0.23413696  1.57921282  0.76743473]
 [-0.46947439  0.54256004         nan -0.46572975]]
          0         1         2         3
0  0.496714 -0.138264  0.647689  1.523030
1 -0.234153 -0.234137  1.579213  0.767435
2 -0.469474  0.542560       NaN -0.465730


## Comparing binary .npy format and pickle format

**2. Generate a 365X4 NumPy array with random values.**
 - a) Store the array in a CSV file and check its size.
 - b) Save the array in the NumPy format, load the array, check its shape and the size of the file.
 - c) Create a DataFrame from this array you have created and write it to a pickle, then retrieve it from the pickle.
 - d) Print the size of the pickle.

In [3]:
# Use NumPy to generate a 365X4 array with random values
np.random.seed(42)
a = np.random.randn(365, 4)

# Store array as CSV and check size
with NamedTemporaryFile(delete=False, suffix='.csv') as tmp_csv:
    np.savetxt(tmp_csv.name, a, delimiter=',')
    csv_size = getsize(tmp_csv.name)

print(f"CSV file size: {csv_size} bytes")

# Save array in NumPy format
with NamedTemporaryFile(delete=False, suffix='.npy') as tmp_npy:
    np.save(tmp_npy.name, a)
    npy_size = getsize(tmp_npy.name)

# Load array from NumPy and check its shape/file size
loaded_array = np.load(tmp_npy.name)
array_shape = loaded_array.shape

print(f"Array shape: {array_shape}")
print(f"NumPy file size: {npy_size} bytes")

# Create dataframe from the array
df = pd.DataFrame(loaded_array)

# Write dataframe to pickle and retrieve it
with NamedTemporaryFile(delete=False, suffix='.pkl') as tmp_pkl:
    df.to_pickle(tmp_pkl.name)
    pickle_size = getsize(tmp_pkl.name)

# Load dataframe from pickle
loaded_df = pd.read_pickle(tmp_pkl.name)

# Display results
print(f"Pickle file size: {pickle_size} bytes")

CSV file size: 37562 bytes
Array shape: (365, 4)
NumPy file size: 11808 bytes
Pickle file size: 12239 bytes


## Reading and writing to Excel

**3. Using the array created in #2, create an excel file with this data.**
 - a) After that is complete, create a dataframe from the excel.
 - b) Print your results.

In [4]:
# Create Excel file
with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_excel:
    df.to_excel(tmp_excel.name, index=False)
    excel_file_path = tmp_excel.name

# Create dataframe from Excel file and display results
df_from_excel = pd.read_excel(excel_file_path)
print(df_from_excel)

            0         1         2         3
0    0.496714 -0.138264  0.647689  1.523030
1   -0.234153 -0.234137  1.579213  0.767435
2   -0.469474  0.542560 -0.463418 -0.465730
3    0.241962 -1.913280 -1.724918 -0.562288
4   -1.012831  0.314247 -0.908024 -1.412304
..        ...       ...       ...       ...
360  0.662881  1.173474  0.181022 -1.296832
361  0.399688 -0.651357 -0.528617  0.586364
362  1.238283  0.021272  0.308833  1.702215
363  0.240753  2.601683  0.565510 -1.760763
364  0.753342  0.381158  1.289753  0.673181

[365 rows x 4 columns]


## Using REST and JSON

**4. Using this JSON string, parse a JSON string with the loads() function.**
<br>
<br>
'{"country":"Netherlands","dma_code":"0","timezone":"Europe\/Amsterdam","area_code":"0","ip":"46.19.37.108","asn":"AS196752","continent_code":"EU","ips":"Tilaa V.O.F.","longitude":5.75,"latitude":52.5,"country_code":"NL","country_code3":"NLD"}'
<br>
 - a) Print the values for the "Country" column.
 - b) Overwrite the value for Netherlands with the value of your choice.
 - c) Print your results.

In [5]:
# Define JSON string
json_str = '{"country":"Netherlands","dma_code":"0","timezone":"Europe\/Amsterdam","area_code":"0","ip":"46.19.37.108","asn":"AS196752","continent_code":"EU","isp":"Tilaa V.O.F.","longitude":5.75,"latitude":52.5,"country_code":"NL","country_code3":"NLD"}'

# Parse string with the loads() function
data = json.loads(json_str)
# Print values for "Country" column
print("Country", data["country"])
# Overwrite value for Netherlands with different country]
data["country"] = "Brazil"
# Display results
print(json.dumps(data))

Country Netherlands
{"country": "Brazil", "dma_code": "0", "timezone": "Europe/Amsterdam", "area_code": "0", "ip": "46.19.37.108", "asn": "AS196752", "continent_code": "EU", "isp": "Tilaa V.O.F.", "longitude": 5.75, "latitude": 52.5, "country_code": "NL", "country_code3": "NLD"}


**5. Using the Pandas read_json() function, we can either create a pandas Series or DataFrame - taking the JSON string from #4, create a series.**
 - a) Change the country value again to your choice and convert the Pandas Series to a JSON string.

In [6]:
# Define JSON string
json_str = '{"country":"Netherlands","dma_code":"0","timezone":"Europe\/Amsterdam","area_code":"0","ip":"46.19.37.108","asn":"AS196752","continent_code":"EU","isp":"Tilaa V.O.F.","longitude":5.75,"latitude":52.5,"country_code":"NL","country_code3":"NLD"}'

# Use read_json() function to create a series
data = pd.read_json(json_str, typ='series')
print("Pandas Series\n", data)

# Change value previously chosen
data["country"] = "Brazil"
# Convert series to JSON string
print("\nRevised Series\n", data.to_json())

Pandas Series
 country                Netherlands
dma_code                         0
timezone          Europe/Amsterdam
area_code                        0
ip                    46.19.37.108
asn                       AS196752
continent_code                  EU
isp                   Tilaa V.O.F.
longitude                     5.75
latitude                      52.5
country_code                    NL
country_code3                  NLD
dtype: object

Revised Series
 {"country":"Brazil","dma_code":"0","timezone":"Europe\/Amsterdam","area_code":"0","ip":"46.19.37.108","asn":"AS196752","continent_code":"EU","isp":"Tilaa V.O.F.","longitude":5.75,"latitude":52.5,"country_code":"NL","country_code3":"NLD"}


## Parsing HTML with Beautiful Soup

**6. Starting on page 124 - follow along with the BeautifulSoup exercise to scrape data from the HTML page included in the GitHub repo. This exercise is great practice for your Term Project Milestone 4.**

In [7]:
# Define HTML page from repo
soup = BeautifulSoup(open(r'C:\Users\thefli0\Downloads\loremIpsum.html'),"lxml")

print("First div\n", soup.div)
print("First div class", soup.div['class'])

print("First dfn text", soup.dl.dt.dfn.text)

for link in soup.find_all('a'):
   print("Link text", link.string, "URL", link.get('href'))

# Omitting find_all
for i, div in enumerate(soup('div')):
   print(i, div.contents)


#Div with id=official
official_div = soup.find_all("div", id="official")
print("Official Version", official_div[0].contents[2].strip())

print("# elements with class", len(soup.find_all(class_=True)))

tile_class = soup.find_all("div", class_="tile")
print("# Tile classes", len(tile_class))

print("# Divs with class containing tile", len(soup.find_all("div", class_=re.compile("tile"))))

print("Using CSS selector\n", soup.select('div.notile'))
print("Selecting ordered list list items\n", soup.select("ol > li")[:2])
print("Second list item in ordered list", soup.select("ol > li:nth-of-type(2)"))

print("Searching for text string", soup.find_all(text=re.compile("2014")))

First div
 <div class="tile">
<h4>Development</h4>
     0.10.1 - July 2014<br/>
</div>
First div class ['tile']
First dfn text Quare attende, quaeso.
Link text loripsum.net URL http://loripsum.net/
Link text Poterat autem inpune; URL http://loripsum.net/
Link text Is es profecto tu. URL http://loripsum.net/
0 ['\n', <h4>Development</h4>, '\n     0.10.1 - July 2014', <br/>, '\n']
1 ['\n', <h4>Official Release</h4>, '\n     0.10.0 June 2014', <br/>, '\n']
2 ['\n', <h4>Previous Release</h4>, '\n     0.09.1 June 2013', <br/>, '\n']
Official Version 0.10.0 June 2014
# elements with class 3
# Tile classes 2
# Divs with class containing tile 3
Using CSS selector
 [<div class="notile">
<h4>Previous Release</h4>
     0.09.1 June 2013<br/>
</div>]
Selecting ordered list list items
 [<li>Cur id non ita fit?</li>, <li>In qua si nihil est praeter rationem, sit in una virtute finis bonorum;</li>]
Second list item in ordered list [<li>In qua si nihil est praeter rationem, sit in una virtute finis bon