# P3-L0:  数据提取基础

## 练习: 解析 CSV 文件

In [78]:
# Your task is to read the input DATAFILE line by line, and for the first 10 lines (not including the header)
# split each line on "," and then for each line, create a dictionary
# where the key is the header title of the field, and the value is the value of that field in the row.
# The function parse_file should return a list of dictionaries,
# each data line in the file being a single list entry.
# Field names and values should not contain extra whitespace, like spaces or newline characters.
# You can use the Python string method strip() to remove the extra whitespace.
# You have to parse only the first 10 data lines in this exercise,
# so the returned list should have 10 entries!

我的解法

In [79]:
import os
import csv

DATADIR = ""
DATAFILE = "beatles-diskography.csv"


def parse_file(datafile):
    data = []
    with open(datafile, "r") as f:
        reader = csv.DictReader(f)
        for row in reader:
            if len(data) < 10:
                data.append(row)
    return data

讲师解法

In [80]:
import os

DATADIR = ""
DATAFILE = "beatles-diskography.csv"


def parse_file(datafile):
    data = []
    with open(datafile, "r") as f:
        header = f.readline().split(",")
        counter = 0
        for line in f:
            if counter == 10:
                break
            fields = line.split(",")
            entry = {}
            
            for i, value in enumerate(fields):
                entry[header[i].strip()] = value.strip()
            data.append(entry)
            counter += 1
    return data

+ 用 strip() 函数清理字符串中的空格
+ 用枚举 enumerate 取值

In [81]:
def test():
    # a simple test of your implemetation
    datafile = os.path.join(DATADIR, DATAFILE)
    d = parse_file(datafile)
    firstline = {'Title': 'Please Please Me', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 'Released': '22 March 1963', 'US Chart Position': '-', 'RIAA Certification': 'Platinum', 'BPI Certification': 'Gold'}
    tenthline = {'Title': '', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 'Released': '10 July 1964', 'US Chart Position': '-', 'RIAA Certification': '', 'BPI Certification': 'Gold'}

    assert d[0] == firstline
    assert d[9] == tenthline
    
test()

## 练习: 读取 Excel 文件

In [82]:
#!/usr/bin/env python
"""
Your task is as follows:
- read the provided Excel file
- find and return the min, max and average values for the COAST region
- find and return the time value for the min and max entries
- the time values should be returned as Python tuples

Please see the test function for the expected return format
"""

import xlrd
from zipfile import ZipFile
datafile = "2013_ERCOT_Hourly_Load_Data.xls"

我的解法

In [83]:
def open_zip(datafile):
    with ZipFile('{0}.zip'.format(datafile), 'r') as myzip:
        myzip.extractall()


def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    ### example on how you can get the data
    #sheet_data = [[sheet.cell_value(r, col) for col in range(sheet.ncols)] for r in range(sheet.nrows)]

    ### other useful methods:
    # print "\nROWS, COLUMNS, and CELLS:"
    # print "Number of rows in the sheet:", 
    # print sheet.nrows
    # print "Type of data in cell (row 3, col 2):", 
    # print sheet.cell_type(3, 2)
    # print "Value in cell (row 3, col 2):", 
    # print sheet.cell_value(3, 2)
    # print "Get a slice of values in column 3, from rows 1-3:"
    # print sheet.col_values(3, start_rowx=1, end_rowx=4)

    # print "\nDATES:"
    # print "Type of data in cell (row 1, col 0):", 
    # print sheet.cell_type(1, 0)
    # exceltime = sheet.cell_value(1, 0)
    # print "Time in Excel format:",
    # print exceltime
    # print "Convert time to a Python datetime tuple, from the Excel float:",
    # print xlrd.xldate_as_tuple(exceltime, 0)
    
    time_values = sheet.col_values(1, start_rowx=1)
    maxvalue = max(time_values)
    minvalue = min(time_values)
    average = sum(time_values) / float(len(time_values))
    
    maxvalue_row = time_values.index(maxvalue) + 1
    maxtime = xlrd.xldate_as_tuple(sheet.cell_value(maxvalue_row , 0), 0)
    
    minvalue_row = time_values.index(minvalue) + 1
    mintime = xlrd.xldate_as_tuple(sheet.cell_value(minvalue_row , 0), 0)
    
    # print maxvalue
    # print maxtime
    # print minvalue
    # print mintime
    # print average
    
    data = {
            'maxtime': maxtime,
            'maxvalue': maxvalue,
            'mintime': mintime,
            'minvalue': minvalue,
            'avgcoast': average
    }
    return data

In [84]:
def test():
    open_zip(datafile)
    data = parse_file(datafile)

    assert data['maxtime'] == (2013, 8, 13, 17, 0, 0)
    assert round(data['maxvalue'], 10) == round(18779.02551, 10)


test()

## 练习: JSON Playground

In [85]:
# To experiment with this code freely you will have to run this code locally.
# Take a look at the main() function for an example of how to use the code.
# We have provided example json output in the other code editor tabs for you to
# look at, but you will not be able to run any queries through our UI.
import json
import requests


BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"

# query parameters are given to the requests.get function as a dictionary; this
# variable contains some starter parameters.
query_type = {  "simple": {},
                "atr": {"inc": "aliases+tags+ratings"},
                "aliases": {"inc": "aliases"},
                "releases": {"inc": "releases"}}


def query_site(url, params, uid="", fmt="json"):
    # This is the main function for making queries to the musicbrainz API.
    # A json document should be returned by the query.
    params["fmt"] = fmt
    r = requests.get(url + uid, params=params)
    print "requesting", r.url

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def query_by_name(url, params, name):
    # This adds an artist name to the query parameters before making
    # an API call to the function above.
    params["query"] = "artist:" + name
    return query_site(url, params)


def pretty_print(data, indent=4):
    # After we get our output, we can format it to be more readable
    # by using this function.
    if type(data) == dict:
        print json.dumps(data, indent=indent, sort_keys=True)
    else:
        print data


def main():
    '''
    Modify the function calls and indexing below to answer the questions on
    the next quiz. HINT: Note how the output we get from the site is a
    multi-level JSON document, so try making print statements to step through
    the structure one level at a time or copy the output to a separate output
    file.
    '''
    results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
    pretty_print(results)

    artist_id = results["artists"][1]["id"]
    print "\nARTIST:"
    pretty_print(results["artists"][1])

    artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
    releases = artist_data["releases"]
    print "\nONE RELEASE:"
    pretty_print(releases[0], indent=2)
    release_titles = [r["title"] for r in releases]

    print "\nALL TITLES:"
    for t in release_titles:
        print t


# if __name__ == '__main__':
#     main()

- 有多少个名为First Aid Kit的乐队

In [86]:
results = query_by_name(ARTIST_URL, query_type["simple"], "First Aid Kit")
pretty_print(results["count"])

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AFirst+Aid+Kit&fmt=json
440


- 皇后乐队的发源地是哪里？

In [87]:
results = query_by_name(ARTIST_URL, query_type["simple"], "Queen")
i = 0
loop = results["count"]
for i in range(loop):
    try:
        pretty_print(results["artists"][i]["begin-area"]["name"])
    except:
        pass

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AQueen&fmt=json
London
Brooklyn


- 披头士的西班牙别名

In [88]:
results = query_by_name(ARTIST_URL, query_type["simple"], "The Beatles")
i = 0
loop = results["count"]
for i in range(loop):
    try:
        pretty_print(results["artists"][i]["aliases"])
    except:
        pass

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AThe+Beatles&fmt=json
[{u'name': u'\ub354 \ube44\ud2c0\uc988', u'locale': u'ko', u'end-date': None, u'primary': True, u'sort-name': u'\ub354 \ube44\ud2c0\uc988', u'type': None, u'begin-date': None}, {u'name': u'\u30b6\u30fb\u30d3\u30fc\u30c8\u30eb\u30ba', u'locale': u'ja', u'end-date': None, u'primary': True, u'sort-name': u'\u30d3\u30fc\u30c8\u30eb\u30ba (\u30b6)', u'type': None, u'begin-date': None}, {u'name': u'B', u'locale': None, u'end-date': None, u'primary': None, u'sort-name': u'B', u'type': u'Search hint', u'begin-date': None}, {u'name': u'Be', u'locale': None, u'end-date': None, u'primary': None, u'sort-name': u'Be', u'type': u'Search hint', u'begin-date': None}, {u'name': u'Beat', u'locale': None, u'end-date': None, u'primary': None, u'sort-name': u'Beat', u'type': u'Search hint', u'begin-date': None}, {u'name': u'Beatles', u'locale': u'en', u'end-date': None, u'primary': None, u'sort-name': u'Beatles', u'type': u'

- “涅槃”的词义消歧方法

In [103]:
results = query_by_name(ARTIST_URL, query_type["simple"], "Kurt Cobain")
i = 0
loop = results["count"]
for i in range(loop):
    try:
        pretty_print(results["artists"][i]["disambiguation"])
    except:
        pass

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AKurt+Cobain&fmt=json
German post-punk/indie/art-rock band
Japanese band
UK Hardcore
Probably Houston Rapper
US rapper Miguel Blackmer-Hart


- One Direction 乐队是什么时候成立的？

In [102]:
results = query_by_name(ARTIST_URL, query_type["simple"], "One Direction")
i = 0
loop = results["count"]
for i in range(loop):
    try:
        pretty_print(results["artists"][i]["life-span"])
    except:
        pass

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AOne+Direction&fmt=json
{
    "begin": "2010-07", 
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "begin": "2009-08", 
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "begin": "1989", 
    "end": "2008", 
    "ended": true
}
{
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "begin": "2010", 
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
{
    "ended": null
}
