## Creating DataFrames from Scratch

In [2]:
import numpy as np 
import pandas as pd 

In [13]:
fname = ["Paul", "John", "Richard", "George"]
lname = ["McCartney", "Lennon", "Starkey", "Harrison"]
birth = [1942, 1940, 1940, 1943]

In [15]:
people = {'first': fname, "last": lname, "birth": birth}

In [17]:
beatles = pd.DataFrame(people)

In [18]:
beatles 

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [19]:
pd.DataFrame([
    

    {
        "first": "Paul",
        "last" : "McCartney",
        "birth": 1942,
    },
    
    {
        "first": "John",
        "last" : "Lennon",
        "birth": 1940,
    },
    
    {
        "first": "Richard",
        "last" : "Starkey",
        "birth": 1940,
    },
    
    {
        "first": "George",
        "last" : "Harrison",
        "birth": 1943,
    },
    
]
    
    
)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [20]:
pd.DataFrame([
    

    {
        "first": "Paul",
        "last" : "McCartney",
        "birth": 1942,
    },
    
    {
        "first": "John",
        "last" : "Lennon",
        "birth": 1940,
    },
    
    {
        "first": "Richard",
        "last" : "Starkey",
        "birth": 1940,
    },
    
    {
        "first": "George",
        "last" : "Harrison",
        "birth": 1943,
    },
    
], 
    columns=["last", "first", "birth"]
    
    
)

Unnamed: 0,last,first,birth
0,McCartney,Paul,1942
1,Lennon,John,1940
2,Starkey,Richard,1940
3,Harrison,George,1943


## Writing CSV

In [26]:
# Write the DataFrame to a CSV file:
beatles

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [23]:
from io import StringIO

In [24]:
fout = StringIO()
beatles.to_csv(fout)  # use a filename instead of fout

In [25]:
# look at the file contents:
print(fout.getvalue())

,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943



In [31]:
_ = fout.seek(0)
pd.read_csv(fout)

Unnamed: 0.1,Unnamed: 0,first,last,birth
0,0,Paul,McCartney,1942
1,1,John,Lennon,1940
2,2,Richard,Starkey,1940
3,3,George,Harrison,1943


In [32]:
# The read_csv function has an index_col parameter 
# that you can use to specify the location of the index:
_ = fout.seek(0)
pd.read_csv(fout, index_col=0)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [33]:
_ = fout.seek(1)
pd.read_csv(fout)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [34]:
# Alternatively, if we didn't want to include the 
# index when writing the CSV file, we can set the
# index parameter to False:

fout = StringIO()
beatles.to_csv(fout, index=False)

In [35]:
print(fout.getvalue())

first,last,birth
Paul,McCartney,1942
John,Lennon,1940
Richard,Starkey,1940
George,Harrison,1943



## Reading Large CSV files

In [41]:
pd.set_option("max_columns", 8, "max_rows", 10)

In [50]:
# Determine how much memory the whole file will take
# up. We will use the nrows parameter of read_csv
# to limit how much data we load to a small sample
diamonds = pd.read_csv('C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/diamonds.csv', nrows=1000 )

In [51]:
diamonds


Unnamed: 0,carat,cut,color,clarity,...,price,x,y,z
0,0.23,Ideal,E,SI2,...,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,...,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,...,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,...,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,...,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
995,0.54,Ideal,D,VVS2,...,2897,5.30,5.34,3.26
996,0.72,Ideal,E,SI1,...,2897,5.69,5.74,3.57
997,0.72,Good,F,VS1,...,2897,5.82,5.89,3.48
998,0.74,Premium,D,VS2,...,2897,5.81,5.77,3.58


In [52]:
# Use the .info method to see how much memory the 
# sample of data uses:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float64
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float64
 5   table    1000 non-null   float64
 6   price    1000 non-null   int64  
 7   x        1000 non-null   float64
 8   y        1000 non-null   float64
 9   z        1000 non-null   float64
dtypes: float64(6), int64(1), object(3)
memory usage: 78.2+ KB


In [48]:
# Use the dtype parameter to read_csv to tell it to
# use the correct (or smaller) numeric types:
diamonds2 = pd.read_csv('C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/diamonds.csv', nrows=1000, dtype={
    "carat": np.float32,
    "depth": np.float32,
    "table": np.float32,
    "x": np.float32,
    "y": np.float32,
    "z": np.float32,
    "price": np.int16,
} )

In [49]:
diamonds2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float32
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float32
 5   table    1000 non-null   float32
 6   price    1000 non-null   int16  
 7   x        1000 non-null   float32
 8   y        1000 non-null   float32
 9   z        1000 non-null   float32
dtypes: float32(6), int16(1), object(3)
memory usage: 49.0+ KB


In [53]:
# Make sure that summary statistics are similar
# with our new dataset to the original
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.68928,61.7228,57.7347,2476.54,5.60594,5.59918,3.45753
std,0.195291,1.758879,2.467946,839.57562,0.625173,0.611974,0.389819
min,0.2,53.0,52.0,326.0,3.79,3.75,2.27
25%,0.7,60.9,56.0,2777.0,5.64,5.63,3.45
50%,0.71,61.8,57.0,2818.0,5.77,5.76,3.55
75%,0.79,62.6,59.0,2856.0,5.92,5.91,3.64
max,1.27,69.5,70.0,2898.0,7.12,7.05,4.33


In [54]:
diamonds2.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.689281,61.722824,57.734699,2476.54,5.605941,5.59918,3.457533
std,0.195291,1.758878,2.467944,839.57562,0.625173,0.611972,0.389819
min,0.2,53.0,52.0,326.0,3.79,3.75,2.27
25%,0.7,60.900002,56.0,2777.0,5.64,5.63,3.45
50%,0.71,61.799999,57.0,2818.0,5.77,5.76,3.55
75%,0.79,62.599998,59.0,2856.0,5.92,5.91,3.64
max,1.27,69.5,70.0,2898.0,7.12,7.05,4.33


In [55]:
# Use the dtype parameter to use change object types
# to categoricals. First, inspect the .value_counts
# method of the object columns. If they are low 
# cardinality, you can convert them to categorical
# columns to save even more memory
diamonds2.cut.value_counts()

Ideal        333
Premium      290
Very Good    226
Good          89
Fair          62
Name: cut, dtype: int64

In [56]:
diamonds2.color.value_counts()

E    240
F    226
G    139
D    129
H    125
I     95
J     46
Name: color, dtype: int64

In [57]:
diamonds2.clarity.value_counts()

SI1     306
VS2     218
VS1     159
SI2     154
VVS2     62
VVS1     58
I1       29
IF       14
Name: clarity, dtype: int64

In [58]:
# Because these are of low cardinality, we can
# convert them to categoricals and use
# around 37% of the original size
diamonds3 = pd.read_csv('C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/diamonds.csv', nrows=1000, dtype={
    "carat": np.float32,
    "depth": np.float32,
    "table": np.float32,
    "x": np.float32,
    "y": np.float32,
    "z": np.float32,
    "price": np.int16,
    "cut": "category",
    "color": "category",
    "clarity": "category",
},
) 

In [59]:
diamonds3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   float32 
 5   table    1000 non-null   float32 
 6   price    1000 non-null   int16   
 7   x        1000 non-null   float32 
 8   y        1000 non-null   float32 
 9   z        1000 non-null   float32 
dtypes: category(3), float32(6), int16(1)
memory usage: 29.4 KB


In [60]:
# If there are columns that we know we can ignore,
# we can use the usecols parameter to specify the 
# columns we want to load. Here, we will ignore 
# columns x, y, and z:
cols = [
    
    "carat",
    "cut",
    "color",
    "clarity",
    "depth",
    "table",
    "price",
    
]

In [61]:
diamonds4 = pd.read_csv('C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/diamonds.csv', nrows=1000, dtype={
    "carat": np.float32,
    "depth": np.float32,
    "table": np.float32,
    "x": np.float32,
    "y": np.float32,
    "z": np.float32,
    "price": np.int16,
    "cut": "category",
    "color": "category",
    "clarity": "category"},
                       usecols=cols,
                       )

In [62]:
diamonds4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   float32 
 5   table    1000 non-null   float32 
 6   price    1000 non-null   int16   
dtypes: category(3), float32(3), int16(1)
memory usage: 17.6 KB


In [63]:
# If the preceding steps are not sufficient to
# create a small enough DataFrame, you might still 
# be in luck. If you can process chunks of the data
# at a time and do not need all of it in memory,
# you can use the chunksize parameter

In [64]:
cols = [
    
    "carat",
    "cut",
    "color",
    "clarity",
    "depth",
    "table",
    "price",
    
]

In [65]:
diamonds_iter =  pd.read_csv('C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/diamonds.csv', nrows=1000, dtype={
    "carat": np.float32,
    "depth": np.float32,
    "table": np.float32,
    "x": np.float32,
    "y": np.float32,
    "z": np.float32,
    "price": np.int16,
    "cut": "category",
    "color": "category",
    "clarity": "category"},
                       usecols=cols,
                             chunksize=200,
                       )

In [66]:
def process(df):
    return f"processed {df.size} items"

for chunk in diamonds_iter:
    process(chunk)

In [71]:
# If we use int8 for the price, we will lose
# information. You can use the NumPy iinfo function
# to list limits for NumPy integer types:

np.iinfo(np.int8)

iinfo(min=-128, max=127, dtype=int8)

In [72]:
# you can use the finfo function for information 
# about floating-point numbers:
np.finfo(np.float16)

finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

In [73]:
# You can also ask a DataFrame or Series how many
# bytes it is using with the .memory_usage method. 
# Note that this also includes the memory
# requirements of the index. Also, you need to pass
# deep=True to get the usage of Series with 
# object types:
diamonds.price.memory_usage()

8128

In [74]:
diamonds.price.memory_usage(index=False)

8000

In [75]:
diamonds.cut.memory_usage()

8128

In [76]:
diamonds.cut.memory_usage(deep=True)

63461

In [78]:
# Once you have your data in a format you like, you
# can save it in a binary format that tracks
# types, such as the Feather format 
# (pandas leverages the pyarrow library to do this). This
# format is meant to enable in-memory transfer of 
# structured data between languages and
# optimized so that data can be used as is without 
# internal conversion. Reading from this format is
# much quicker and easy once you have the types
# defined:
diamonds4.to_feather("d.arr")

In [79]:
diamonds5 = pd.read_feather("d.arr")

In [81]:
# Another binary option is the Parquet format.
# Whereas Feather optimizes the binary data for
# the in-memory structure, Parquet optimizes for 
# the on-disk format. Parquet is used by many
# big data products. The pandas library has
# support for Parquet as well.

diamonds4.to_parquet("C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/d.pqt")

In [82]:
#Right now there is some conversion required for
# pandas to load data from both Parquet and
# Feather. But both are quicker than CSV and
# persist types.

## Use Excel files

In [84]:
# Create an Excel file using the .to_excel method.
# You can write either xls files or xlsx files:

beatles.to_excel("beat.xls")

  beatles.to_excel("beat.xls")


In [85]:
beatles.to_excel("beat.xlsx")

In [87]:
# Read the Excel with the read_excel function:
beat2 = pd.read_excel("C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Jupyter Notebook codes/beat.xls")

In [88]:
beat2

Unnamed: 0.1,Unnamed: 0,first,last,birth
0,0,Paul,McCartney,1942
1,1,John,Lennon,1940
2,2,Richard,Starkey,1940
3,3,George,Harrison,1943


In [89]:
# Because this file had an index column included,
# you can specify that with the index_col parameter:
beat2 = pd.read_excel("C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Jupyter Notebook codes/beat.xls", index_col=0)

In [90]:
beat2

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [91]:
# inspect data types of the file to check that
# Excel preserved the types:
beat2.dtypes

first    object
last     object
birth     int64
dtype: object

In [92]:
# We can use pandas to write to a sheet of a
# spreadsheet. You can pass a sheet_name
# parameter to the .to_excel method to tell it the name of the sheet to create:
xl_writer = pd.ExcelWriter("beat.xlsx")

In [93]:
beatles.to_excel(xl_writer, sheet_name="All")

In [94]:
beatles[beatles.birth < 1941].to_excel(
    xl_writer, sheet_name="1940"
)

In [95]:
xl_writer.save()
# This file will have two sheets, one labeled All
# that has the whole DataFrame, and another
# labeled 1940 that is filtered to births before 
# 1941.

## Working with ZIP files 

In [7]:
pd.set_option("max_columns", 5, "max_rows", 10)

In [8]:
# If the CSV file is the only file in the ZIP file,
# you can just call the read_csv function on it:

auto = pd.read_csv("C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/vehicles.csv.zip")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [9]:
auto

Unnamed: 0,barrels08,barrelsA08,...,phevHwy,phevComb
0,14.167143,0.0,...,0,0
1,27.046364,0.0,...,0,0
2,11.018889,0.0,...,0,0
3,27.046364,0.0,...,0,0
4,15.658421,0.0,...,0,0
...,...,...,...,...,...
44755,13.523182,0.0,...,0,0
44756,12.935217,0.0,...,0,0
44757,14.167143,0.0,...,0,0
44758,14.167143,0.0,...,0,0


In [11]:
auto.modifiedOn.dtype

dtype('O')

In [12]:
auto.modifiedOn

0        Tue Jan 01 00:00:00 EST 2013
1        Tue Jan 01 00:00:00 EST 2013
2        Tue Jan 01 00:00:00 EST 2013
3        Tue Jan 01 00:00:00 EST 2013
4        Tue Jan 01 00:00:00 EST 2013
                     ...             
44755    Tue Jan 01 00:00:00 EST 2013
44756    Tue Jan 01 00:00:00 EST 2013
44757    Tue Jan 01 00:00:00 EST 2013
44758    Tue Jan 01 00:00:00 EST 2013
44759    Tue Jan 01 00:00:00 EST 2013
Name: modifiedOn, Length: 44760, dtype: object

In [13]:
pd.to_datetime(auto.modifiedOn)



0       2013-01-01
1       2013-01-01
2       2013-01-01
3       2013-01-01
4       2013-01-01
           ...    
44755   2013-01-01
44756   2013-01-01
44757   2013-01-01
44758   2013-01-01
44759   2013-01-01
Name: modifiedOn, Length: 44760, dtype: datetime64[ns]

In [14]:
autos = pd.read_csv("C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/vehicles.csv.zip", parse_dates=["modifiedOn"])

  exec(code_obj, self.user_global_ns, self.user_ns)


In [15]:
autos.modifiedOn

0       2013-01-01
1       2013-01-01
2       2013-01-01
3       2013-01-01
4       2013-01-01
           ...    
44755   2013-01-01
44756   2013-01-01
44757   2013-01-01
44758   2013-01-01
44759   2013-01-01
Name: modifiedOn, Length: 44760, dtype: datetime64[ns]

If the ZIP file has many files it in, reading a CSV file from it is a little more involved.
The read_csv function does not have the ability to specify a file inside a ZIP file.
Instead, we will use the zipfile module from the Python standard library.

In [16]:
import zipfile

with zipfile.ZipFile(
    "C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/kaggle_survey_2020_responses.csv.zip"
) as z:
    print("\n".join(z.namelist()))
    kag = pd.read_csv(
        z.open("multipleChoiceResponses.csv")
    )
    kag_questions = kag.iloc[0]
    survey = kag.ilococ[1]

multipleChoiceResponses.csv
freeFormResponses.csv
SurveySchema.csv

kaggle_survey_2020_responses.csv


KeyError: "There is no item named 'multipleChoiceResponses.csv' in the archive"

## Working with Databases

In [17]:
import sqlite3

In [18]:
con = sqlite3.connect("C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/beat.db")

In [19]:
with con:
    cur = con.cursor()
    cur.execute("""DROP TABLE Band""")
    cur.execute(
        """CREATE TABLE Band(id INTEGER PRIMARY KEY,
        fname TEXT, lname TEXT, birthyear INT)"""
    )
    cur.execute(
        """INSERT INTO Band VALUES(
        0, 'Paul', 'McCartney', 1942
        )"""
    )
    cur.execute(
        """INSERT INTO Band VALUES(
        1, 'John', 'Lennon', 1940
        )"""
    )
    
    _ = con.commit()

In [26]:
# Read the table from the database into a DataFrame.
# Note that if we are reading a table, we need to
# use a SQLAlchemy connection. SQLAlchemy is a
# library that abstracts databases for us:
import sqlalchemy as sa

engine = sa.create_engine(
    "sqlite:///C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/beat.db", echo=True
)

In [27]:
sa_connection = engine.connect()

In [28]:
beat = pd.read_sql(
    "Band", sa_connection, index_col='id'
)

2022-03-06 08:07:22,516 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("Band")
2022-03-06 08:07:22,519 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-06 08:07:22,522 INFO sqlalchemy.engine.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2022-03-06 08:07:22,523 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-06 08:07:22,527 INFO sqlalchemy.engine.Engine PRAGMA main.table_xinfo("Band")
2022-03-06 08:07:22,528 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-06 08:07:22,533 INFO sqlalchemy.engine.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = ? AND type = 'table'
2022-03-06 08:07:22,533 INFO sqlalchemy.engine.Engine [raw sql] ('Band',)
2022-03-06 08:07:22,537 INFO sqlalchemy.engine.Engine PRAGMA main.foreign_key_list("Band")
2022-03-06 08:07:22,538 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-03-06 08:07:22,540 INFO sqlalchemy.engine.Engine PRAGMA temp.foreign_key_list("Band")
2022-03

In [29]:
beat

Unnamed: 0_level_0,fname,lname,birthyear
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Paul,McCartney,1942
1,John,Lennon,1940


## Reading JSON

In [39]:
people = {"first": ["Paul", "John", "Richard", "George"], "last": ["McCartney",
"Lennon", "Starkey", "Harrison"], "birth": [1942, 1940, 1940, 1943]}

In [30]:
import json

In [40]:
encoded = json.dumps(people)
encoded

'{"first": ["Paul", "John", "Richard", "George"], "last": ["McCartney", "Lennon", "Starkey", "Harrison"], "birth": [1942, 1940, 1940, 1943]}'

In [41]:
json.loads(encoded)

{'first': ['Paul', 'John', 'Richard', 'George'],
 'last': ['McCartney', 'Lennon', 'Starkey', 'Harrison'],
 'birth': [1942, 1940, 1940, 1943]}

In [42]:
beatles = pd.read_json(encoded)

In [43]:
beatles 

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [44]:
# Following are examples of these styles. The columns
# style was the example shown previously:

records = beatles.to_json(orient="records")

In [45]:
records

'[{"first":"Paul","last":"McCartney","birth":1942},{"first":"John","last":"Lennon","birth":1940},{"first":"Richard","last":"Starkey","birth":1940},{"first":"George","last":"Harrison","birth":1943}]'

In [48]:
pd.read_json(records, orient="records")

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [49]:
split = beatles.to_json(orient="split")

In [50]:
split

'{"columns":["first","last","birth"],"index":[0,1,2,3],"data":[["Paul","McCartney",1942],["John","Lennon",1940],["Richard","Starkey",1940],["George","Harrison",1943]]}'

In [52]:
pd.read_json(split, orient="split")

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [53]:
index = beatles.to_json(orient="index")

In [54]:
index

'{"0":{"first":"Paul","last":"McCartney","birth":1942},"1":{"first":"John","last":"Lennon","birth":1940},"2":{"first":"Richard","last":"Starkey","birth":1940},"3":{"first":"George","last":"Harrison","birth":1943}}'

In [56]:
pd.read_json(index, orient="index")

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [57]:
values = beatles.to_json(orient="values")

In [58]:
values

'[["Paul","McCartney",1942],["John","Lennon",1940],["Richard","Starkey",1940],["George","Harrison",1943]]'

In [59]:
values

'[["Paul","McCartney",1942],["John","Lennon",1940],["Richard","Starkey",1940],["George","Harrison",1943]]'

In [60]:
pd.read_json(values, orient="values")

Unnamed: 0,0,1,2
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [63]:
pd.read_json(values, orient="values").rename(columns=dict(enumerate(["first", "last", "birth"])))

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [64]:
table = beatles.to_json(orient="table")

In [65]:
table

'{"schema":{"fields":[{"name":"index","type":"integer"},{"name":"first","type":"string"},{"name":"last","type":"string"},{"name":"birth","type":"integer"}],"primaryKey":["index"],"pandas_version":"0.20.0"},"data":[{"index":0,"first":"Paul","last":"McCartney","birth":1942},{"index":1,"first":"John","last":"Lennon","birth":1940},{"index":2,"first":"Richard","last":"Starkey","birth":1940},{"index":3,"first":"George","last":"Harrison","birth":1943}]}'

In [66]:
# If you are working on a web service and need to add
# additional data to the JSON, just use the .to_dict
# method to generate dictionaries. You can add your
# new data to the dictionary, and then convert that
# dictionary to JSON:

output = beat.to_dict()

In [67]:
output

{'fname': {0: 'Paul', 1: 'John'},
 'lname': {0: 'McCartney', 1: 'Lennon'},
 'birthyear': {0: 1942, 1: 1940}}

In [69]:
output["version"] = "0.4.1"

In [70]:
json.dumps(output)

'{"fname": {"0": "Paul", "1": "John"}, "lname": {"0": "McCartney", "1": "Lennon"}, "birthyear": {"0": 1942, "1": 1940}, "version": "0.4.1"}'

## Reading HTML tables

In [73]:
# You can use pandas to read HTML tables from websites. 
# This makes it easy to ingest tables such as those 
# found on Wikipedia or other websites.
# Use the read_html function to load all of the tables
# from https://en.wikipedia.org/wiki/The_Beatles_discography:

url = "https://en.wikipedia.org/wiki/The_Beatles_discography"

dfs = pd.read_html(url)

In [74]:
dfs[0]

Unnamed: 0,The Beatles discography,The Beatles discography.1
0,The Beatles in 1965,The Beatles in 1965
1,Studio albums,"13 (core catalogue), 21 (worldwide)"
2,Live albums,6
3,Compilation albums,54
4,Video albums,22
5,Music videos,68
6,EPs,36
7,Singles,63
8,Mash-ups,2
9,Box sets,17


In [75]:
len(dfs)

58

In [76]:
url = "https://en.wikipedia.org/wiki/The_Beatles_discography"

dfs = pd.read_html(
    url, match="List of studio albums", na_values="-"
)

In [77]:
len(dfs)

1

In [83]:
dfs[0].columns

MultiIndex([(               'Title',          'Title'),
            (       'Album details',  'Album details'),
            ('Peak chart positions',       'UK[6][7]'),
            ('Peak chart positions',         'AUS[8]'),
            ('Peak chart positions',         'CAN[9]'),
            ('Peak chart positions',        'FRA[10]'),
            ('Peak chart positions',        'GER[11]'),
            ('Peak chart positions',        'NOR[12]'),
            ('Peak chart positions',     'US[13][14]'),
            (      'Certifications', 'Certifications')],
           )

In [85]:
url = "https://en.wikipedia.org/wiki/The_Beatles_discography"

dfs = pd.read_html(
    url, match="List of studio albums", na_values="-", header=[0, 1],
)

In [86]:
len(dfs)

1

In [88]:
dfs[0]

Unnamed: 0_level_0,Title,Album details,...,Peak chart positions,Certifications
Unnamed: 0_level_1,Title,Album details,...,US[13][14],Certifications
0,Please Please Me,Released: 22 March 1963 Label: Parlophone (UK),...,—,BPI: Platinum[15] ARIA: Gold[16] MC: Gold[17] ...
1,With the Beatles[B],Released: 22 November 1963 Label: Parlophone (...,...,—,BPI: Gold[15] ARIA: Gold[16] BVMI: Gold[19] MC...
2,Introducing... The Beatles,Released: 10 January 1964 Label: Vee-Jay (US),...,2,RIAA: Platinum[18]
3,Meet the Beatles!,Released: 20 January 1964 Label: Capitol (US),...,1,MC: Platinum[17] RIAA: 5× Platinum[18]
4,Twist and Shout,Released: 3 February 1964 Label: Capitol (CAN),...,—,MC: 3× Platinum[17]
...,...,...,...,...,...
22,"The Beatles (""The White Album"")",Released: 22 November 1968 Label: Apple,...,1,BPI: 2× Platinum[15] ARIA: 2× Platinum[16] MC:...
23,Yellow Submarine[D],"Released: 13 January 1969 Label: Apple (UK), C...",...,2,BPI: Gold[15] MC: Gold[17] RIAA: Platinum[18]
24,Abbey Road,Released: 26 September 1969 Label: Apple,...,1,BPI: 8× Platinum[15] ARIA: 3× Platinum[16] BVM...
25,Let It Be,Released: 8 May 1970 Label: Apple,...,1,BPI: Platinum[15] ARIA: Platinum[16] MC: 3× Pl...


In [90]:
dfs[0].columns

MultiIndex([(               'Title',          'Title'),
            (       'Album details',  'Album details'),
            ('Peak chart positions',       'UK[6][7]'),
            ('Peak chart positions',         'AUS[8]'),
            ('Peak chart positions',         'CAN[9]'),
            ('Peak chart positions',        'FRA[10]'),
            ('Peak chart positions',        'GER[11]'),
            ('Peak chart positions',        'NOR[12]'),
            ('Peak chart positions',     'US[13][14]'),
            (      'Certifications', 'Certifications')],
           )

In [91]:
# This is not something that is easy to fix 
# programmatically. In this case, the easiest
# solution is to update the columns manually

df = dfs[0]

In [92]:
df.columns = [
    "Title",
     "Release",
     "UK",
     "AUS",
     "CAN",
     "FRA",
     "GER",
     "NOR",
     "US",
     "Certifications",
]

In [93]:
df

Unnamed: 0,Title,Release,...,US,Certifications
0,Please Please Me,Released: 22 March 1963 Label: Parlophone (UK),...,—,BPI: Platinum[15] ARIA: Gold[16] MC: Gold[17] ...
1,With the Beatles[B],Released: 22 November 1963 Label: Parlophone (...,...,—,BPI: Gold[15] ARIA: Gold[16] BVMI: Gold[19] MC...
2,Introducing... The Beatles,Released: 10 January 1964 Label: Vee-Jay (US),...,2,RIAA: Platinum[18]
3,Meet the Beatles!,Released: 20 January 1964 Label: Capitol (US),...,1,MC: Platinum[17] RIAA: 5× Platinum[18]
4,Twist and Shout,Released: 3 February 1964 Label: Capitol (CAN),...,—,MC: 3× Platinum[17]
...,...,...,...,...,...
22,"The Beatles (""The White Album"")",Released: 22 November 1968 Label: Apple,...,1,BPI: 2× Platinum[15] ARIA: 2× Platinum[16] MC:...
23,Yellow Submarine[D],"Released: 13 January 1969 Label: Apple (UK), C...",...,2,BPI: Gold[15] MC: Gold[17] RIAA: Platinum[18]
24,Abbey Road,Released: 26 September 1969 Label: Apple,...,1,BPI: 8× Platinum[15] ARIA: 3× Platinum[16] BVM...
25,Let It Be,Released: 8 May 1970 Label: Apple,...,1,BPI: Platinum[15] ARIA: Platinum[16] MC: 3× Pl...


In [98]:
# We will skip these rows. They confuse pandas, and
# the data pandas puts in these rows is not correct.
# We will split the release column into two columns,
# release_date and label:
res = (
    df.pipe(
        lambda df_: df_[
            ~df_.Title.str.startswith("Released")
        ]
    )
    .assign(
        release_date=lambda df_: pd.to_datetime(
          df_.Release.str.extract(
            r"Released: (.*) Label"
          )[0].str.replace(r"\[E\]", "")
      ),
      label=lambda df_: df_.Release.str.extract(
        r"Label: (.*)"
      ),
    )
    .loc[
       :,
        [
           "Title",
           "UK",
           "AUS",
           "CAN",
           "FRA",
           "GER",
           "NOR",
           "US",
           "release_date",
           "label",
         ],
     ]
)

  df_.Release.str.extract(


In [99]:
res

Unnamed: 0,Title,UK,...,release_date,label
0,Please Please Me,1,...,1963-03-22,Parlophone (UK)
1,With the Beatles[B],1,...,1963-11-22,"Parlophone (UK), Capitol (CAN), Odeon (FRA)"
2,Introducing... The Beatles,—,...,1964-01-10,Vee-Jay (US)
3,Meet the Beatles!,—,...,1964-01-20,Capitol (US)
4,Twist and Shout,—,...,1964-02-03,Capitol (CAN)
...,...,...,...,...,...
22,"The Beatles (""The White Album"")",1,...,1968-11-22,Apple
23,Yellow Submarine[D],3,...,1969-01-13,"Apple (UK), Capitol (US)"
24,Abbey Road,1,...,1969-09-26,Apple
25,Let It Be,1,...,1970-05-08,Apple
