### Pandas
* data wrangling
* groupby
* extension of numpys to add a dataframe capability
* similar to R dataframes
* integrates with numpy
* https://pandas.pydata.org/

In [1]:
import pandas as pd
import numpy as np

## Series
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html
* One-dimensional ndarray with axis labels (including time series)

In [3]:
vals = [1,2,3,4,5]
idxs = ["a","b","c","d","e"]

my_series = pd.Series(vals, idxs)

In [7]:
print(my_series)

print(my_series[["a", "b"]])

a    1
b    2
c    3
d    4
e    5
dtype: int64
a    1
b    2
dtype: int64


### Dataframe
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
* core object in pandas
* 2 dimensional
* index and columns

In [8]:
array1 = np.array([
    [5,6,2,5,3,2],
    [4,6,1,3,4,5],
    [2,3,2,2,1,3]
])
print(array1.shape)

(3, 6)


In [11]:
array1 = np.array([
    [5,6,2,5,3,2],
    [4,6,1,3,4,5],
    [2,3,2,2,1,3]
])
df = pd.DataFrame(array1)
df.shape

(3, 6)

In [10]:
dir(df)

['T',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 

In [14]:
df.head(3)

Unnamed: 0,0,1,2,3,4,5
0,5,6,2,5,3,2
1,4,6,1,3,4,5
2,2,3,2,2,1,3


In [15]:
df

Unnamed: 0,0,1,2,3,4,5
0,5,6,2,5,3,2
1,4,6,1,3,4,5
2,2,3,2,2,1,3


In [16]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [17]:
df.columns

RangeIndex(start=0, stop=6, step=1)

#### change headers

In [18]:
df.columns = ["R", "A", "F", "4", "Echo", "Lima"]

In [19]:
df

Unnamed: 0,R,A,F,4,Echo,Lima
0,5,6,2,5,3,2
1,4,6,1,3,4,5
2,2,3,2,2,1,3


In [27]:
for k in range(len(df.columns)):
    print(df.columns[k])

R
A
F
4
Echo
Lima


In [28]:
df.dtypes

R       int32
A       int32
F       int32
4       int32
Echo    int32
Lima    int32
dtype: object

#### accessing columns
* returns series
* index still attached to the series

In [30]:
df["R"]

0    5
1    4
2    2
Name: R, dtype: int32

In [31]:
type(df["R"])

pandas.core.series.Series

In [33]:
df["R"].tolist()

list

In [38]:
df["R"].index

RangeIndex(start=0, stop=3, step=1)

In [40]:
df[["R", "Echo"]]

Unnamed: 0,R,Echo
0,5,3
1,4,4
2,2,1


#### index subsetting
* iloc[] is position selecting
* iloc[rows, columns]
* rows/columns can be a list of indicies (integers) or use : to seperate a range
* [2:] means give me 2 and everything after for slicing
* [1] means give me index 1
* [1,2,3] means give me index 1,2,3
* [1],[0,1] means give me row 1 and columns 0 and 1

In [41]:
df.iloc[0]

R       5
A       6
F       2
4       5
Echo    3
Lima    2
Name: 0, dtype: int32

In [45]:
# need integer based subsetting
df.iloc[0,[0,1,2]]

R    5
A    6
F    2
Name: 0, dtype: int32

In [47]:
df.iloc[[0,2]]

Unnamed: 0,R,A,F,4,Echo,Lima
0,5,6,2,5,3,2
2,2,3,2,2,1,3


In [49]:
df.iloc[[2,0,1]]

Unnamed: 0,R,A,F,4,Echo,Lima
2,2,3,2,2,1,3
0,5,6,2,5,3,2
1,4,6,1,3,4,5


In [50]:
df.iloc[0:,0:]

Unnamed: 0,R,A,F,4,Echo,Lima
0,5,6,2,5,3,2
1,4,6,1,3,4,5
2,2,3,2,2,1,3


In [53]:
df.iloc[0:,0:1]

Unnamed: 0,R
0,5
1,4
2,2


In [54]:
df.iloc[[1],[0,1]]

Unnamed: 0,R,A
1,4,6


#### loc
* used if we have labels and not integer indec

In [56]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [57]:
df.index = ["aa", "bb", "cc"]

In [58]:
df

Unnamed: 0,R,A,F,4,Echo,Lima
aa,5,6,2,5,3,2
bb,4,6,1,3,4,5
cc,2,3,2,2,1,3


In [35]:
df.loc["aa"]

R    5
b    6
c    2
d    5
e    3
f    2
Name: aa, dtype: int32

In [59]:
df.loc[["aa", "bb"]]

Unnamed: 0,R,A,F,4,Echo,Lima
aa,5,6,2,5,3,2
bb,4,6,1,3,4,5


In [61]:
df.loc[["aa"], ["R","Echo"]]

Unnamed: 0,R,Echo
aa,5,3


<h3 style="color:blue">how might we</h3>
<p style="color:blue">- filter if we have a large number of indices?</p>
<p style="color:blue">- we don't want to type out a list of 1000s of index labels</p>

In [64]:
rows = ["aa", "bb"]
cols = ["R", "Echo"]

df.loc[rows,cols]

Unnamed: 0,R,Echo
aa,5,3
bb,4,4


#### subset for columns

In [65]:
df1 = df[[ "R", "4", "Echo"]]

In [66]:
df1

Unnamed: 0,R,4,Echo
aa,5,5,3
bb,4,3,4
cc,2,2,1


#### describe

In [67]:
df.describe()

Unnamed: 0,R,A,F,4,Echo,Lima
count,3.0,3.0,3.0,3.0,3.0,3.0
mean,3.666667,5.0,1.666667,3.333333,2.666667,3.333333
std,1.527525,1.732051,0.57735,1.527525,1.527525,1.527525
min,2.0,3.0,1.0,2.0,1.0,2.0
25%,3.0,4.5,1.5,2.5,2.0,2.5
50%,4.0,6.0,2.0,3.0,3.0,3.0
75%,4.5,6.0,2.0,4.0,3.5,4.0
max,5.0,6.0,2.0,5.0,4.0,5.0


#### correlate

In [74]:
df1.corr()

Unnamed: 0,R,4,Echo
R,1.0,0.928571,0.785714
4,0.928571,1.0,0.5
Echo,0.785714,0.5,1.0


#### summations

In [76]:
df1.sum(axis = 0)

R       11
4       10
Echo     8
dtype: int64

In [77]:
a = df1.sum(axis = 1)
print(type(a))

<class 'pandas.core.series.Series'>


#### median

In [78]:
df1.median(axis = 0)

R       4.0
4       3.0
Echo    3.0
dtype: float64

#### new column

In [83]:
df1["new_class"] = ["AA", "AA",""]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [84]:
df1

Unnamed: 0,R,4,Echo,class,new_class
aa,5,5,3,AA,AA
bb,4,3,4,AA,AA
cc,2,2,1,BB,


In [92]:
df1.groupby("class").sum()

Unnamed: 0_level_0,R,4,Echo
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AA,9,8,7
BB,2,2,1


In [94]:
df1.groupby("class").count()

Unnamed: 0_level_0,R,4,Echo,new_class
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AA,2,2,2,2
BB,1,1,1,1


In [95]:
# count distinct
df1.groupby("class").nunique()

Unnamed: 0_level_0,R,4,Echo,class,new_class
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA,2,2,2,1,1
BB,1,1,1,1,1


#### reset index

In [96]:
df1.groupby("class").count().index

Index(['AA', 'BB'], dtype='object', name='class')

In [97]:
df1.groupby("class").count().reset_index()

Unnamed: 0,class,R,4,Echo,new_class
0,AA,2,2,2,2
1,BB,1,1,1,1


In [98]:
df1.groupby("class").count().reset_index().index

RangeIndex(start=0, stop=2, step=1)

#### reading in csv

In [99]:
path = "../data/iris.csv"
df = pd.read_csv(path, sep = ",")

In [102]:
df.head(5)


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [108]:
df.dtypes

sepal.length    float64
sepal.width     float64
petal.length    float64
petal.width     float64
variety          object
dtype: object

<h3 style="color:blue">read in the iris csv:</h3>
<p style="color:blue">- find the descriptive statistics</p>
<p style="color:blue">- create a correlation matrix </p>

In [110]:
df.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [111]:
df.corr()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
sepal.length,1.0,-0.11757,0.871754,0.817941
sepal.width,-0.11757,1.0,-0.42844,-0.366126
petal.length,0.871754,-0.42844,1.0,0.962865
petal.width,0.817941,-0.366126,0.962865,1.0


#### subset on rules

In [113]:
df1 = df[df["sepal.length"] > 5]

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica
149,5.9,3.0,5.1,1.8,Virginica


In [119]:
df1 = df[(df["sepal.length"] > 6) & (df["sepal.width"] > 3)]
df1.tail()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
140,6.7,3.1,5.6,2.4,Virginica
141,6.9,3.1,5.1,2.3,Virginica
143,6.8,3.2,5.9,2.3,Virginica
144,6.7,3.3,5.7,2.5,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [120]:
df1 = df[df["variety"] == "Setosa"]
df1.shape

(50, 5)

#### isin()

In [121]:
df["variety"].unique()

array(['Setosa', 'Versicolor', 'Virginica'], dtype=object)

In [60]:
lst = ["Setosa", "Versicolor"]

df1 = df[df["variety"].isin(lst)]
df1.shape

(100, 5)

#### like

In [61]:
df1 = df[df["variety"].str.contains("Versi")]
df1.shape

(50, 5)

#### apply

In [122]:
def my_func(x):
    return -x

In [123]:
df.head(1)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa


In [129]:
df["sepal.length"] = df["sepal.length"].apply(my_func)

In [130]:
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [131]:
df.columns

Index(['sepal.length', 'sepal.width', 'petal.length', 'petal.width',
       'variety'],
      dtype='object')

In [133]:
cols = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']

In [134]:
for col in cols:
    df[col] = df[col].apply(my_func)

In [135]:
df.head(5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,-5.1,-3.5,-1.4,-0.2,Setosa
1,-4.9,-3.0,-1.4,-0.2,Setosa
2,-4.7,-3.2,-1.3,-0.2,Setosa
3,-4.6,-3.1,-1.5,-0.2,Setosa
4,-5.0,-3.6,-1.4,-0.2,Setosa


In [136]:
df["sepal.length"] = df["sepal.length"].apply(lambda x: -x)

In [137]:
df.head(5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,-3.5,-1.4,-0.2,Setosa
1,4.9,-3.0,-1.4,-0.2,Setosa
2,4.7,-3.2,-1.3,-0.2,Setosa
3,4.6,-3.1,-1.5,-0.2,Setosa
4,5.0,-3.6,-1.4,-0.2,Setosa


#### datatypes

In [138]:
df.dtypes

sepal.length    float64
sepal.width     float64
petal.length    float64
petal.width     float64
variety          object
dtype: object

In [141]:
df["sepal.length"] = df["sepal.length"].apply(float)

In [142]:
df.dtypes

sepal.length    float64
sepal.width     float64
petal.length    float64
petal.width     float64
variety          object
dtype: object

#### get dummies
* for string or object variables
* can specify specific columns
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html

<h3 style="color:blue">what might we need dummies for?</h3>

In [145]:
df = pd.get_dummies(df)

In [147]:
df.tail()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety_Setosa,variety_Versicolor,variety_Virginica
145,6.7,-3.0,-5.2,-2.3,0,0,1
146,6.3,-2.5,-5.0,-1.9,0,0,1
147,6.5,-3.0,-5.2,-2.0,0,0,1
148,6.2,-3.4,-5.4,-2.3,0,0,1
149,5.9,-3.0,-5.1,-1.8,0,0,1


In [150]:
col_1 = np.array(["A", "B", "C"])
col_2 = np.array(["AA", "BB", "CC"])

df = pd.DataFrame({
    "col_1":col_1,
    "col_2":col_2
})

df

Unnamed: 0,col_1,col_2
0,A,AA
1,B,BB
2,C,CC


<h3 style="color:blue">perform get dummies on col_1 only?</h3>

In [161]:
pd.get_dummies(df,columns=['col_1'])

Unnamed: 0,col_2,col_1_A,col_1_B,col_1_C
0,AA,1,0,0
1,BB,0,1,0
2,CC,0,0,1


#### iterate

In [162]:
for idx, r in df.head(5).iterrows():
    print(idx, r)

0 col_1     A
col_2    AA
Name: 0, dtype: object
1 col_1     B
col_2    BB
Name: 1, dtype: object
2 col_1     C
col_2    CC
Name: 2, dtype: object


In [165]:
for idx, r in df.tail(5).iterrows():
    print(r)

col_1     A
col_2    AA
Name: 0, dtype: object
col_1     B
col_2    BB
Name: 1, dtype: object
col_1     C
col_2    CC
Name: 2, dtype: object


#### tolist()

In [169]:
df["col_1"].tolist()

['A', 'B', 'C']
[0    A
1    B
2    C
Name: col_1, dtype: object]


<h3 style="color:blue">Using iris, group by variety and find the count of the other columns?</h3>
<h3 style="color:blue">Using iris, group by variety and find the sum of the other columns?</h3>

In [171]:
col_1 = np.array(["A", "B", "C", "D", "E"])
col_2 = np.array(["A", "B", "C"])

df = pd.DataFrame({
    "col_1":col_1,
    "col_1_ind":1
})

df1 = pd.DataFrame({
    "col_1":col_2,
    "col_2_ind":1
})
df

Unnamed: 0,col_1,col_1_ind
0,A,1
1,B,1
2,C,1
3,D,1
4,E,1


In [82]:
df1

Unnamed: 0,col_1,col_2_ind
0,A,1
1,B,1
2,C,1


#### concat
* https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

In [172]:
pd.concat([df, df1], axis = 0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,col_1,col_1_ind,col_2_ind
0,A,1.0,
1,B,1.0,
2,C,1.0,
3,D,1.0,
4,E,1.0,
0,A,,1.0
1,B,,1.0
2,C,,1.0


#### merge
* https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.merge.html
* assumes inner join

In [84]:
df.merge(df1, how = "inner", left_on = "col_1", right_on = "col_1")

Unnamed: 0,col_1,col_1_ind,col_2_ind
0,A,1,1
1,B,1,1
2,C,1,1


In [85]:
df.merge(df1, how = "left", left_on = "col_1", right_on = "col_1")

Unnamed: 0,col_1,col_1_ind,col_2_ind
0,A,1,1.0
1,B,1,1.0
2,C,1,1.0
3,D,1,
4,E,1,


## Data Imputation
* https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.interpolate.html#pandas.DataFrame.interpolate
* missing values
* When summing data, NA (missing) values will be treated as zero.
* If the data are all NA, the result will be 0.
* Cumulative methods like cumsum() and cumprod() ignore NA values by default, but preserve them in the resulting arrays. To override this behaviour and include NA values, use skipna=False.


In [173]:
import pandas as pd
data = [1,2,3,4,None,5,2,1]
df = pd.DataFrame(data)

In [174]:
df

Unnamed: 0,0
0,1.0
1,2.0
2,3.0
3,4.0
4,
5,5.0
6,2.0
7,1.0


In [175]:
df.isna()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,True
5,False
6,False
7,False


In [176]:
df.notna()

Unnamed: 0,0
0,True
1,True
2,True
3,True
4,False
5,True
6,True
7,True


In [177]:
df.fillna(0)

Unnamed: 0,0
0,1.0
1,2.0
2,3.0
3,4.0
4,0.0
5,5.0
6,2.0
7,1.0


In [91]:
df.fillna(100)

Unnamed: 0,0
0,1.0
1,2.0
2,3.0
3,4.0
4,100.0
5,5.0
6,2.0
7,1.0


In [92]:
df.dropna(axis=0)

Unnamed: 0,0
0,1.0
1,2.0
2,3.0
3,4.0
5,5.0
6,2.0
7,1.0


In [93]:
df.dropna(axis=1)

0
1
2
3
4
5
6
7


In [94]:
df.fillna(df.mean())

Unnamed: 0,0
0,1.0
1,2.0
2,3.0
3,4.0
4,2.571429
5,5.0
6,2.0
7,1.0


In [95]:
df.fillna(df.median())

Unnamed: 0,0
0,1.0
1,2.0
2,3.0
3,4.0
4,2.0
5,5.0
6,2.0
7,1.0


In [96]:
df[0].fillna(df[0].mean()).reset_index()

Unnamed: 0,index,0
0,0,1.0
1,1,2.0
2,2,3.0
3,3,4.0
4,4,2.571429
5,5,5.0
6,6,2.0
7,7,1.0


## Json data

In [97]:
print(os.getcwd())

dir(os.getcwd())

import glob
path = glob.glob('../data/ca*')
print(path)

NameError: name 'os' is not defined

In [179]:

columns = ["age", "job", "city"]
data = [
    [31, "data scientist", "chicago"],
    [28, "data scientist", "new york"],
    [28,None,None]
]

path = '../data/class.json'

df = pd.DataFrame(data, columns = columns)
df.to_json(path)

In [180]:
with open(path, "r") as file:
    line = file.readlines()
    for l in line[0:50]:
        print(l)

{"age":{"0":31,"1":28,"2":28},"job":{"0":"data scientist","1":"data scientist","2":null},"city":{"0":"chicago","1":"new york","2":null}}


df

In [102]:
df = pd.read_json('../data/ca.json')

ValueError: Expected object or value

In [181]:
df

Unnamed: 0,age,job,city
0,31,data scientist,chicago
1,28,data scientist,new york
2,28,,


* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.io.json.json_normalize.html

* Nested JSON files can be time consuming and difficult process to flatten and load into Pandas.
* Let’s unpack the works column into a standalone dataframe. We’ll also grab the flat columns.
* nycphil = json_normalize(d['programs'])

In [182]:
data = [
        {'id': 1, 'name': "Cole Volk",'fitness': 
             {'height': 130, 'weight': 60}},
    
        {'name': "Mose Reg",'fitness': 
             {'height': 130, 'weight': 60}},
    
        {'id': 2, 'name': 'Faye Raker','fitness': 
             {'height': 130, 'weight': 60}}
]

In [183]:
from pandas.io.json import json_normalize

In [184]:
json_normalize(data)

Unnamed: 0,id,name,fitness.height,fitness.weight
0,1.0,Cole Volk,130,60
1,,Mose Reg,130,60
2,2.0,Faye Raker,130,60


In [187]:
json_normalize(data, max_level = 0)

Unnamed: 0,id,name,fitness
0,1.0,Cole Volk,"{'height': 130, 'weight': 60}"
1,,Mose Reg,"{'height': 130, 'weight': 60}"
2,2.0,Faye Raker,"{'height': 130, 'weight': 60}"


In [188]:
data = [
        {'state': 'Florida','shortname': 'FL',
            'info': {'governor': 'Rick Scott', "gender":"m"},
            'counties': [{'name': 'Dade', 'population': 12345},
                         {'name': 'Broward', 'population': 40000},
                         {'name': 'Palm Beach', 'population': 60000}]},
    
         {'state': 'Ohio', 'shortname': 'OH',
          'info': {'governor': 'John Kasich', "gender":"m"},
          'counties': [{'name': 'Summit', 'population': 1234},
                       {'name': 'Cuyahoga', 'population': 1337}]}
]

In [192]:
json_normalize(data)

Unnamed: 0,state,shortname,counties,info.governor,info.gender
0,Florida,FL,"[{'name': 'Dade', 'population': 12345}, {'name...",Rick Scott,m
1,Ohio,OH,"[{'name': 'Summit', 'population': 1234}, {'nam...",John Kasich,m


In [313]:
json_normalize(data, max_level = 1)

Unnamed: 0,state,shortname,counties,info.governor,info.gender
0,Florida,FL,"[{'name': 'Dade', 'population': 12345}, {'name...",Rick Scott,m
1,Ohio,OH,"[{'name': 'Summit', 'population': 1234}, {'nam...",John Kasich,m


In [33]:
json_normalize(data)

Unnamed: 0,state,shortname,counties,info.governor,info.gender
0,Florida,FL,"[{'name': 'Dade', 'population': 12345}, {'name...",Rick Scott,m
1,Ohio,OH,"[{'name': 'Summit', 'population': 1234}, {'nam...",John Kasich,m


In [193]:
# make each row a county, then start parsing data as such
json_normalize(data, "counties", [["info","governor"], ["info", "gender"]])

Unnamed: 0,name,population,info.governor,info.gender
0,Dade,12345,Rick Scott,m
1,Broward,40000,Rick Scott,m
2,Palm Beach,60000,Rick Scott,m
3,Summit,1234,John Kasich,m
4,Cuyahoga,1337,John Kasich,m


## Chunks
* Can use chunks to process pieces of a dataframe at a time if it won't fit into memory

In [194]:
import pandas as pd

In [198]:
chunk_size = 15

In [199]:
path = "../data/iris.csv"

In [211]:
for k in blocks

SyntaxError: invalid syntax (<ipython-input-211-cea6666cdc75>, line 1)

In [212]:
for k in [1,2,3,4]:
    a = pd.read_csv(path, skiprows=0+k*chunk_size, nrows = chunk_size)
    print(a)

    5.8    4  1.2   .2  Setosa
0   5.7  4.4  1.5  0.4  Setosa
1   5.4  3.9  1.3  0.4  Setosa
2   5.1  3.5  1.4  0.3  Setosa
3   5.7  3.8  1.7  0.3  Setosa
4   5.1  3.8  1.5  0.3  Setosa
5   5.4  3.4  1.7  0.2  Setosa
6   5.1  3.7  1.5  0.4  Setosa
7   4.6  3.6  1.0  0.2  Setosa
8   5.1  3.3  1.7  0.5  Setosa
9   4.8  3.4  1.9  0.2  Setosa
10  5.0  3.0  1.6  0.2  Setosa
11  5.0  3.4  1.6  0.4  Setosa
12  5.2  3.5  1.5  0.2  Setosa
13  5.2  3.4  1.4  0.2  Setosa
14  4.7  3.2  1.6  0.2  Setosa
    4.7  3.2  1.6   .2  Setosa
0   4.8  3.1  1.6  0.2  Setosa
1   5.4  3.4  1.5  0.4  Setosa
2   5.2  4.1  1.5  0.1  Setosa
3   5.5  4.2  1.4  0.2  Setosa
4   4.9  3.1  1.5  0.2  Setosa
5   5.0  3.2  1.2  0.2  Setosa
6   5.5  3.5  1.3  0.2  Setosa
7   4.9  3.6  1.4  0.1  Setosa
8   4.4  3.0  1.3  0.2  Setosa
9   5.1  3.4  1.5  0.2  Setosa
10  5.0  3.5  1.3  0.3  Setosa
11  4.5  2.3  1.3  0.3  Setosa
12  4.4  3.2  1.3  0.2  Setosa
13  5.0  3.5  1.6  0.6  Setosa
14  5.1  3.8  1.9  0.4  Setosa
    5.1 

In [210]:
print(a)

      5  3.3  1.4   .2      Setosa
0   7.0  3.2  4.7  1.4  Versicolor
1   6.4  3.2  4.5  1.5  Versicolor
2   6.9  3.1  4.9  1.5  Versicolor
3   5.5  2.3  4.0  1.3  Versicolor
4   6.5  2.8  4.6  1.5  Versicolor
5   5.7  2.8  4.5  1.3  Versicolor
6   6.3  3.3  4.7  1.6  Versicolor
7   4.9  2.4  3.3  1.0  Versicolor
8   6.6  2.9  4.6  1.3  Versicolor
9   5.2  2.7  3.9  1.4  Versicolor
10  5.0  2.0  3.5  1.0  Versicolor
11  5.9  3.0  4.2  1.5  Versicolor
12  6.0  2.2  4.0  1.0  Versicolor
13  6.1  2.9  4.7  1.4  Versicolor
14  5.6  2.9  3.6  1.3  Versicolor
