# Mod08 Introducing DataFrame Object

In [2]:
import numpy as np
import pandas as pd

## The Pandas DataFrame Object

### Constructing DataFrame objects
<details>
    <summary><b>dataframe結構圖</b></summary>
    <img src='./img/creating_dataframe1.png'>
</details>
<details>
    <summary><b>dataframe的Series結構圖</b></summary>
    <details>
        <summary><b>columns' series </b></summary>
        <img src='./img/dataSER-1.png'>
    </details>
    <details>
        <summary><b>index labels' series </b></summary>
        <img src='./img/dataSER-2.png'>
    </details>
</details>


#### From a single Series object

In [3]:
# Map each state to its population; the dict's insertion order becomes the
# order of the resulting Series index.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [3]:
# Wrap the Series in a one-column frame; the Series index carries over as
# the row labels and `columns` names the single column.
pd.DataFrame(data=population, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


#### From a two-dimensional NumPy array

In [6]:
# Seed the legacy global NumPy RNG so the array below is reproducible.
np.random.seed(10)

# Draw a 3x2 array of uniform samples from [0, 1).
np.random.rand(3, 2)

array([[0.77132064, 0.02075195],
       [0.63364823, 0.74880388],
       [0.49850701, 0.22479665]])

In [9]:
# Re-seed so the random values are identical on every run, then label the
# rows and columns explicitly.
np.random.seed(10)
pd.DataFrame(
    data=np.random.rand(3, 2),
    index=['a', 'b', 'c'],
    columns=['foo', 'bar'],
)

Unnamed: 0,foo,bar
a,0.771321,0.020752
b,0.633648,0.748804
c,0.498507,0.224797


In [16]:
# Same seeded data, but with no `index` argument the rows receive the
# default integer labels 0, 1, 2.
np.random.seed(10)
pd.DataFrame(data=np.random.rand(3, 2), columns=['foo', 'bar'])

Unnamed: 0,foo,bar
0,0.771321,0.020752
1,0.633648,0.748804
2,0.498507,0.224797


#### From a dictionary of Series objects

In [5]:
# Population per state, rebuilt here so this section runs on its own.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [10]:
# Area per state, keyed by the same state names as the population data.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [12]:
# Each dict key becomes a column name; both Series carry the same state
# labels, which become the row index.
pd.DataFrame(data={'population': population, 'area': area})

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


#### From a list of dicts

In [14]:
# Three records, each with keys 'a' and 'b' where b is twice a.
data = [dict(a=n, b=2 * n) for n in range(3)]
data

[{'a': 0, 'b': 0}, {'a': 1, 'b': 2}, {'a': 2, 'b': 4}]

In [15]:
# Rebuild the list of records, then let pandas turn each dict into one row
# with the dict keys as column names.
data = [dict(a=n, b=2 * n) for n in range(3)]
pd.DataFrame(data=data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


### Attributes of DataFrame

In [9]:
# Build the two per-state Series again so this section is self-contained.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(population_dict)

area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(area_dict)

# Combine them into one frame: one column per Series, one row per state.
states = pd.DataFrame(data={'population': population, 'area': area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


Like the ``Series`` object, the ``DataFrame`` has an ``index`` attribute that holds the row labels.

In [5]:
# Row labels of the frame, returned as a pandas Index.
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

The ``DataFrame`` also has a ``columns`` attribute that holds the column labels.

In [6]:
# Column labels of the frame, returned as a pandas Index.
states.columns

Index(['population', 'area'], dtype='object')

In [7]:
# The frame's data as a two-dimensional NumPy array (one row per state).
states.values

array([[38332521,   423967],
       [26448193,   695662],
       [19651127,   141297],
       [19552860,   170312],
       [12882135,   149995]], dtype=int64)

### Import files

In [29]:
# Read the CSV file; its first line supplies the column names.
mb = pd.read_csv('./data/microbiome.csv')
mb

Unnamed: 0,Taxon,Patient,Group,Tissue,Stool
0,Firmicutes,1,0,136,4182
1,Firmicutes,2,1,1174,703
2,Firmicutes,3,0,408,3946
3,Firmicutes,4,1,831,8605
4,Firmicutes,5,0,693,50
...,...,...,...,...,...
65,Other,10,1,203,6
66,Other,11,0,392,6
67,Other,12,1,28,25
68,Other,13,0,12,22


In [17]:
# With header=None the file's first line is kept as a data row and the
# columns receive integer labels instead.
(
    pd.read_csv('./data/microbiome.csv', header=None)
    .head()
)

Unnamed: 0,0,1,2,3,4
0,Taxon,Patient,Group,Tissue,Stool
1,Firmicutes,1,0,136,4182
2,Firmicutes,2,1,1174,703
3,Firmicutes,3,0,408,3946
4,Firmicutes,4,1,831,8605


In [5]:
# read_table with an explicit comma separator parses the same CSV file.
mb = pd.read_table('./data/microbiome.csv', sep=',')

In [2]:
# Reading this .xls workbook needs the optional `xlrd` dependency
# (install it with pip or conda); without it pandas raises ImportError.
dx = pd.read_excel(
    './data/MID2.xls',
    sheet_name='Sheet 1',
    header=None,
)
dx.head()

ImportError: Missing optional dependency 'xlrd'. Install xlrd >= 1.0.0 for Excel support Use pip or conda to install xlrd.

In [15]:
# Load the JSON file into a data frame
dj = pd.read_json('data/data.json')

# View the first ten rows
dj.head(10)

Unnamed: 0,integer,datetime,category
0,5,2015-01-01 00:00:00,0
1,5,2015-01-01 00:00:01,0
2,9,2015-01-01 00:00:02,0
3,6,2015-01-01 00:00:03,0
4,6,2015-01-01 00:00:04,0
5,9,2015-01-01 00:00:05,0
6,7,2015-01-01 00:00:06,0
7,1,2015-01-01 00:00:07,0
8,6,2015-01-01 00:00:08,0
9,9,2015-01-01 00:00:09,0


## Lab

<b>有一個 Dictionary 如下，透過該 Dictionary 建立一個 DataFrame，並顯示 columns 屬性</b>

In [None]:
# Column-oriented input for the exercise: each key maps to a list of six
# values, one per row.
data = {
    'state': [
        'Ohio', 'Ohio', 'Ohio',
        'Nevada', 'Nevada', 'Nevada',
    ],
    'year': [
        2000, 2001, 2002,
        2001, 2002, 2003,
    ],
    'pop': [
        1.5, 1.7, 3.6,
        2.4, 2.9, 3.2,
    ],
}

<b>匯入 data/tips.csv 的檔案到 DataFrame，顯示 DataFrame 前 10 筆資料，並顯示 dtypes 屬性</b>

In [None]:
# Starting point for the exercise above: load the tips data into `df`.
df = pd.read_csv('data/tips.csv')