## Series & DataFrame

Create Series from list

In [1]:
from IPython import display
import pandas as pd

# A plain list supplies the values; with no index given, pandas labels them 0, 1, 2.
pd.Series([1, 2, 3])

0    1
1    2
2    3
dtype: int64

In [2]:
# index= supplies the labels, one per value, in the same order as the data.
pd.Series([1, 2, 3], index=['a', 'b', 'c'])

a    1
b    2
c    3
dtype: int64

Create DataFrame from dictionary

In [3]:
# Each dict key becomes a column name and its list becomes that column's values;
# the rows get the default 0, 1, 2 labels.
pd.DataFrame({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9]
})

Unnamed: 0,a,b,c
0,1,4,7
1,2,5,8
2,3,6,9


In [4]:
# A nested list is read row by row: each inner list is one row,
# columns= names the columns and index= labels the rows.
pd.DataFrame(
    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    columns=['a', 'b', 'c'],
    index=['P1', 'P2', 'P3'],
)

Unnamed: 0,a,b,c
P1,1,2,3
P2,4,5,6
P3,7,8,9


Select N columns from a DataFrame (DataFrame to DataFrame)

In [5]:
# Three rows labelled P1..P3, each with three integer columns.
df = pd.DataFrame(
    {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
    index=['P1', 'P2', 'P3'],
)

# Indexing with a *list* of column names keeps the result two-dimensional.
wanted = ['a', 'b']
out = df[wanted]

print(type(out))
out

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,a,b
P1,1,4
P2,2,5
P3,3,6


Select a single column from a DataFrame (DataFrame to Series)

In [6]:
# Indexing with a single column label (not a list) returns that column as a Series.
s = df['a']

print(type(s))
s

<class 'pandas.core.series.Series'>


P1    1
P2    2
P3    3
Name: a, dtype: int64

Select rows from a DataFrame by position

In [7]:
# iloc selects by integer position: position 1 is the second row (label P2),
# returned as a Series whose index is the column names.
df.iloc[1]

a    2
b    5
c    8
Name: P2, dtype: int64

In [8]:
# A positional slice returns the first two rows, still as a DataFrame.
df.iloc[:2]

Unnamed: 0,a,b,c
P1,1,4,7
P2,2,5,8


Get a NumPy ndarray from a DataFrame / Series

In [9]:
# to_numpy() is the accessor the pandas documentation recommends over the older
# .values attribute; for this all-integer frame it yields the same 2-D ndarray.
print(type(df.to_numpy()))
df.to_numpy()

<class 'numpy.ndarray'>


array([[1, 4, 7],
       [2, 5, 8],
       [3, 6, 9]])

In [10]:
# to_numpy() is the accessor the pandas documentation recommends over the older
# .values attribute; a Series gives back a 1-D ndarray.
s.to_numpy()

array([1, 2, 3])

Get Dict from DataFrame

In [11]:
# The result is a nested dict: column name -> {row label -> value}.
df.to_dict()

{'a': {'P1': 1, 'P2': 2, 'P3': 3},
 'b': {'P1': 4, 'P2': 5, 'P3': 6},
 'c': {'P1': 7, 'P2': 8, 'P3': 9}}

Index data

In [12]:
# The row labels are held in df.index, a pandas Index object.
print(type(df.index))
print(df.index)  # list-like object

<class 'pandas.core.indexes.base.Index'>
Index(['P1', 'P2', 'P3'], dtype='object')


Iteration

In [13]:
# Rebuild the frame without explicit row labels, so it gets the default 0..2 index.
columns = {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}
df = pd.DataFrame(columns)

# Iterating a DataFrame directly walks its column labels, not its rows.
for column_name in df:
    print(column_name)

a
b
c


In [14]:
# iterrows() yields (index label, row-as-Series) pairs; unpack them in the loop header.
for name, s in df.iterrows():
    print(s)  # Series

a    1
b    4
c    7
Name: 0, dtype: int64
a    2
b    5
c    8
Name: 1, dtype: int64
a    3
b    6
c    9
Name: 2, dtype: int64


In [15]:
# itertuples() is faster than iterrows()
# index=False leaves the index out, so each named tuple holds only the column values.
for t in df.itertuples(index=False):
    print(tuple(t))  # Tuple

(1, 4, 7)
(2, 5, 8)
(3, 6, 9)


In [16]:
# DataFrame.iteritems() was deprecated in pandas 1.5 and removed in 2.0;
# items() is the drop-in replacement and yields (column name, Series) pairs.
for col in df.items():
    name, s = col
    print(s)  # Series

0    1
1    2
2    3
Name: a, dtype: int64
0    4
1    5
2    6
Name: b, dtype: int64
0    7
1    8
2    9
Name: c, dtype: int64


### Read data

In [17]:
# Uncomment to download titanic.csv, the file read by the cells below
# (requires wget and network access).
#!wget https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv

In [18]:
from IPython import display
import pandas as pd

# With the defaults, the first line of the file supplies the column names and
# the rows are labelled 0, 1, 2, ...
pd.read_csv("titanic.csv")  # read_csv(filename, header='infer', index_col=None)
# Readers for other file formats follow the same pattern:
# pd.read_hdf("test.hdf")
# pd.read_json("test.json")
# pd.read_excel("test.xlsx")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [19]:
# index_col= uses the named column as the row labels instead of adding a 0-based index.
pd.read_csv("titanic.csv", index_col="PassengerId")

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Display DataFrame's data

In [20]:
import numpy as np
# Draw the sample data from a seeded local generator so that Restart & Run All
# rebuilds exactly the same 5x3 frame, and therefore the same head/tail/query/sort
# results in the cells that follow. Values are integers in [0, 10).
rng = np.random.default_rng(42)
df = pd.DataFrame(
    rng.integers(0, 10, (5, 3)),
    columns=['a', 'b', 'c'],
)
df

Unnamed: 0,a,b,c
0,8,2,6
1,6,4,7
2,3,1,6
3,3,3,3
4,5,1,7


In [21]:
# First 2 rows of the frame.
df.head(2)

Unnamed: 0,a,b,c
0,8,2,6
1,6,4,7


In [22]:
# Last 2 rows of the frame.
df.tail(2)

Unnamed: 0,a,b,c
3,3,3,3
4,5,1,7


In [23]:
# (number of rows, number of columns)
df.shape

(5, 3)

In [24]:
# Prints a structural summary: index range, per-column non-null counts and dtypes,
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   a       5 non-null      int64
 1   b       5 non-null      int64
 2   c       5 non-null      int64
dtypes: int64(3)
memory usage: 248.0 bytes


In [25]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,a,b,c
count,5.0,5.0,5.0
mean,5.0,2.2,5.8
std,2.12132,1.30384,1.643168
min,3.0,1.0,3.0
25%,3.0,1.0,6.0
50%,5.0,2.0,6.0
75%,6.0,3.0,7.0
max,8.0,4.0,7.0


## query

In [28]:
# query() filters rows using a boolean expression given as a string;
# bare names inside the string refer to columns.
df.query('b > 2')

Unnamed: 0,a,b,c
1,6,4,7
3,3,3,3


In [29]:
# Two conditions combined: only rows satisfying both are kept.
# NOTE(review): inside query(), `&` is documented to bind like `and`, which is why
# no parentheses are needed around the comparisons - confirm against the pandas docs.
df.query("b > 2 & c < 6")

Unnamed: 0,a,b,c
3,3,3,3


## sort

In [None]:
# Order the rows by the values in column 'b'; the result is only displayed,
# df is not reassigned here.
df.sort_values('b')

In [None]:
from scipy.stats.mstats import rankdata
# Rank the values of column 'b' and attach the ranks as a new 'order' column.
# The callable receives the frame being assigned to; assign() returns a new DataFrame.
df.assign(order=lambda frame: rankdata(frame['b']))

## Operations

Append Column

In [None]:
# Assigning a scalar to a new column name adds that column to df itself,
# filled with the same value on every row.
df['d'] = 0
df

In [None]:
# assign() is the non-mutating alternative to df['d'] = 0.
df.assign(d=0)  # returns a new DataFrame; df itself is left unchanged

In [None]:
# A sequence with one entry per row overwrites column 'd' value by value.
df['d'] = list(range(1, 6))
df

In [None]:
# Column arithmetic is element-wise: each row's d becomes that row's a times c.
df['d'] = df['a'] * df['c']
df

In [None]:
# Element-wise comparison: 'e' is a boolean column, True where d is greater than 30.
df['e'] = df['d'].gt(30)
df

Append a row

In [None]:
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported way to add rows. Wrapping the dict in a list builds a one-row frame,
# and ignore_index=True renumbers the combined rows from 0. As with append(), the
# result is a new DataFrame and df itself is left unchanged.
new_row = pd.DataFrame([{'a': 90, 'b': 80, 'c': 70, 'd': 60, 'e': False}])
pd.concat([df, new_row], ignore_index=True)