In [4]:
import pandas as pd
import numpy as np

# Series

## Series Creation

In [10]:
# With no explicit index, the Series gets a default RangeIndex (0..n-1).
obj = pd.Series([1, 2, 3, 4])
# .values is the underlying data array; .index holds the row labels.
print(obj.values, obj.index)

[1 2 3 4] RangeIndex(start=0, stop=4, step=1)


In [13]:
# index= attaches an explicit label to each value, in the order given.
obj2 = pd.Series([4, 5, 1, 2], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    5
a    1
c    2
dtype: int64

### dictionary to series

In [38]:
# City name -> population; the keys become the Series index below.
cdata = dict(Seoul=1000, Busan=1500, Incheon=100, Gwangju=200)

# A Series built from a dict uses the keys as labels and the values as data.
obj3 = pd.Series(cdata)
obj3

Seoul      1000
Busan      1500
Incheon     100
Gwangju     200
dtype: int64

In [39]:
# Labels to keep, in the desired order. "Suwon" is not a key of cdata.
cdata2 = ["Suwon", "Busan", "Incheon", "Gwangju"]

# Only the labels listed in index= appear in the result: "Suwon" has no value
# in cdata and shows up as NaN, while "Seoul" is left out entirely.
obj4 = pd.Series(cdata, index=cdata2)
obj4

Suwon         NaN
Busan      1500.0
Incheon     100.0
Gwangju     200.0
dtype: float64

In [40]:
# Element-wise mask: True where the value is missing (NaN).
pd.isnull(obj4)

Suwon       True
Busan      False
Incheon    False
Gwangju    False
dtype: bool

In [41]:
# Inverse mask: True where a value is present.
pd.notnull(obj4)

Suwon      False
Busan       True
Incheon     True
Gwangju     True
dtype: bool

## Indexing and Selecting Data

In [23]:
# Assigning through a label overwrites that entry of obj2 in place.
obj2['d'] = 6
# A single label returns the scalar stored under it.
print(obj2['a'])
print(obj2)

1
d    6
b    5
a    1
c    2
dtype: int64


### Selection by label

In [25]:
# A list of labels returns a sub-Series in the requested label order.
obj2[['c', 'a', 'd']]

c    2
a    1
d    6
dtype: int64

### Labeling data and index

In [49]:
# .name labels the Series itself; it is shown in the footer of the repr.
obj4.name = "population"
obj4

Suwon         NaN
Busan      1500.0
Incheon     100.0
Gwangju     200.0
Name: population, dtype: float64

In [50]:
# The Index object holding the row labels of obj4.
obj4.index

Index(['Suwon', 'Busan', 'Incheon', 'Gwangju'], dtype='object')

In [51]:
# The index can carry its own name, shown as a header above the labels.
obj4.index.name = 'city'
obj4

city
Suwon         NaN
Busan      1500.0
Incheon     100.0
Gwangju     200.0
Name: population, dtype: float64

In [52]:
# Show obj before relabelling: it still has its default integer index.
obj

0    1
1    2
2    3
3    4
dtype: int64

In [58]:
# Assigning to .index replaces the row labels of obj in place.
obj.index = ['a', 'b', 'c', 'd']
# Name the index and the Series; both names show up in the repr.
obj.index.name = "name"
obj.name = "numbers"
obj

name
a    1
b    2
c    3
d    4
Name: numbers, dtype: int64

### Boolean indexing

In [27]:
# obj2 > 1 builds a boolean mask; indexing with it keeps only the True entries.
obj2[obj2 > 1]

d    6
b    5
c    2
dtype: int64

In [32]:
# `in` tests membership against the index labels, not the stored values.
'b' in obj2

True

### Index alignment in arithmetic

In [43]:
# Addition aligns the operands on their index labels. A label that is missing
# from one side (Seoul, Suwon) or holds NaN produces NaN in the result.
obj3 + obj4

Busan      3000.0
Gwangju     400.0
Incheon     200.0
Seoul         NaN
Suwon         NaN
dtype: float64

### Vectorized operations

In [30]:
# Scalar arithmetic is applied to every element; the index is preserved.
obj2 * 2

d    12
b    10
a     2
c     4
dtype: int64

In [31]:
# NumPy ufuncs work element-wise on a Series and keep its index.
np.exp(obj2)

d    403.428793
b    148.413159
a      2.718282
c      7.389056
dtype: float64

### NaN
`NaN + value = NaN`  
`pd.notnull()`  
`pd.isnull()`

# Dataframe

## Dataframe Creation

In [68]:
# Column-oriented records: each key is a column name and each list holds that
# column's values, one entry per row.
data = dict(
    city=['Seoul', 'Daegu', 'Busan', 'Gwangju'],
    year=[2001, 2001, 2003, 2002],
    pop=[9.0, 2.0, 3.0, 0.1],
)

# A dict of equal-length lists becomes a DataFrame with a default integer index.
df = pd.DataFrame(data)
df

Unnamed: 0,city,year,pop
0,Seoul,2001,9.0
1,Daegu,2001,2.0
2,Busan,2003,3.0
3,Gwangju,2002,0.1


In [70]:
### Reordering columns

In [69]:
# columns= picks the columns to take from the dict and fixes their order.
pd.DataFrame(data, columns=['year', 'city', 'pop'])

Unnamed: 0,year,city,pop
0,2001,Seoul,9.0
1,2001,Daegu,2.0
2,2003,Busan,3.0
3,2002,Gwangju,0.1


In [71]:
### Appending columns

In [72]:
# 'loc' is not a key of data, so that column is created with missing values.
pd.DataFrame(data, columns=['year', 'city', 'pop', 'loc'])

Unnamed: 0,year,city,pop,loc
0,2001,Seoul,9.0,
1,2001,Daegu,2.0,
2,2003,Busan,3.0,
3,2002,Gwangju,0.1,


In [74]:
### Indexing rows

In [77]:
# Same records with explicit row labels. 'location' is not a key of data, so
# that column is created with missing values.
df2 = pd.DataFrame(
    data,
    columns=['year', 'city', 'pop', 'location'],
    index=['one', 'two', 'three', 'four'],
)
df2

Unnamed: 0,year,city,pop,location
one,2001,Seoul,9.0,
two,2001,Daegu,2.0,
three,2003,Busan,3.0,
four,2002,Gwangju,0.1,


In [78]:
### selecting data

In [79]:
# Selecting one column by name returns a Series indexed by the row labels.
df2['year']

one      2001
two      2001
three    2003
four     2002
Name: year, dtype: int64

In [80]:
# .loc selects a row by its index label; the row comes back as a Series
# whose index is the column names.
df2.loc['two']

year         2001
city        Daegu
pop             2
location      NaN
Name: two, dtype: object

In [84]:
# del removes the column from df2 itself (in place).
del df2['location']
df2

Unnamed: 0,year,city,pop
one,2001,Seoul,9.0
two,2001,Daegu,2.0
three,2003,Busan,3.0
four,2002,Gwangju,0.1


In [86]:
### Nested Dictionary to DataFrame

In [96]:
# Nested dict: outer keys become columns, inner keys become the row index.
pop = {
    'Jeonju': {2001: 30, 2002: 35},
    'Daejeon': {2001: 80, 2002: 90},
}

df4 = pd.DataFrame(pop)
df4

Unnamed: 0,Jeonju,Daejeon
2001,30,80
2002,35,90


In [97]:
# .T transposes the frame: rows and columns swap.
df4.T

Unnamed: 0,2001,2002
Jeonju,30,35
Daejeon,80,90


In [98]:
# The inner dicts no longer share the same keys (2002 vs 2020). The row index
# is the union of all inner keys, and missing combinations become NaN.
pop = {
    'Jeonju': {2001: 30, 2002: 35},
    'Daejeon': {2001: 80, 2020: 90},
}

df4 = pd.DataFrame(pop)
df4

Unnamed: 0,Jeonju,Daejeon
2001,30.0,80.0
2002,35.0,
2020,,90.0


In [100]:
# Name the row index; the name is shown above the row labels.
df4.index.name = 'year'
df4

Unnamed: 0_level_0,Jeonju,Daejeon
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,30.0,80.0
2002,35.0,
2020,,90.0


In [102]:
# The column index can be named as well.
df4.columns.name = 'city'
df4

city,Jeonju,Daejeon
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,30.0,80.0
2002,35.0,
2020,,90.0


In [103]:
# .values returns the data as a 2-D NumPy array, without row/column labels.
df4.values

array([[30., 80.],
       [35., nan],
       [nan, 90.]])

In [105]:
# Three consecutive integers (11, 12, 13) labelled 'a'..'c'.
obj = pd.Series(range(11, 14), index=list('abc'))
obj

a    11
b    12
c    13
dtype: int64

In [106]:
# Build a standalone Index object holding the labels 100, 101, 102.
labels = pd.Index(np.arange(100, 103))
labels

Int64Index([100, 101, 102], dtype='int64')

In [107]:
# Replace the index of obj with the Index object built above.
obj.index = labels

In [109]:
# Identity check: in the recorded run, the Series holds the very same Index
# object rather than a copy.
obj.index is labels

True

In [111]:
# Membership test against the column labels.
'city' in df2.columns

True

In [113]:
# Membership test against the row labels.
'one' in df2.index

True

In [115]:
### reindexing

In [117]:
# Float Series whose labels are deliberately out of alphabetical order, so the
# effect of reindexing is visible later.
obj = pd.Series({'d': 4.1, 'b': 7.1, 'a': -5.5, 'c': 3.3})
obj

d    4.1
b    7.1
a   -5.5
c    3.3
dtype: float64

In [119]:
# reindex rearranges the data to follow the given labels; a label that was not
# in the original index ('e') gets NaN. The result is rebound to obj.
obj = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj

a   -5.5
b    7.1
c    3.3
d    4.1
e    NaN
dtype: float64

In [120]:
# String Series with gaps in its integer index (1 and 3 are missing); used to
# demonstrate fill methods below. Note: this rebinds obj3.
obj3 = pd.Series(['data1', 'data2', 'data3'], index=[0, 2, 4])
obj3

0    data1
2    data2
4    data3
dtype: object

In [121]:
obj3.reindex(range(5), method='ffill') # forward fill: carry the previous value downward (top -> bottom)

0    data1
1    data1
2    data2
3    data2
4    data3
dtype: object

In [122]:
obj3.reindex(range(5), method='bfill') # backward fill: pull the next value upward (bottom -> top)

0    data1
1    data2
2    data2
3    data3
4    data3
dtype: object

### del vs. drop
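- `del` removes the item from the original object in place (comparable to `inplace=True`).
- `drop()` returns a new object and leaves the original unchanged by default (`inplace=False`).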

In [166]:
# Integers 11..15 labelled 'a'..'e'; the base Series for the del/drop examples.
obj = pd.Series(np.arange(11, 16), index=list('abcde'))
obj

a    11
b    12
c    13
d    14
e    15
dtype: int64

In [162]:
# `del` removes the entry from the Series it is applied to, in place.
# Delete from a copy so that `obj` still contains 'c' for the drop() examples
# that follow; otherwise obj.drop('c') raises a KeyError when the notebook is
# run top to bottom.
obj_deleted = obj.copy()
del obj_deleted['c']
obj_deleted

a    11
b    12
d    14
e    15
dtype: int64

In [165]:
# drop returns a new Series without the given label; the result is bound to
# new_obj rather than written back to obj.
new_obj = obj.drop('c')
new_obj

a    11
b    12
d    14
e    15
dtype: int64

In [170]:
# A list of labels drops several entries at once.
new_obj = obj.drop(['c', 'd'])
new_obj

a    11
b    12
e    15
dtype: int64

In [176]:
# 4x4 grid of the integers 0..15, rows labelled by city and columns by ordinal
# name. Note: this rebinds `data`, which earlier held the column dict.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Gwangju', 'Seoul', 'Jeju', 'Busan'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Gwangju,0,1,2,3
Seoul,4,5,6,7
Jeju,8,9,10,11
Busan,12,13,14,15


In [184]:
# With the default axis, drop removes rows by index label.
data.drop(['Seoul', 'Jeju'])

Unnamed: 0,one,two,three,four
Busan,12,13,14,15
Gwangju,0,1,2,3


In [187]:
# axis=1 drops a column instead of a row.
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Seoul,4,6,7
Busan,12,14,15
Gwangju,0,2,3
Jeju,8,10,11


In [191]:
# data itself still has every row and column: the drop calls returned new frames.
data

Unnamed: 0,one,two,three,four
Seoul,4,5,6,7
Busan,12,13,14,15
Gwangju,0,1,2,3
Jeju,8,9,10,11


In [196]:
# A slice of the index can be passed as the labels: drops every row from the
# third one onward.
data.drop(data.index[2:])

Unnamed: 0,one,two,three,four
Seoul,4,5,6,7
Busan,12,13,14,15


In [214]:
# Drop every second column starting at 'two'. Pass the selected column labels
# explicitly instead of a DataFrame: the DataFrame form only works because
# iterating a DataFrame happens to yield its column names.
data.drop(columns=data.loc[:, 'two'::2].columns)

Unnamed: 0,one,three
Seoul,4,6
Busan,12,14
Gwangju,0,2
Jeju,8,10


In [212]:
# Label-based column slice with a step: every second column starting at 'two'.
data.loc[:, 'two'::2]

Unnamed: 0,two,four
Seoul,5,7
Busan,13,15
Gwangju,1,3
Jeju,9,11


## File I/O

In [224]:
# read_csv uses a comma as the field separator by default.
df = pd.read_csv("../file_io/test_csv_file.csv")
df

Unnamed: 0,ID,LAST_NAME,AGE
0,1,KIM,30
1,2,CHOI,25
2,3,LEE,41
3,4,PARK,19
4,5,LIM,36


In [222]:
# read_csv returns a DataFrame.
type(df)

pandas.core.frame.DataFrame

In [226]:
# The file is pipe-delimited, so with the default comma separator each line is
# read as a single column.
df = pd.read_csv("../file_io/test_text_file.txt")
df

Unnamed: 0,ID|A|B|C|D
0,C1|1|2|3|4
1,C2|5|6|7|8
2,C3|1|3|5|7


In [242]:
# sep="|" splits the fields on the pipe character.
df = pd.read_csv("../file_io/test_text_file.txt", sep="|")
df

Unnamed: 0,ID,A,B,C,D
0,C1,1,2,3,4
1,C2,5,6,7,8
2,C3,1,3,5,7


In [243]:
# Use the "ID" column as the row index. set_index returns a new frame, so bind
# it to its own name instead of mutating df in place; with inplace=True the cell
# fails on a second run because "ID" is no longer a column of df.
df_by_id = df.set_index("ID")
df_by_id

Unnamed: 0_level_0,A,B,C,D
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C1,1,2,3,4
C2,5,6,7,8
C3,1,3,5,7


In [247]:
# index_col does the same as set_index("ID"), directly while reading the file.
df = pd.read_csv(
    "../file_io/test_text_file.txt",
    sep="|",
    index_col="ID",
)
df

Unnamed: 0_level_0,A,B,C,D
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C1,1,2,3,4
C2,5,6,7,8
C3,1,3,5,7


In [246]:
# Tab-separated file: pass the tab character as the separator.
df = pd.read_csv("../pandas_data/gapminder.tsv", sep="\t")
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


In [269]:
# header=None tells read_csv that the first line is data rather than column
# names, names= supplies the column names, and the "ID" column becomes the
# row index.
df = pd.read_csv(
    "../file_io/text_without_column_name.txt",
    sep="|",
    header=None,
    names=["ID", "A", "B", "C", "D"],
    index_col="ID",
)

df

Unnamed: 0_level_0,A,B,C,D
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C1,1,2,3,4
C2,5,6,7,8
C3,1,3,5,7


In [271]:
# Read without na_values: placeholder strings in the file (e.g. "??") are kept
# in the data as ordinary text.
df = pd.read_csv("../file_io/test_text_file.txt", sep='|')
df

Unnamed: 0,ID,A,B,C,D
0,C1,1,2,3,??
1,C2,모름,6,7,8
2,C3,1,없음,5,7


In [273]:
# na_values lists extra strings to treat as missing: "없음" ("none"), "??" and
# "모름" ("unknown"). They are read as NaN instead of text.
df = pd.read_csv(
    "../file_io/test_text_file.txt",
    sep='|',
    index_col='ID',
    na_values=["없음", "??", "모름"],
)
df

Unnamed: 0_level_0,A,B,C,D
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C1,1.0,2.0,3,
C2,,6.0,7,8.0
C3,1.0,,5,7.0


In [278]:
# Summarise the index, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, C1 to C3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       2 non-null      float64
 1   B       2 non-null      float64
 2   C       3 non-null      int64  
 3   D       2 non-null      float64
dtypes: float64(3), int64(1)
memory usage: 200.0+ bytes
