In [1]:
import pandas as pd
import numpy as np

##  중복 제거하기

In [3]:
data = pd.DataFrame({'k1': ['one'] *3 + ['two']*4,
                     'k2': [1,1,2,3,3,4,4]})

In [4]:
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


### 행의 중복이 있는 지를 알아본다.
  
     중복 데이터가 있는지를 확인 

In [5]:
data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

### 중복된 값을 제거하다.

    위의 메소드에서 True인 부분이 전부 제거된다.

In [6]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


### 없는 칼럼에 데이터를 할당하면 칼럼이 추가된다.

In [7]:
data['v1'] = range(7)

In [8]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


### 특정 칼럼으로 중복을 제거하기

In [9]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


### 처음에 있는 값으로 표시하기

In [11]:
data.drop_duplicates(['k1','k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


### 맨 마지막에 있는 값으로 표시하기

In [13]:
help(data.drop_duplicates)

Help on method drop_duplicates in module pandas.core.frame:

drop_duplicates(subset=None, keep='first', inplace=False) method of pandas.core.frame.DataFrame instance
    Return DataFrame with duplicate rows removed, optionally only
    considering certain columns
    
    Parameters
    ----------
    subset : column label or sequence of labels, optional
        Only consider certain columns for identifying duplicates, by
        default use all of the columns
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Drop duplicates except for the first occurrence.
        - ``last`` : Drop duplicates except for the last occurrence.
        - False : Drop all duplicates.
    inplace : boolean, default False
        Whether to drop duplicates in place or to return a copy
    
    Returns
    -------
    deduplicated : DataFrame



In [14]:
data.drop_duplicates(['k1','k2'], keep='last')

Unnamed: 0,k1,k2,v1
1,one,1,1
2,one,2,2
4,two,3,4
6,two,4,6


In [28]:
data1 = pd.DataFrame({ 'food' : ['bacon','pulled fork','bacon','Pastrami', 'corned beef', 'Bacon','pastrami',
                                'honey ham','nova lox'],
                     'ounce' : [4,3,12,6,7.5,8,3,5,6]})

In [29]:
data1

Unnamed: 0,food,ounce
0,bacon,4.0
1,pulled fork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [30]:
meat_to_animal = {
    'bacon' : 'pig',
    'pulled fork' : 'pig',
    'pastrami' : 'cow',
    'corned beef' : 'cow',
    'honey ham' : 'pig',
    'nova lox' : 'salmon'
 }

In [31]:
meat_to_animal

{'bacon': 'pig',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon',
 'pastrami': 'cow',
 'pulled fork': 'pig'}

## 대소문자 맞추기

### 시리즈 내의 데이터를 map  메소드로 변형하기

In [32]:
help(data1['food'].map)

Help on method map in module pandas.core.series:

map(arg, na_action=None) method of pandas.core.series.Series instance
    Map values of Series using input correspondence (which can be
    a dict, Series, or function)
    
    Parameters
    ----------
    arg : function, dict, or Series
    na_action : {None, 'ignore'}
        If 'ignore', propagate NA values, without passing them to the
        mapping function
    
    Returns
    -------
    y : Series
        same index as caller
    
    Examples
    --------
    
    Map inputs to outputs (both of type `Series`)
    
    >>> x = pd.Series([1,2,3], index=['one', 'two', 'three'])
    >>> x
    one      1
    two      2
    three    3
    dtype: int64
    
    >>> y = pd.Series(['foo', 'bar', 'baz'], index=[1,2,3])
    >>> y
    1    foo
    2    bar
    3    baz
    
    >>> x.map(y)
    one   foo
    two   bar
    three baz
    
    If `arg` is a dictionary, return a new Series with values converted
    according to the dictiona

In [33]:
data1['animal'] = data1['food'].map(str.lower).map(meat_to_animal)

In [34]:
data1['food'].map(str.lower)

0          bacon
1    pulled fork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [35]:
data1['food'].map(str.lower).map(meat_to_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [36]:
data1

Unnamed: 0,food,ounce,animal
0,bacon,4.0,pig
1,pulled fork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


# 값 치환하기

In [37]:
data2 = pd.Series([1,-999,2, -999, -1000, 3.])

In [38]:
data2

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

### 특정 값을 변굥한다,

In [40]:
data_r2 = data2.replace(-999, np.nan)

In [41]:
data_r2

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [42]:
data3 = pd.Series([1,-999,2, -999, -1000, 3.])

### 여러 개를 변경하기

In [45]:
data_r3 = data3.replace([-999,-1000], [np.nan,0])

In [46]:
data_r3

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### 특정값을 배개해서 정리하기

In [47]:
data_r4 = data3.replace({-999: np.nan,0:-1000})

In [48]:
data_r4

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64