In [1]:
import warnings

import numpy as np
import pandas as pd

# Display settings for IPython / Jupyter notebook output:
#   - show at most 20 columns,
#   - cap the displayed width of a column at 20 characters,
#   - take East Asian wide characters into account when aligning text.
DISPLAY_OPTIONS = {
    'display.max_columns': 20,
    'display.max_colwidth': 20,
    'display.unicode.east_asian_width': True,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

# Suppress all warning messages for the rest of the session.
# NOTE(review): this also hides pandas' SettingWithCopyWarning / FutureWarning.
warnings.filterwarnings('ignore')

# 데이터 전처리 (Data Preprocessing)
## 타이타닉 데이터셋 변수 설명
    * PassengerId : 탑승자 일련번호
    * Survived : 생존 여부 ( 0 : 사망, 1 : 생존 )
    * Pclass : 티켓 선실 등급 (1: 일등석, 2: 이등석, 3: 삼등석)
    * Name : 탑승자 이름
    * Sex : 탑승자 성별
    * Age : 탑승자 나이
    * SibSp : 같이 탑승한 형제/자매 또는 배우자 인원수
    * Parch : 같이 탑승한 부모 또는 자녀 인원수
    * Ticket : 티켓 번호
    * Fare : 요금
    * Cabin : 선실 번호
    * Embarked : 탑승 도시명

### 타이타닉 데이터셋 데이터프레임 생성

In [18]:
# Load the Titanic passenger data from a CSV file (path is relative to the notebook's working directory)
titanic = pd.read_csv("./data/titanic.csv")

In [3]:
# Preview the first rows of the data
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen...",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. Jo...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss....",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. J...",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. Willi...",male,35.0,0,0,373450,8.05,,S


In [5]:
# Preview the last rows of the data
titanic.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. J...",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Ma...",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. ...",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl H...",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [6]:
# (number of rows, number of columns)
titanic.shape

(891, 12)

In [8]:
titanic.info() # the non-null counts show that some columns contain missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## 결측치 처리
### 결측치 확인

In [9]:
nan_cabin = titanic['Cabin'].value_counts(dropna = False) # frequency of every 'Cabin' value, with NaN counted as its own entry
nan_cabin

NaN            687
B96 B98          4
C23 C25 C27      4
G6               4
E101             3
              ... 
C111             1
B19              1
B50              1
D30              1
D9               1
Name: Cabin, Length: 148, dtype: int64

In [10]:
nan_cabin = titanic['Cabin'].value_counts(dropna = True) # frequency of every 'Cabin' value, NaN excluded
nan_cabin

G6             4
B96 B98        4
C23 C25 C27    4
F33            3
D              3
              ..
C111           1
B19            1
B50            1
D30            1
C99            1
Name: Cabin, Length: 147, dtype: int64

In [11]:
# Frequency of every 'Age' value, with NaN counted as its own entry
nan_age = titanic['Age'].value_counts(dropna = False )
nan_age

NaN      177
24.00     30
22.00     27
18.00     26
28.00     25
        ... 
36.50      1
55.50      1
66.00      1
23.50      1
0.42       1
Name: Age, Length: 89, dtype: int64

In [12]:
# Frequency of every 'Embarked' value, with NaN counted as its own entry
nan_embarked = titanic['Embarked'].value_counts(dropna = False)
nan_embarked

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

In [14]:
titanic.head().isnull() # present value : False , missing value : True

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False


In [15]:
titanic.head().notnull() # present value : True , missing value : False

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,True,True,True,True,True,True,True,True,True,True,False,True
1,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,False,True
3,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,False,True


In [16]:
titanic.head().isnull().sum(axis=0) # number of missing values per column, first rows only

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          3
Embarked       0
dtype: int64

In [20]:
titanic.isnull().sum(axis=0) # number of missing values per column, whole DataFrame

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### 결측치 처리

#### 1. 결측치 삭제 방법

In [21]:
# Drop every column of titanic that has fewer than 500 non-missing values
# (`thresh` is the minimum number of non-NaN entries a column needs in order to be kept)
df_trash = titanic.dropna(axis = 1, thresh = 500)
df_trash.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen...",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. Jo...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss....",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. J...",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. Willi...",male,35.0,0,0,373450,8.05,S


In [22]:
# Number of missing values per remaining column
df_trash.isnull().sum(axis = 0)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

In [24]:
# Column names that remain after the drop
df_trash.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')

#### 2. 특정값으로 대체 (보편적으로 평균값 사용)

In [26]:
# Mean of the 'Age' column; used later as the replacement for missing ages
mean_age = titanic['Age'].mean()
mean_age

29.69911764705882

In [27]:
# Work on an independent copy of `titanic`. A plain slice (`titanic[:]`) does
# not guarantee separate data, so in-place clean-up applied to `df` could
# write through to the original frame or trigger SettingWithCopyWarning.
df = titanic.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [28]:
# Keep only the columns that hold at least 500 non-missing values,
# then list the column names that survive.
df = df.dropna(axis=1, thresh=500)
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [30]:
df.info() # the 'Cabin' column has been removed

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


#### 평균값으로 대체 'Age' 변수에 적용

In [31]:
# Replace the missing ages with the mean age (`mean_age`).
# The filled Series is assigned back to the column: calling
# `fillna(..., inplace=True)` on `df['Age']` is chained assignment, which
# pandas does not guarantee will update `df` (it never does under Copy-on-Write).
df['Age'] = df['Age'].fillna(mean_age)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


### 주변 값으로 대체 - 'Embarked' 변수에 적용

In [32]:
# Inspect a few 'Embarked' values around a missing entry
df['Embarked'][825:830]

825      Q
826      S
827      C
828      Q
829    NaN
Name: Embarked, dtype: object

### Embarked 변수 내용 중 탑승 인원이 가장 많은 도시 이름으로 대체

In [33]:
# Count the passengers per 'Embarked' value (NaN is left out by default)
# and take the label with the highest count.
embarked_counts = df['Embarked'].value_counts()
most_freq = embarked_counts.idxmax()
most_freq

'S'

In [34]:
# Fill the missing 'Embarked' values with the most frequent value (`most_freq`).
# `.copy()` makes `df2` independent of `df` (a `df[:]` slice may share data, so
# an in-place fill could also alter `df`), and the filled Series is assigned
# back instead of using chained `fillna(..., inplace=True)`.
df2 = df.copy()
df2['Embarked'] = df2['Embarked'].fillna(most_freq)
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [38]:
# Same rows as before the fill, to check the replacement
df2['Embarked'][825:830]

825    Q
826    S
827    C
828    Q
829    S
Name: Embarked, dtype: object

#### Embarked 변수 중 결측치가 있는 원소를 바로 이전 원소의 값으로 대체

In [47]:
# Start again from an independent copy of the raw data. A plain slice
# (`titanic[ : ]`) may share its data with `titanic`, so the in-place fill
# applied to `df3` afterwards could modify the original frame as well.
df3 = titanic.copy()
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [48]:
# Inspect a few 'Embarked' values around a missing entry before filling
df3['Embarked'][825:830]

825      Q
826      S
827      C
828      Q
829    NaN
Name: Embarked, dtype: object

In [49]:
# Forward fill: each missing 'Embarked' value takes the value of the closest
# preceding row that has one. `Series.ffill()` replaces the deprecated
# `fillna(method='ffill')`, and the result is assigned back rather than
# filled in place through a chained column selection.
df3['Embarked'] = df3['Embarked'].ffill()
df3['Embarked'][825:830]

825    Q
826    S
827    C
828    Q
829    Q
Name: Embarked, dtype: object

###  중복 데이터 처리

In [40]:
# Small demo frame for the duplicate-handling examples
dup_columns = {
    'c1': ['a', 'a', 'b', 'a', 'b'],
    'c2': [1, 1, 1, 2, 2],
    'c3': [1, 1, 2, 2, 2],
}
df = pd.DataFrame(dup_columns)
df

Unnamed: 0,c1,c2,c3
0,a,1,1
1,a,1,1
2,b,1,2
3,a,2,2
4,b,2,2


In [43]:
# Check for duplicated rows (observations): True marks a row that repeats an earlier row
df_dup = df.duplicated()
df_dup

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [44]:
# Check for duplicated values within a single column (variable), here 'c2'
col_dup = df['c2'].duplicated()
col_dup

0    False
1     True
2     True
3    False
4     True
Name: c2, dtype: bool

In [46]:
# Remove duplicated rows, comparing only the columns listed in `subset` ('c2' and 'c3')
# NOTE(review): this rebinds `df2`, which held the Embarked-filled Titanic frame earlier in the notebook
df2 = df.drop_duplicates(subset = ['c2','c3'])
df2

Unnamed: 0,c1,c2,c3
0,a,1,1
2,b,1,2
3,a,2,2
