# 데이터 분석 순서(KDD 분석 방법론)
* 데이터 세트 선택: csv, excel, DB에서 데이터를 읽어옴
* 데이터 전처리: 데이터 타입, 결측값, 이상값 탐지, 데이터 분포 분석, 상관관계
* 데이터 변환(특성추출): 원본 데이터에서 새로운 데이터 생성, 삭제, 스케일링, 구간화
* 데이터 마이닝(모델링, 분석): 분석에 적합한 알고리즘 선택, 모델 생성, 튜닝
* 결과 평가: 테스트 데이터를 이용해서 데이터 마이닝으로 만든 모델의 성능 평가

# 데이터 전처리
* 데이터 타입 변환
* 결측치 탐지 및 보간
* 이상치 탐지 및 처리
* 데이터 특성 파악(치우침, 분포 특성)
* 변수들 간의 상관관계 분석

In [1]:
import numpy as np 
import pandas as pd

# 1. 데이터 세트 선택 및 로딩
* 데이터 로드 후 head, tail, info, describe로 데이터의 구조 파악

In [2]:
data = pd.read_csv("./data/Titanic_train.csv")
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# 2. info()로 컬럼명, 결측치, 데이터 타입 파악

In [3]:
# 데이터가 너무 커서 Non-Null Count가 표시 안될 때: show_counts=True
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
df = pd.read_csv("./data/movie_title_data.tsv", sep="\t")
df

  df = pd.read_csv("./data/movie_title_data.tsv", sep="\t")


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
8407394,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
8407395,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
8407396,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
8407397,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [5]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8407399 entries, 0 to 8407398
Data columns (total 9 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   tconst          8407399 non-null  object
 1   titleType       8407399 non-null  object
 2   primaryTitle    8407385 non-null  object
 3   originalTitle   8407385 non-null  object
 4   isAdult         8407399 non-null  object
 5   startYear       8407399 non-null  object
 6   endYear         8407399 non-null  object
 7   runtimeMinutes  8407399 non-null  object
 8   genres          8407389 non-null  object
dtypes: object(9)
memory usage: 577.3+ MB


In [6]:
df['runtimeMinutes'].astype(int)

ValueError: invalid literal for int() with base 10: '\\N'

In [None]:
\\N, Documentary, Talk-Show, Game-Show, Animation,Comedy,Family

In [7]:
df['runtimeMinutes'].unique()

array(['1', '5', '4', '12', '40', '2', '\\N', '0', '3', '20', '13', '6',
       '11', '9', '10', '8', '15', '21', '16', '14', '7', '24', '100',
       '17', '70', '90', '25', '120', '36', '18', '30', '19', '44', '38',
       '58', '45', '50', '34', '28', '22', '35', '42', '23', '51', '52',
       '60', '92', '56', '53', '31', '41', '48', '68', '33', '29', '71',
       '43', '46', '27', '32', '76', '85', '55', '121', '89', '26', '37',
       '69', '54', '150', '64', '96', '49', '61', '39', '101', '300',
       '57', '99', '110', '88', '86', '170', '72', '78', '75', '450',
       '148', '124', '220', '59', '63', '112', '1428', '84', '65', '74',
       '105', '310', '73', '82', '81', '199', '139', '109', '67', '219',
       '80', '195', '440', '77', '293', '97', '62', '47', '250', '421',
       '360', '180', '66', '93', '163', '400', '104', '95', '140', '165',
       '116', '83', '320', '125', '127', '138', '460', '350', '374',
       '480', '91', '87', '130', '108', '79', '136', '94', '6

In [29]:
df[df['runtimeMinutes'] == "Talk-Show"]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
2030110,tt11868642,tvEpisode,GGN Heavyweight Championship Lungs With Mike T...,0,2020,\N,\N,Talk-Show,


In [22]:
df2 = df.iloc[2000110:2050110, :].copy()
df2

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
2000110,tt1181630,tvEpisode,Episode #1.1,Episode #1.1,0,2008,\N,\N,"Comedy,Game-Show"
2000111,tt11816310,tvEpisode,Episode #1.6000,Episode #1.6000,0,1979,\N,30,\N
2000112,tt11816312,tvEpisode,Building - Nest,Building - Nest,0,2009,\N,\N,Family
2000113,tt11816314,tvEpisode,Episode #1.1,Episode #1.1,0,2019,\N,\N,Talk-Show
2000114,tt11816316,tvSeries,Saturday Club,Saturday Club,0,2019,\N,3,"Animation,Family"
...,...,...,...,...,...,...,...,...,...
2050105,tt11903592,tvEpisode,Harvie vs. Hare,Harvie vs. Hare,0,2020,\N,22,"Comedy,Game-Show"
2050106,tt11903594,tvEpisode,Episode #16.101,Episode #16.101,0,\N,\N,\N,"Comedy,Drama,Music"
2050107,tt11903596,tvEpisode,Episode #16.102,Episode #16.102,0,\N,\N,\N,"Comedy,Drama,Music"
2050108,tt11903598,tvEpisode,Episode #16.104,Episode #16.104,0,\N,\N,\N,"Comedy,Drama,Music"


In [23]:
df2['runtimeMinutes'].astype(int)

ValueError: invalid literal for int() with base 10: '\\N'

In [24]:
df2['runtimeMinutes'].unique()

array(['\\N', '30', '3', '43', '60', '70', '36', '48', '41', '12', '52',
       '66', '24', '7', '14', '4', '62', '42', '55', '8', '29', '64',
       '46', '54', '26', '25', '51', '28', '107', '45', '5', '58', '17',
       '6', '39', '59', '44', '99', '11', '90', '2', '23', '121', '22',
       '10', '97', '85', '9', '31', '150', '47', '183', '115', '56', '21',
       '49', '53', '13', '50', '15', '16', '40', '102', '110', '124',
       '67', '57', '360', '116', '86', '203', '154', '27', '117', '120',
       '280', '75', '199', '95', '94', '118', '20', '77', '71', '188',
       '105', '69', '176', '239', '156', '139', '72', '78', '153', '74',
       '1', '32', '18', '38', '101', '73', '169', '220', '180', '82',
       '88', '103', '100', '123', '84', '65', '35', '98', '80', '87',
       '122', '344', '96', '33', '37', '200', '34', '331', '190', '93',
       '133', '358', '91', '83', '179', '19', '136', '158', '111', '126',
       '63', '61', '142', '79', '76', '104', '108', '163', '119'

In [28]:
def to_num(df):
    try:
        df['runtimeMinutes'] = int(df['runtimeMinutes'])
        return df
    except:
        if df['runtimeMinutes'] == '\\N':
            df['runtimeMinutes'] = 0
            return df
        else:
            df['genres'] = df['runtimeMinutes']
            df['runtimeMinutes'] = 0
            return df

In [30]:
df2.apply(to_num, axis=1)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
2000110,tt1181630,tvEpisode,Episode #1.1,Episode #1.1,0,2008,\N,0,"Comedy,Game-Show"
2000111,tt11816310,tvEpisode,Episode #1.6000,Episode #1.6000,0,1979,\N,30,\N
2000112,tt11816312,tvEpisode,Building - Nest,Building - Nest,0,2009,\N,0,Family
2000113,tt11816314,tvEpisode,Episode #1.1,Episode #1.1,0,2019,\N,0,Talk-Show
2000114,tt11816316,tvSeries,Saturday Club,Saturday Club,0,2019,\N,3,"Animation,Family"
...,...,...,...,...,...,...,...,...,...
2050105,tt11903592,tvEpisode,Harvie vs. Hare,Harvie vs. Hare,0,2020,\N,22,"Comedy,Game-Show"
2050106,tt11903594,tvEpisode,Episode #16.101,Episode #16.101,0,\N,\N,0,"Comedy,Drama,Music"
2050107,tt11903596,tvEpisode,Episode #16.102,Episode #16.102,0,\N,\N,0,"Comedy,Drama,Music"
2050108,tt11903598,tvEpisode,Episode #16.104,Episode #16.104,0,\N,\N,0,"Comedy,Drama,Music"


In [31]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 2000110 to 2050109
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          50000 non-null  object
 1   titleType       50000 non-null  object
 2   primaryTitle    50000 non-null  object
 3   originalTitle   50000 non-null  object
 4   isAdult         50000 non-null  object
 5   startYear       50000 non-null  object
 6   endYear         50000 non-null  object
 7   runtimeMinutes  50000 non-null  object
 8   genres          50000 non-null  object
dtypes: object(9)
memory usage: 3.4+ MB


In [32]:
df2['runtimeMinutes'] = df2['runtimeMinutes'].astype(int)

In [33]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 2000110 to 2050109
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          50000 non-null  object
 1   titleType       50000 non-null  object
 2   primaryTitle    50000 non-null  object
 3   originalTitle   50000 non-null  object
 4   isAdult         50000 non-null  object
 5   startYear       50000 non-null  object
 6   endYear         50000 non-null  object
 7   runtimeMinutes  50000 non-null  int64 
 8   genres          50000 non-null  object
dtypes: int64(1), object(8)
memory usage: 3.4+ MB


# 3. describe()로 기초 통계량 파악(이상치 파악)

In [34]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [35]:
data.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Dooley, Mr. Patrick",male,,,,347082.0,,G6,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


# 4. 결측값 찾고, 비율계산 후 대치/삭제하기
* 결측값 비율 계산: isna().sum() / len(데이터프레임) * 100
* 결측값 비율이 5% 미만: 행, 열을 제거, 분석에 크게 영향을 미치지 않음
* 결측값 비율이 5% ~ 30%: 결측값을 대체(Imputation)
  * 수치형 데이터(숫자형, 나이, 가격): 평균(mean), 중앙값(median), 최빈값(mode)으로 대체
  * 범주형 데이터(문자형/(숫자형), 선실등급, 탑승지: 최빈값(mode)으로 대체
* 결측값 비율이 30% ~ 50%: 컬럼의 중요도에 따라서 결측값을 대체 혹은 삭제
  * KNN(K-Nearest Neighbor, 최근접이웃) imputer, 회귀분석을 통해 결측값 대체
* 결측값 비율이 50% 이상: 해당 컬럼 삭제

In [39]:
round(data.isna().sum() / len(data) * 100,2)

PassengerId     0.00
Survived        0.00
Pclass          0.00
Name            0.00
Sex             0.00
Age            19.87
SibSp           0.00
Parch           0.00
Ticket          0.00
Fare            0.00
Cabin          77.10
Embarked        0.22
dtype: float64

In [42]:
# 결측값 비율이 77%인 Cabin은 분석에서 제외
data = data.drop('Cabin', axis=1)
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C


# 결측 데이터의 종류
* MCAR(완전 무작위 결측): 다른 변수와 무관하게 생긴 결측(랜덤, 이유 없음)
  * 설문 조사중 일부 응답자가 무작위로 답변을 건너뜀
* MAR(무작위 결측): 다른 변수와는 관련 있지만 본인과는 무관
  * 나이가 많은 사람들이 소득란에 응답을 하지 않는 경우(나이와 관련)
  * 직업에 따라서 학력란에 응답하지 않는 경우(가정환경, 소득)
* NMAR(비무작위 결측): 변수 자기 자신과 직접 관련이 있는 경우
  * 소득이 낮은 사람이 자신의 소득을 숨기는 경우(소득과 결측이 직접 관련)
  * 체중이 많이 나가는 경우 자신의 체중을 숨기는 경우
  * 만족도가 낮은 고객이 만족도 조사에 응하지 않는 경우

# 결측치 처리방법
## 1) 단순대치법(simple imputaion)
#### (1) 완전분석: 결측값이 있는 모든 행을 삭제하고 완전한 자료만으로 분석(잘 안씀)
* 결측값을 삭제해도 모델을 만들기에 충분히 많은 데이터가 있는 경우
* 결측값을 삭제한 후에 데이터에 편향이 없다는 전제가 있을 때
* dropna(): 결측값이 있는 모든 행 삭제

#### (2) 평균 대치법: 결측치가 있는 컬럼에서 데이터의 평균을 구한 후 결측값을 대치
* 평균을 이용하기 때문에 간편
* 데이터에 이상치가 있을 경우 평균을 이용할 수 없다.
* 데이터에 이상치가 있는 경우 중앙값이나 최빈값을 이용해야 한다.

In [43]:
# 평균, 중앙값
a = pd.Series([24,5,10,34,20,18,28,23])
b = pd.Series([24,5,10,34,20,18,28,2000])

In [44]:
a

0    24
1     5
2    10
3    34
4    20
5    18
6    28
7    23
dtype: int64

In [45]:
b

0      24
1       5
2      10
3      34
4      20
5      18
6      28
7    2000
dtype: int64

In [46]:
# 평균
print(a.mean())
print(b.mean())

20.25
267.375


In [47]:
# 중앙값
print(a.median())
print(b.median())

21.5
22.0


* Age 컬럼의 결측값을 평균 대치법으로 대치

In [49]:
data['Age'].mean()

np.float64(29.69911764705882)

In [52]:
data['Age'].fillna(data['Age'].mean())

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

In [53]:
data['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [56]:
age_na_idx = data[data['Age'].isna()].index
age_na_idx

Index([  5,  17,  19,  26,  28,  29,  31,  32,  36,  42,
       ...
       832, 837, 839, 846, 849, 859, 863, 868, 878, 888],
      dtype='int64', length=177)

In [57]:
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [58]:
data.iloc[age_na_idx,:]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,29.699118,0,0,244373,13.0000,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,29.699118,0,0,2649,7.2250,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,29.699118,0,0,2631,7.2250,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,29.699118,0,0,330959,7.8792,Q
...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,29.699118,0,0,2629,7.2292,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,29.699118,8,2,CA. 2343,69.5500,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,29.699118,0,0,345777,9.5000,S
878,879,0,3,"Laleff, Mr. Kristo",male,29.699118,0,0,349217,7.8958,S


In [60]:
data.loc[age_na_idx, 'Age'] = np.nan

In [61]:
data.loc[age_na_idx]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,Q
...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,S


* 중앙값으로 대치

In [62]:
data['Age'] = data['Age'].fillna(data['Age'].median)
data.loc[age_na_idx]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
5,6,0,3,"Moran, Mr. James",male,<bound method Series.median of 0 22.0\n1 ...,0,0,330877,8.4583,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,<bound method Series.median of 0 22.0\n1 ...,0,0,244373,13.0000,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,<bound method Series.median of 0 22.0\n1 ...,0,0,2649,7.2250,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,<bound method Series.median of 0 22.0\n1 ...,0,0,2631,7.2250,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,<bound method Series.median of 0 22.0\n1 ...,0,0,330959,7.8792,Q
...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,<bound method Series.median of 0 22.0\n1 ...,0,0,2629,7.2292,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,<bound method Series.median of 0 22.0\n1 ...,8,2,CA. 2343,69.5500,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,<bound method Series.median of 0 22.0\n1 ...,0,0,345777,9.5000,S
878,879,0,3,"Laleff, Mr. Kristo",male,<bound method Series.median of 0 22.0\n1 ...,0,0,349217,7.8958,S


In [64]:
data.loc[age_na_idx, 'Age'] = np.nan
data.loc[age_na_idx]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,Q
...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,S


# scikit-learn의 simple imputer를 이용한 대치

In [65]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp39-cp39-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp39-cp39-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp39-cp39-win_amd64.whl (11.2 MB)
   ---------------------------------------- 0.0/11.2 MB ? eta -:--:--
   ------------ --------------------------- 3.4/11.2 MB 20.0 MB/s eta 0:00:01
   ------------------ --------------------- 5.2/11.2 MB 13.3 MB/s eta 0:00:01
   -------------------------- ------------- 7.3/11.2 MB 12.2 MB/s eta 0:00:01
   -------------------------------------- - 10.7/11.2 MB 13.2 MB/s eta 0:00:01
   ---------------------------------------- 11.2/11.2 MB 11.4 MB/s  0:00:01
Downloading joblib-1.5.2-py3-none-any.w

In [66]:
from sklearn.impute import SimpleImputer

In [73]:
imp_mean = SimpleImputer(strategy='mean')
imp_mean.fit_transform(data[['Age']])[:,0].shape

(891,)

In [74]:
imp_mean = SimpleImputer(strategy='mean')
data['Age'] = imp_mean.fit_transform(data[['Age']])[:,0]

In [75]:
data.loc[age_na_idx]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,29.699118,0,0,244373,13.0000,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,29.699118,0,0,2649,7.2250,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,29.699118,0,0,2631,7.2250,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,29.699118,0,0,330959,7.8792,Q
...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,29.699118,0,0,2629,7.2292,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,29.699118,8,2,CA. 2343,69.5500,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,29.699118,0,0,345777,9.5000,S
878,879,0,3,"Laleff, Mr. Kristo",male,29.699118,0,0,349217,7.8958,S


In [76]:
data.loc[age_na_idx, 'Age'] = np.nan

In [77]:
# 최빈값으로 결측 값 처리
imp_most_frequent = SimpleImputer(strategy='most_frequent')
data['Age'] = imp_most_frequent.fit_transform(data[['Age']])[:,0]
data.loc[age_na_idx, 'Age']

5      24.0
17     24.0
19     24.0
26     24.0
28     24.0
       ... 
859    24.0
863    24.0
868    24.0
878    24.0
888    24.0
Name: Age, Length: 177, dtype: float64

# 결측값 비율이 30%-50%일 때 KNN 최근접 이웃법을 이용해 대치

In [78]:
from sklearn.impute import KNNImputer

In [79]:
knn_imp = KNNImputer(n_neighbors=5)

In [80]:
data['Age'] = knn_imp.fit_transform(data[['Age']])[:,0]
data.loc[age_na_idx]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
5,6,0,3,"Moran, Mr. James",male,24.0,0,0,330877,8.4583,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,24.0,0,0,244373,13.0000,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,24.0,0,0,2649,7.2250,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,24.0,0,0,2631,7.2250,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,24.0,0,0,330959,7.8792,Q
...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,24.0,0,0,2629,7.2292,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,24.0,8,2,CA. 2343,69.5500,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,24.0,0,0,345777,9.5000,S
878,879,0,3,"Laleff, Mr. Kristo",male,24.0,0,0,349217,7.8958,S


In [81]:
data.loc[age_na_idx, 'Age'] = np.nan

# 깊은 복사, 얕은 복사

In [82]:
data2 = data

In [84]:
data2.loc[0, 'Name']="둘리"
data2

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,name
0,1,0,3,둘리,male,22.0,1,0,A/5 21171,7.2500,S,둘리
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S,
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S,
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S,
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S,
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,S,
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C,


In [85]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,name
0,1,0,3,둘리,male,22.0,1,0,A/5 21171,7.2500,S,둘리
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S,
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S,
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S,
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S,
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,S,
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C,


In [87]:
print("data의 메모리 주소: ", id(data))
print("data2의 메모리 주소: ", id(data2))

data의 메모리 주소:  2265731087568
data2의 메모리 주소:  2265731087568


In [88]:
data3 = data.copy()
print("data3의 메모리 주소: ", id(data3))

data3의 메모리 주소:  2268235003168


In [90]:
data3.loc[0, 'Name']="또치"

In [91]:
data3.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,name
0,1,0,3,또치,male,22.0,1,0,A/5 21171,7.25,S,둘리
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,


In [92]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,name
0,1,0,3,둘리,male,22.0,1,0,A/5 21171,7.25,S,둘리
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,
