# 사용할 라이브러리 로딩

In [None]:
import numpy as np # Numpy
import pandas as pd # Pandas
import matplotlib as mpl #Matplotlib 세팅용
import matplotlib.pyplot as plt # 시각화 도구
import seaborn as sns # 시각화 도구
from sklearn.model_selection import train_test_split # 데이터셋 분리
from sklearn.cluster import KMeans # 클러스터링
from sklearn.metrics import silhouette_score # 실루엣 점수
from xgboost import XGBClassifier  # XGBoostClassifier
import xgboost as xgb # XGBoost
from sklearn.ensemble import RandomForestClassifier # 랜덤 포레스트
from hyperopt import hp, STATUS_OK, fmin, tpe, Trials # 최적의 파람
from imblearn.combine import * # 복합 샘플링
from sklearn.model_selection import GridSearchCV # 그리드 서치
from sklearn.model_selection import cross_val_score # 교차 스코어
from sklearn.metrics import accuracy_score, precision_score # 평가 지표
from sklearn.metrics import recall_score, confusion_matrix, roc_auc_score, f1_score # 평가 지표

import warnings # 경고문 제거용


%matplotlib inline
%config Inlinebackend.figure_format = 'retina'

# 한글 폰트 설정
mpl.rc('font', family='D2Coding')
# 유니코드에서 음수 부호 설정
mpl.rc('axes', unicode_minus = False)

warnings.filterwarnings('ignore')
sns.set(font="D2Coding", rc={"axes.unicode_minus":False}, style='darkgrid')
plt.rc('figure', figsize=(10,8))

# 데이터 로딩

In [None]:
data = pd.read_csv('C:/Users/admin/Desktop/sparta/train.csv')

## 데이터 탐색

In [None]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [None]:
data.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

## Cabin 결측값 처리
- 결측값 확인 후 우선 Cabin부터 채워넣기로 함
- PassengerId의 첫 4자리 숫자는 승객의 그룹을 의미함으로 Cabin 결측값 중 그룹이 있으면 그룹의 Cabin으로 채워넣음
- train뿐만 아니라 test에도 결측값이 있기에 같이 처리하기로 함
- 그렇게 처리하고 나온 파일이 하단의 파일

In [None]:
data = pd.read_excel('C:/Users/admin/Desktop/sparta/train_test_origin.xlsx')

In [None]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin1,Cabin2,Combi,Cabin3,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,0.0,B,0.0,B0,P,B/0/P,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0.0
1,0002_01,Earth,0.0,F,0.0,F0,S,F/0/S,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1.0
2,0003_01,Europa,0.0,A,0.0,A0,S,A/0/S,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0.0
3,0003_02,Europa,0.0,A,0.0,A0,S,A/0/S,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0.0
4,0004_01,Earth,0.0,F,1.0,F1,S,F/1/S,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1.0


In [None]:
# 결측값을 처리하면 Cabin을 나눠서 구분함
# Cabin은 섹터/방번호/측면으로 구분되어 있음
# Cabin1은 섹터
# Cabin2는 방번호
# Combi는 Cabin1+Cabin2
# Cabin3는 측면(P(ort)는 좌현, S(tarboard)는 우현)
data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin1', 'Cabin2', 'Combi',
       'Cabin3', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported'],
      dtype='object')

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12691 non-null  object 
 2   CryoSleep     12660 non-null  float64
 3   Cabin1        12804 non-null  object 
 4   Cabin2        12804 non-null  float64
 5   Combi         12804 non-null  object 
 6   Cabin3        12804 non-null  object 
 7   Cabin         12804 non-null  object 
 8   Destination   12704 non-null  object 
 9   Age           12700 non-null  float64
 10  VIP           12674 non-null  float64
 11  RoomService   12776 non-null  float64
 12  FoodCourt     12752 non-null  float64
 13  ShoppingMall  12760 non-null  float64
 14  Spa           12754 non-null  float64
 15  VRDeck        12766 non-null  float64
 16  Name          12676 non-null  object 
 17  Transported   8693 non-null   float64
dtypes: float64(10), object(8)


- test 데이터는 Kaggle에 제출해야하는 데이터이기에 target인 Transported가 전부 결측임

In [None]:
data.isna().sum()

PassengerId        0
HomePlanet       279
CryoSleep        310
Cabin1           166
Cabin2           166
Combi            166
Cabin3           166
Cabin            166
Destination      266
Age              270
VIP              296
RoomService      194
FoodCourt        218
ShoppingMall     210
Spa              216
VRDeck           204
Name             294
Transported     4277
dtype: int64

- 나머지 결측값을 처리하기 위해 클러스터링을 해보기로 함
- 클러스터링 하기 전에 전처리를 진행해야함

# 전처리

## 필요없는 feature 제거

In [None]:
# 분류하는데 필요없다고 예상되는 'PassengerId', 'Name' feature 제거
data.drop(['PassengerId', 'Name'], inplace=True, axis=1)

In [None]:
# 제거 확인
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    12691 non-null  object 
 1   CryoSleep     12660 non-null  float64
 2   Cabin1        12804 non-null  object 
 3   Cabin2        12804 non-null  float64
 4   Combi         12804 non-null  object 
 5   Cabin3        12804 non-null  object 
 6   Cabin         12804 non-null  object 
 7   Destination   12704 non-null  object 
 8   Age           12700 non-null  float64
 9   VIP           12674 non-null  float64
 10  RoomService   12776 non-null  float64
 11  FoodCourt     12752 non-null  float64
 12  ShoppingMall  12760 non-null  float64
 13  Spa           12754 non-null  float64
 14  VRDeck        12766 non-null  float64
 15  Transported   8693 non-null   float64
dtypes: float64(10), object(6)
memory usage: 1.6+ MB


## CryoSleep, VIP, Cabin3 boolean 타입으로 캐스팅

In [None]:
# Cabin3의 P(좌현)를 False으로 S(우현)를 True로 변경
data['Cabin3'].replace({'P': True,'S': False}, inplace=True)

In [None]:
# boolean으로 변환
data['CryoSleep'] = data['CryoSleep'].astype(bool)
data['VIP'] = data['VIP'].astype(bool)
data['Cabin3'] = data['Cabin3'].astype(bool)

In [None]:
# 변환 확인
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    12691 non-null  object 
 1   CryoSleep     12970 non-null  bool   
 2   Cabin1        12804 non-null  object 
 3   Cabin2        12804 non-null  float64
 4   Combi         12804 non-null  object 
 5   Cabin3        12970 non-null  bool   
 6   Cabin         12804 non-null  object 
 7   Destination   12704 non-null  object 
 8   Age           12700 non-null  float64
 9   VIP           12970 non-null  bool   
 10  RoomService   12776 non-null  float64
 11  FoodCourt     12752 non-null  float64
 12  ShoppingMall  12760 non-null  float64
 13  Spa           12754 non-null  float64
 14  VRDeck        12766 non-null  float64
 15  Transported   8693 non-null   float64
dtypes: bool(3), float64(8), object(5)
memory usage: 1.3+ MB


In [None]:
# target 값을 제외하고 다른 변수에 저장
df = data.iloc[:,:15]

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    12691 non-null  object 
 1   CryoSleep     12970 non-null  bool   
 2   Cabin1        12804 non-null  object 
 3   Cabin2        12804 non-null  float64
 4   Combi         12804 non-null  object 
 5   Cabin3        12970 non-null  bool   
 6   Cabin         12804 non-null  object 
 7   Destination   12704 non-null  object 
 8   Age           12700 non-null  float64
 9   VIP           12970 non-null  bool   
 10  RoomService   12776 non-null  float64
 11  FoodCourt     12752 non-null  float64
 12  ShoppingMall  12760 non-null  float64
 13  Spa           12754 non-null  float64
 14  VRDeck        12766 non-null  float64
dtypes: bool(3), float64(7), object(5)
memory usage: 1.2+ MB


## 클러스터링을 위해 다른 결측값들을 전부 제거

In [None]:
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11076 entries, 0 to 12969
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    11076 non-null  object 
 1   CryoSleep     11076 non-null  bool   
 2   Cabin1        11076 non-null  object 
 3   Cabin2        11076 non-null  float64
 4   Combi         11076 non-null  object 
 5   Cabin3        11076 non-null  bool   
 6   Cabin         11076 non-null  object 
 7   Destination   11076 non-null  object 
 8   Age           11076 non-null  float64
 9   VIP           11076 non-null  bool   
 10  RoomService   11076 non-null  float64
 11  FoodCourt     11076 non-null  float64
 12  ShoppingMall  11076 non-null  float64
 13  Spa           11076 non-null  float64
 14  VRDeck        11076 non-null  float64
dtypes: bool(3), float64(7), object(5)
memory usage: 1.1+ MB


## 원핫인코딩

In [None]:
# object 타입의 데이터들 더미화
train_encoding = pd.get_dummies(df['HomePlanet'])
df=df.drop('HomePlanet',axis=1)
df = df.join(train_encoding)

train_encoding = pd.get_dummies(df['Destination'])
# 기존의 팀명 컬러 삭제
df=df.drop('Destination',axis=1)
df = df.join(train_encoding)

train_encoding = pd.get_dummies(df['Cabin1'])
# 기존의 팀명 컬러 삭제
df=df.drop('Cabin1',axis=1)
df = df.join(train_encoding)

In [None]:
# 데이터 확인
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11076 entries, 0 to 12969
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CryoSleep      11076 non-null  bool   
 1   Cabin2         11076 non-null  float64
 2   Combi          11076 non-null  object 
 3   Cabin3         11076 non-null  bool   
 4   Cabin          11076 non-null  object 
 5   Age            11076 non-null  float64
 6   VIP            11076 non-null  bool   
 7   RoomService    11076 non-null  float64
 8   FoodCourt      11076 non-null  float64
 9   ShoppingMall   11076 non-null  float64
 10  Spa            11076 non-null  float64
 11  VRDeck         11076 non-null  float64
 12  Earth          11076 non-null  uint8  
 13  Europa         11076 non-null  uint8  
 14  Mars           11076 non-null  uint8  
 15  55 Cancri e    11076 non-null  uint8  
 16  PSO J318.5-22  11076 non-null  uint8  
 17  TRAPPIST-1e    11076 non-null  uint8  
 18  A     

## 스케일링

In [None]:
# 스케일링을 위한 함수 생성
col = ['Cabin2', 'Age', 'RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
def data_scaled(df, col):
    for i in col:
        data_mean = df[i].mean()
        data_std = df[i].std()
        scaled = (df[i]-data_mean)/data_std
        df[i]=scaled
    return df

In [None]:
data_scaled(df, col)

Unnamed: 0,CryoSleep,Cabin2,Combi,Cabin3,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,PSO J318.5-22,TRAPPIST-1e,A,B,C,D,E,F,G,T
0,False,-1.171058,B0,True,B/0/P,0.707877,False,-0.334616,-0.282674,-0.285975,...,0,1,0,1,0,0,0,0,0,0
1,False,-1.171058,F0,True,F/0/S,-0.329018,False,-0.166861,-0.277057,-0.244125,...,0,1,0,0,0,0,0,1,0,0
2,False,-1.171058,A0,True,A/0/S,2.021278,True,-0.268437,1.949128,-0.285975,...,0,1,1,0,0,0,0,0,0,0
3,False,-1.171058,A0,True,A/0/S,0.293119,False,-0.334616,0.518053,0.335083,...,0,1,1,0,0,0,0,0,0,0
4,False,-1.169112,F1,True,F/1/S,-0.882029,False,0.131712,-0.238987,-0.033199,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12962,True,1.737845,G1495,True,G/1495/S,0.984383,False,-0.334616,-0.282674,-0.285975,...,0,1,0,0,0,0,0,0,1,0
12963,False,-0.630138,D278,True,D/278/S,0.984383,False,-0.262281,-0.282674,6.160647,...,0,1,0,0,0,1,0,0,0,0
12964,False,2.323517,F1796,True,F/1796/S,0.777004,False,-0.334616,0.257177,-0.285975,...,0,1,0,0,0,0,0,1,0,0
12965,True,1.739790,G1496,True,G/1496/S,0.362246,False,-0.334616,-0.282674,-0.285975,...,0,1,0,0,0,0,0,0,1,0


In [None]:
df.head()

Unnamed: 0,CryoSleep,Cabin2,Combi,Cabin3,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,PSO J318.5-22,TRAPPIST-1e,A,B,C,D,E,F,G,T
0,False,-1.171058,B0,True,B/0/P,0.707877,False,-0.334616,-0.282674,-0.285975,...,0,1,0,1,0,0,0,0,0,0
1,False,-1.171058,F0,True,F/0/S,-0.329018,False,-0.166861,-0.277057,-0.244125,...,0,1,0,0,0,0,0,1,0,0
2,False,-1.171058,A0,True,A/0/S,2.021278,True,-0.268437,1.949128,-0.285975,...,0,1,1,0,0,0,0,0,0,0
3,False,-1.171058,A0,True,A/0/S,0.293119,False,-0.334616,0.518053,0.335083,...,0,1,1,0,0,0,0,0,0,0
4,False,-1.169112,F1,True,F/1/S,-0.882029,False,0.131712,-0.238987,-0.033199,...,0,1,0,0,0,0,0,1,0,0


## 클러스터링
- 필요없는 feature 추가로 제거한 후 진행

In [None]:
dt = df.drop(['Combi', 'Cabin'], axis=1)

In [None]:
k_range = range(2,30)

best_k = -1
best_silhouette_score = -1

for k in k_range:
    km = KMeans(n_clusters = k, random_state=109)
    km.fit(dt)
    clusters= km.predict(dt)
    
    score = silhouette_score(dt, clusters)
    
    print('k: {}, score: {}'.format(k, score))
    if score > best_silhouette_score:
        best_k = k
        best_silhouette_score = score
        
print('\n best K: {}, best Score: {}'.format(best_k, best_silhouette_score))

k: 2, score: 0.39969018335468287
k: 3, score: 0.14185156575717311
k: 4, score: 0.15151784045362093
k: 5, score: 0.1559061192440089
k: 6, score: 0.13272243195725927
k: 7, score: 0.14264793143217322
k: 8, score: 0.1523939867296424
k: 9, score: 0.15855572438088006
k: 10, score: 0.16848717649649886
k: 11, score: 0.1719417766830233
k: 12, score: 0.1767525504160753
k: 13, score: 0.15301345015328918
k: 14, score: 0.16995625293762828
k: 15, score: 0.17547035906816716
k: 16, score: 0.15851569352600384
k: 17, score: 0.16866166334971966
k: 18, score: 0.17945282042894817
k: 19, score: 0.1782550670939478
k: 20, score: 0.17939509762456127
k: 21, score: 0.17489397195225823
k: 22, score: 0.17714857616304974
k: 23, score: 0.1736955166982367
k: 24, score: 0.1745671601499613
k: 25, score: 0.17714199923930266
k: 26, score: 0.17156985331968433
k: 27, score: 0.17462637551984989
k: 28, score: 0.17879420423015113
k: 29, score: 0.18481786330498226

 best K: 2, best Score: 0.39969018335468287


- 군집화를 시켜보니 실루엣 점수가 너무 낮아서 이를 포기하고 feature 별로 분류하여 결측값을 채우기로 함

## CryoSleep 결측값 채우기
- XGBoost 활용하기

### 훈련셋 테스트셋 검증셋 분리

In [None]:
X_train, X_test, y_train, y_test = train_test_split(dt.drop(['CryoSleep'],axis=1),dt.CryoSleep,
                                                    random_state=109)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)

### 모델 생성(그리드서치)

In [None]:
# xgbo = xgb.XGBClassifier()

# params = {
#     'max_depth':[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, None],
#     'learning_rate':[0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4],
#     'gamma':[0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5],
#     'random_state':[109]
# }

# gs = GridSearchCV(xgbo, param_grid = params, cv = 3, refit = True, n_jobs=-1)
# gs.fit(X_train, y_train)

### 결과확인

In [None]:
# model = gs.best_estimator_
# print(model.score(X_train, y_train))
# print(model.score(X_test, y_test))
# print(model.score(X_val, y_val))

In [None]:
# # 최적의 파라미터값
# print(gs.best_params_)

In [None]:
xgbo = xgb.XGBClassifier(gamma=4, learning_rate=0.3, max_depth=11, random_state=109)
xgbo.fit(X_train, y_train)

In [None]:
train_pred = xgbo.predict(X_train)
train_proba = xgbo.predict_proba(X_train)

test_pred = xgbo.predict(X_test)
test_proba = xgbo.predict_proba(X_test)

val_pred = xgbo.predict(X_val)
val_proba = xgbo.predict_proba(X_val)

In [None]:
# 평가용 함수
def  get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
#     roc_auc = roc_auc_score(y_test, pred_proba)
    
    print('오차 행렬')
    print(confusion)
 
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, \
    재현율: {2:.4f}, F1: {3:.4f}'.format(accuracy, precision, recall, f1))

#### 훈련셋 평가

In [None]:
get_clf_eval(y_train, train_pred, train_proba)

오차 행렬
[[3550  199]
 [ 146 2335]]
정확도: 0.9446, 정밀도: 0.9215,     재현율: 0.9412, F1: 0.9312


#### 테스트셋 평가

In [None]:
get_clf_eval(y_test, test_pred, test_proba)

오차 행렬
[[1592   94]
 [  73 1010]]
정확도: 0.9397, 정밀도: 0.9149,     재현율: 0.9326, F1: 0.9236


#### 검증셋 평가

In [None]:
get_clf_eval(y_val, val_pred, val_proba)

오차 행렬
[[1223   61]
 [  56  737]]
정확도: 0.9437, 정밀도: 0.9236,     재현율: 0.9294, F1: 0.9265


- CryoSleep XGBoost 모델은 0.93이상의 정확도와 다른 지표도 좋아 결측값 예측에 사용하기로 함

# HomePlanet 결측치 채우기
## 사본 df생성

In [None]:
# target 값을 제외하고 다른 변수에 저장
df_hp = data.iloc[:,:15]
# 그 외 사용하지 않는 피처 삭제
df_hp = df_hp.drop(['Cabin','Combi','Age'], axis=1)

In [None]:
df_hp

Unnamed: 0,HomePlanet,CryoSleep,Cabin1,Cabin2,Cabin3,Destination,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,False,B,0.0,True,TRAPPIST-1e,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,F,0.0,False,TRAPPIST-1e,False,109.0,9.0,25.0,549.0,44.0
2,Europa,False,A,0.0,False,TRAPPIST-1e,True,43.0,3576.0,0.0,6715.0,49.0
3,Europa,False,A,0.0,False,TRAPPIST-1e,False,0.0,1283.0,371.0,3329.0,193.0
4,Earth,False,F,1.0,False,TRAPPIST-1e,False,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
12965,Earth,True,G,1496.0,False,TRAPPIST-1e,False,0.0,0.0,0.0,0.0,0.0
12966,Earth,False,,,True,TRAPPIST-1e,False,0.0,847.0,17.0,10.0,144.0
12967,Mars,True,D,296.0,True,55 Cancri e,False,0.0,0.0,0.0,0.0,0.0
12968,Europa,False,D,297.0,True,,False,0.0,2680.0,0.0,0.0,523.0


In [None]:
df_hp.dropna(inplace=True)
df_hp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11317 entries, 0 to 12969
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    11317 non-null  object 
 1   CryoSleep     11317 non-null  bool   
 2   Cabin1        11317 non-null  object 
 3   Cabin2        11317 non-null  float64
 4   Cabin3        11317 non-null  bool   
 5   Destination   11317 non-null  object 
 6   VIP           11317 non-null  bool   
 7   RoomService   11317 non-null  float64
 8   FoodCourt     11317 non-null  float64
 9   ShoppingMall  11317 non-null  float64
 10  Spa           11317 non-null  float64
 11  VRDeck        11317 non-null  float64
dtypes: bool(3), float64(6), object(3)
memory usage: 917.3+ KB


In [None]:
# homeplanet이 타깃으로 할거라서 홈플레닛 값을 0,1,2로 치환하여 사용
df_hp.HomePlanet = df_hp['HomePlanet'].map({'Earth':0, 'Europa':1, 'Mars':2})

In [None]:
df_hp.HomePlanet.unique() # 변경 확인

array([1, 0, 2], dtype=int64)

## 원-핫 인코딩

In [None]:
# 원-핫 인코딩 (cabin1, destination)
## Cabin1
encoding = pd.get_dummies(df_hp.Cabin1)
df_hp = df_hp.drop('Cabin1', axis =1) # 기존 삭제
df_hp = df_hp.join(encoding) # 적용
## Destination
encoding = pd.get_dummies(df_hp.Destination)
df_hp = df_hp.drop('Destination', axis =1)
df_hp = df_hp.join(encoding)

In [None]:
df_hp.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin2,Cabin3,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,B,C,D,E,F,G,T,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
0,1,False,0.0,True,False,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,1
1,0,False,0.0,False,False,109.0,9.0,25.0,549.0,44.0,...,0,0,0,0,1,0,0,0,0,1
2,1,False,0.0,False,True,43.0,3576.0,0.0,6715.0,49.0,...,0,0,0,0,0,0,0,0,0,1
3,1,False,0.0,False,False,0.0,1283.0,371.0,3329.0,193.0,...,0,0,0,0,0,0,0,0,0,1
4,0,False,1.0,False,False,303.0,70.0,151.0,565.0,2.0,...,0,0,0,0,1,0,0,0,0,1


In [None]:
df_hp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11317 entries, 0 to 12969
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   HomePlanet     11317 non-null  int64  
 1   CryoSleep      11317 non-null  bool   
 2   Cabin2         11317 non-null  float64
 3   Cabin3         11317 non-null  bool   
 4   VIP            11317 non-null  bool   
 5   RoomService    11317 non-null  float64
 6   FoodCourt      11317 non-null  float64
 7   ShoppingMall   11317 non-null  float64
 8   Spa            11317 non-null  float64
 9   VRDeck         11317 non-null  float64
 10  A              11317 non-null  uint8  
 11  B              11317 non-null  uint8  
 12  C              11317 non-null  uint8  
 13  D              11317 non-null  uint8  
 14  E              11317 non-null  uint8  
 15  F              11317 non-null  uint8  
 16  G              11317 non-null  uint8  
 17  T              11317 non-null  uint8  
 18  55 Can

## 스케일링

In [None]:
# 위에 정의된 스케일링 함수 호출
col = ['Cabin2', 'RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

data_scaled(df_hp, col)

Unnamed: 0,HomePlanet,CryoSleep,Cabin2,Cabin3,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,B,C,D,E,F,G,T,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
0,1,False,-1.171813,True,False,-0.334680,-0.281438,-0.285053,-0.270872,-0.260036,...,1,0,0,0,0,0,0,0,0,1
1,0,False,-1.171813,False,False,-0.166509,-0.275840,-0.243219,0.215668,-0.220589,...,0,0,0,0,1,0,0,0,0,1
2,1,False,-1.171813,False,True,-0.268338,1.942578,-0.285053,5.680162,-0.216106,...,0,0,0,0,0,0,0,0,0,1
3,1,False,-1.171813,False,False,-0.334680,0.516496,0.335766,2.679387,-0.087005,...,0,0,0,0,0,0,0,0,0,1
4,0,False,-1.169865,False,False,0.132804,-0.237903,-0.032375,0.229848,-0.258243,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12963,2,False,-0.630313,False,False,-0.262166,-0.281438,6.159085,-0.270872,-0.260036,...,0,0,1,0,0,0,0,0,0,1
12964,0,False,2.326507,False,False,-0.334680,0.256530,-0.285053,-0.268214,-0.260036,...,0,0,0,0,1,0,0,0,0,1
12965,0,True,1.742155,False,False,-0.334680,-0.281438,-0.285053,-0.270872,-0.260036,...,0,0,0,0,0,1,0,0,0,1
12967,2,True,-0.595252,True,False,-0.334680,-0.281438,-0.285053,-0.270872,-0.260036,...,0,0,1,0,0,0,0,1,0,0


## 모델링_랜덤포레스트

In [None]:
# 데이터 타깃 분리
hp_data = df_hp.drop('HomePlanet', axis=1)
hp_label = df_hp['HomePlanet']

In [None]:
from imblearn.combine import *

In [None]:
# 임시 모델 설정
rfc = RandomForestClassifier()
# 그리드 서치
#grid = {
#    'n_estimators': [50,90,100,150,200, 250],
#    'max_depth': [3,5,7,9,13,15],
#    'min_samples_leaf':[3,5,7,9,13,15],
#    'min_samples_split': [3,5,7,9,13,15]
#}
# 그리드 객체
#rfc_grid = GridSearchCV(rfc, param_grid = grid, scoring = 'accuracy', cv=5, n_jobs=-1, 
#                       verbose =1)
# fitting
#rfc_grid.fit(hp_data, hp_label)
#print('최고 평균 정확도 : {}'.format(rfc_grid.best_score_))
#print('최고 파라미터: {}', rfc_grid.best_params_)

In [None]:
# 최적의 파라미터를 이용한 모델링
rfc_model = RandomForestClassifier(n_estimators=200, max_depth=15, 
                                   min_samples_leaf=3, min_samples_split=9, random_state=109)

# fitting
rfc_model.fit(hp_data, hp_label)

# pred 
rfc_pred = rfc_model.predict(hp_data)
print('정확도 : ', accuracy_score(hp_label, rfc_pred))

정확도 :  0.9621807899620041


In [None]:
# 혼돈행렬
confusion_matrix(hp_label, rfc_pred)

array([[5919,    5,  130],
       [  12, 2800,   45],
       [ 180,   56, 2170]], dtype=int64)

In [None]:
2170/(2170+56+180)

0.9019118869492935

### 세트 분리 후 모델링

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
df_hp.drop('HomePlanet', axis =1),df_hp['HomePlanet'], random_state =109) 

In [None]:
rf_clf = RandomForestClassifier()
grid = {
    'n_estimators': [50,90,100,150,200, 250],
    'max_depth': [3,5,7,9,13,15],
    'min_samples_leaf':[3,5,7,9,13,15],
    'min_samples_split': [3,5,7,9,13,15]}

clf_grid = GridSearchCV(rf_clf, param_grid = grid, scoring='accuracy', verbose=1, cv= 5, n_jobs=-1)

clf_grid.fit(X_train, y_train)


Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


In [None]:
print('최고 평균 정확도 : {}'.format(clf_grid.best_score_))
print('최고 파라미터: {}', clf_grid.best_params_)

최고 평균 정확도 : 0.937316250599513
최고 파라미터: {} {'max_depth': 13, 'min_samples_leaf': 3, 'min_samples_split': 7, 'n_estimators': 200}


In [None]:
# 최적의 파라미터를 이용한 모델링
clf_model = RandomForestClassifier(n_estimators=150, max_depth=15, 
                                   min_samples_leaf=3, min_samples_split=5, random_state=109)

# fitting
clf_model.fit(X_train, y_train)

# pred 
clf_pred = clf_model.predict(X_test)
print('정확도 : ', accuracy_score(y_test, clf_pred))

정확도 :  0.9406360424028268


In [None]:
confusion_matrix(y_test, clf_pred)

array([[1481,    1,   47],
       [  11,  671,   15],
       [  78,   16,  510]], dtype=int64)

In [None]:
# Mars의 예측 정확도가 다소 낮은걸 확인
510/(78+16+510)

0.8443708609271523

## 모델링_XGBoost

In [None]:
# hyperopt 적용을 위한 설정 값 담기
xgb_search_space = {'max_depth': hp.quniform('max_depth', 5, 20, 1),
                    'min_child_weight': hp.quniform('min_child_weight', 1, 2, 1),
                    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
                    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)
               }

In [None]:
# 실행을 위한 함수 정의
def objective_func(search_space):
    xgb_clf = XGBClassifier(n_estimators=100, max_depth=int(search_space['max_depth']),
                            min_child_weight=int(search_space['min_child_weight']),
                            learning_rate=search_space['learning_rate'],
                            colsample_bytree=search_space['colsample_bytree'], 
                            eval_metric='logloss')
    accuracy = cross_val_score(xgb_clf, X_train, y_train, scoring='accuracy', cv=3)
    return {'loss':-1 * np.mean(accuracy), 'status': STATUS_OK}


In [None]:
# 최적의 파라미터 찾기
trial_val = Trials()
best = fmin(fn=objective_func, # 정의한 실행 함수
            space=xgb_search_space, # 하이퍼 오피티 설정값 정의한거
            algo=tpe.suggest,
            max_evals=50,
            trials=trial_val, rstate=np.random.default_rng(seed=9))
print('best:', best)

100%|███████████████████████████████████████████████| 50/50 [02:32<00:00,  3.04s/trial, best loss: -0.9408507128549547]
best: {'colsample_bytree': 0.6385285878314475, 'learning_rate': 0.16316252418744445, 'max_depth': 12.0, 'min_child_weight': 1.0}


In [None]:
# 모델링
xgb_model = XGBClassifier(n_estimators=200, learning_rate=round(best['learning_rate'], 5), 
                            max_depth=int(best['max_depth']), min_child_weight=int(best['min_child_weight']),
                            colsample_bytree=round(best['colsample_bytree'], 5)
                           )
xgb_model.fit(X_train, y_train)
pred= xgb_model.predict(X_test)
print('정확도 : ', accuracy_score(y_test, pred))
print('혼돈행렬 : \n', confusion_matrix(y_test, pred))

정확도 :  0.9452296819787986
혼돈행렬 : 
 [[1468    4   57]
 [   5  678   14]
 [  62   13  529]]


In [None]:
# Mars의 정확도
529/(62+13+529)*100

87.58278145695364

## 결측값 채우기
### 결측치 파일 불러오기

In [None]:
df = pd.read_excel('C:/Users/admin/Desktop/sparta/hp_na.xlsx')
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin1,Cabin2,Combi,Cabin3,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0064_02,,True,E,3.0,E3,S,E/3/S,TRAPPIST-1e,33.0,0.0,0.0,0.0,0.0,0.0,0.0,Colatz Keen,1.0
1,0119_01,,False,A,0.0,A0,P,A/0/P,TRAPPIST-1e,39.0,0.0,0.0,2344.0,0.0,65.0,6898.0,Batan Coning,0.0
2,0210_01,,True,D,6.0,D6,P,D/6/P,55 Cancri e,24.0,0.0,0.0,0.0,0.0,0.0,0.0,Arraid Inicont,1.0
3,0242_01,,False,F,46.0,F46,S,F/46/S,TRAPPIST-1e,18.0,0.0,313.0,1.0,691.0,283.0,0.0,Almone St챕,0.0
4,0251_01,,True,C,11.0,C11,S,C/11/S,55 Cancri e,54.0,0.0,0.0,0.0,0.0,0.0,0.0,Diphah Amsive,1.0


In [None]:
data = df.drop(['Transported', 'Name', 'Age', 'Cabin','Combi'], axis=1)

### 원-핫 인코딩, 형변환, 스케일링

In [None]:
# Cabin1
encode = pd.get_dummies(data['Cabin1'])
data.drop('Cabin1', axis=1)
data = data.join(encode)
# Destination
encode = pd.get_dummies(data['Destination'])
data.drop(['Destination'], axis=1)
data = data.join(encode)

In [None]:
data['Cabin3'] = data['Cabin3'].map({'P':0, 'S':1})
data['HomePlanet'] = data['HomePlanet'].map({'Earth':0, 'Europa':1, 'Mars':2})

In [None]:
data_scaled(data, col)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin1,Cabin2,Cabin3,Destination,VIP,RoomService,FoodCourt,...,B,C,D,E,F,G,T,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
0,0064_02,,True,E,-1.185692,1.0,TRAPPIST-1e,0.0,-0.458634,-0.256582,...,0,0,0,1,0,0,0,0,0,1
1,0119_01,,False,A,-1.191345,0.0,TRAPPIST-1e,0.0,-0.458634,1.610934,...,0,0,0,0,0,0,0,0,0,1
2,0210_01,,True,D,-1.180040,0.0,55 Cancri e,0.0,-0.458634,-0.256582,...,0,0,1,0,0,0,0,1,0,0
3,0242_01,,False,F,-1.104674,1.0,TRAPPIST-1e,0.0,0.237424,-0.255785,...,0,0,0,0,1,0,0,0,0,1
4,0251_01,,True,C,-1.170619,1.0,55 Cancri e,0.0,-0.458634,-0.256582,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274,8621_01,,False,E,-0.151302,0.0,TRAPPIST-1e,0.0,-0.449739,-0.256582,...,0,0,0,1,0,0,0,0,0,1
275,8678_01,,True,G,1.444561,1.0,55 Cancri e,0.0,-0.458634,-0.256582,...,0,0,0,0,0,1,0,1,0,0
276,8775_01,,True,D,-0.673207,0.0,TRAPPIST-1e,0.0,-0.458634,-0.256582,...,0,0,1,0,0,0,0,0,0,1
277,9025_01,,False,G,1.548189,1.0,TRAPPIST-1e,0.0,-0.458634,-0.256582,...,0,0,0,0,0,1,0,0,0,1


In [None]:
dt = data.drop(['PassengerId', 'HomePlanet','Destination', 'Cabin1'], axis = 1)
label = data['HomePlanet']

In [None]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279 entries, 0 to 278
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CryoSleep      279 non-null    bool   
 1   Cabin2         274 non-null    float64
 2   Cabin3         274 non-null    float64
 3   VIP            276 non-null    float64
 4   RoomService    275 non-null    float64
 5   FoodCourt      276 non-null    float64
 6   ShoppingMall   275 non-null    float64
 7   Spa            272 non-null    float64
 8   VRDeck         277 non-null    float64
 9   A              279 non-null    uint8  
 10  B              279 non-null    uint8  
 11  C              279 non-null    uint8  
 12  D              279 non-null    uint8  
 13  E              279 non-null    uint8  
 14  F              279 non-null    uint8  
 15  G              279 non-null    uint8  
 16  T              279 non-null    uint8  
 17  55 Cancri e    279 non-null    uint8  
 18  PSO J318.5

### 결측값 예측하기

In [None]:
df['predict_hp']= xgb_model.predict(dt)

In [None]:
df['predict_hp'] = df['predict_hp'].map({0:'Earth', 1:'Europa', 2:'Mars'})

In [None]:
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin1,Cabin2,Combi,Cabin3,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,predict_hp
0,0064_02,,True,E,3.0,E3,S,E/3/S,TRAPPIST-1e,33.0,0.0,0.0,0.0,0.0,0.0,0.0,Colatz Keen,1.0,Earth
1,0119_01,,False,A,0.0,A0,P,A/0/P,TRAPPIST-1e,39.0,0.0,0.0,2344.0,0.0,65.0,6898.0,Batan Coning,0.0,Europa
2,0210_01,,True,D,6.0,D6,P,D/6/P,55 Cancri e,24.0,0.0,0.0,0.0,0.0,0.0,0.0,Arraid Inicont,1.0,Europa
3,0242_01,,False,F,46.0,F46,S,F/46/S,TRAPPIST-1e,18.0,0.0,313.0,1.0,691.0,283.0,0.0,Almone St챕,0.0,Mars
4,0251_01,,True,C,11.0,C11,S,C/11/S,55 Cancri e,54.0,0.0,0.0,0.0,0.0,0.0,0.0,Diphah Amsive,1.0,Europa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274,8621_01,,False,E,552.0,E552,P,E/552/P,TRAPPIST-1e,19.0,0.0,4.0,0.0,1604.0,0.0,0.0,Vanley Simmonders,,Mars
275,8678_01,,True,G,1399.0,G1399,S,G/1399/S,55 Cancri e,9.0,0.0,0.0,0.0,0.0,0.0,0.0,Eilan Kellson,,Earth
276,8775_01,,True,D,275.0,D275,P,D/275/P,TRAPPIST-1e,40.0,0.0,0.0,0.0,0.0,0.0,0.0,Raston Maltorted,,Europa
277,9025_01,,False,G,1454.0,G1454,S,G/1454/S,TRAPPIST-1e,42.0,0.0,0.0,0.0,28.0,726.0,0.0,Ale Whitersone,,Earth


In [None]:
df.to_excel('C:/Users/admin/Desktop/sparta/hp_fill(cabin).xlsx')

## 복합 샘플링

In [None]:
# 샘플 편향
np.unique(hp_label, return_counts = True)

(array([0, 1, 2], dtype=int64), array([6054, 2857, 2406], dtype=int64))

In [None]:
# 복합 샘플링
X_samp, y_samp = SMOTEENN(random_state=109).fit_resample(hp_data, hp_label)

In [None]:
# 샘플링 완료
np.unique(y_samp, return_counts = True)

(array([0, 1, 2], dtype=int64), array([5237, 5740, 5109], dtype=int64))

### 세트 분리

In [None]:
X_train, X_test, y_train, y_test = train_test_split( X_samp, y_samp, random_state = 109 )

### XGBClassifier & hyperopt

In [None]:
# 실행을 위한 함수 정의
def objective_func(search_space):
    xgb_clf = XGBClassifier(n_estimators=100, max_depth=int(search_space['max_depth']),
                            min_child_weight=int(search_space['min_child_weight']),
                            learning_rate=search_space['learning_rate'],
                            colsample_bytree=search_space['colsample_bytree'], 
                            eval_metric='logloss')
    accuracy = cross_val_score(xgb_clf, X_train, y_train, scoring='accuracy', cv=3)
    return {'loss':-1 * np.mean(accuracy), 'status': STATUS_OK}


In [None]:
# 최적의 파라미터 찾기
trial_val = Trials()
best = fmin(fn=objective_func,
            space=xgb_search_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trial_val, rstate=np.random.default_rng(seed=9))
print('best:', best)

100%|███████████████████████████████████████████████| 50/50 [03:02<00:00,  3.65s/trial, best loss: -0.9915450927220193]
best: {'colsample_bytree': 0.7397666727597565, 'learning_rate': 0.176738407962117, 'max_depth': 11.0, 'min_child_weight': 1.0}


In [None]:
# 모델링
xgb_model = XGBClassifier(n_estimators=200, learning_rate=round(best['learning_rate'], 5), 
                            max_depth=int(best['max_depth']), min_child_weight=int(best['min_child_weight']),
                            colsample_bytree=round(best['colsample_bytree'], 5)
                           )
xgb_model.fit(X_train, y_train)
pred= xgb_model.predict(X_test)
print('정확도 : ', accuracy_score(y_test, pred))
print('혼돈행렬 : \n', confusion_matrix(y_test, pred))

정확도 :  0.9947787170561909
혼돈행렬 : 
 [[1322    0   13]
 [   1 1409    1]
 [   5    1 1270]]


In [None]:
# Mars의 정확도
1270/(5+1+1270)*100

99.52978056426332

### 결측값 예측 하기

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279 entries, 0 to 278
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   279 non-null    object 
 1   HomePlanet    0 non-null      float64
 2   CryoSleep     279 non-null    bool   
 3   Cabin1        274 non-null    object 
 4   Cabin2        274 non-null    float64
 5   Combi         274 non-null    object 
 6   Cabin3        274 non-null    object 
 7   Cabin         274 non-null    object 
 8   Destination   273 non-null    object 
 9   Age           274 non-null    float64
 10  VIP           276 non-null    float64
 11  RoomService   275 non-null    float64
 12  FoodCourt     276 non-null    float64
 13  ShoppingMall  275 non-null    float64
 14  Spa           272 non-null    float64
 15  VRDeck        277 non-null    float64
 16  Name          269 non-null    object 
 17  Transported   192 non-null    float64
dtypes: bool(1), float64(10), objec

In [None]:
# 예측값
df['sampling_pred_hp'] = xgb_model.predict(dt)
# 원본 형태로 되돌리기
df['sampling_pred_hp'] = df['sampling_pred_hp'].map({0:'Earth', 1:'Europa', 2:'Mars'})

In [None]:
np.unique(df['sampling_pred_hp'], return_counts = True)

(array(['Earth', 'Europa', 'Mars'], dtype=object),
 array([140,  67,  72], dtype=int64))

In [None]:
#엑셀로 저장하기
df.to_excel('C:/Users/admin/Desktop/sparta/hp_fill(sampling).xlsx')

# Destination 결측치 판단

In [None]:
# 다른값 결측치 채운 데이터프레임 불러오기
df_dt = pd.read_excel('C:/Users/admin/Desktop/sparta/train_test_filled_hp_cs_vp.xlsx')
df_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12970 non-null  object 
 2   CryoSleep     12970 non-null  bool   
 3   Cabin1        12804 non-null  object 
 4   Cabin2        12804 non-null  float64
 5   Combi         12804 non-null  object 
 6   Cabin3        12804 non-null  object 
 7   Cabin         12804 non-null  object 
 8   Destination   12704 non-null  object 
 9   Age           12700 non-null  float64
 10  VIP           12970 non-null  bool   
 11  RoomService   12776 non-null  float64
 12  FoodCourt     12752 non-null  float64
 13  ShoppingMall  12760 non-null  float64
 14  Spa           12754 non-null  float64
 15  VRDeck        12766 non-null  float64
 16  Name          12676 non-null  object 
 17  Transported   8693 non-null   float64
dtypes: bool(2), float64(8), ob

In [None]:
df_dt.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin1,Cabin2,Combi,Cabin3,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B,0.0,B0,P,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0.0
1,0002_01,Earth,False,F,0.0,F0,S,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1.0
2,0003_01,Europa,False,A,0.0,A0,S,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0.0
3,0003_02,Europa,False,A,0.0,A0,S,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0.0
4,0004_01,Earth,False,F,1.0,F1,S,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1.0


In [None]:
df_dt[df_dt['CryoSleep'].isnull()] = 0.0

In [None]:
df_dt.isnull().sum()

PassengerId        0
HomePlanet         0
CryoSleep          0
Cabin1           166
Cabin2           166
Combi            166
Cabin3           166
Cabin            166
Destination      266
Age              270
VIP                0
RoomService      194
FoodCourt        218
ShoppingMall     210
Spa              216
VRDeck           204
Name             294
Transported     4277
dtype: int64

In [None]:
df_dt = df_dt.drop(['PassengerId', 'Name','Transported','Cabin','Combi', 'Age'], axis =1)
df_dt.info() # 채울 셋

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    12970 non-null  object 
 1   CryoSleep     12970 non-null  object 
 2   Cabin1        12804 non-null  object 
 3   Cabin2        12804 non-null  float64
 4   Cabin3        12804 non-null  object 
 5   Destination   12704 non-null  object 
 6   VIP           12970 non-null  object 
 7   RoomService   12776 non-null  float64
 8   FoodCourt     12752 non-null  float64
 9   ShoppingMall  12760 non-null  float64
 10  Spa           12754 non-null  float64
 11  VRDeck        12766 non-null  float64
dtypes: float64(6), object(6)
memory usage: 1.2+ MB


In [None]:
x= df_dt.dropna(axis=0)
x.info() # train용 데이터 완성

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11565 entries, 0 to 12969
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    11565 non-null  object 
 1   CryoSleep     11565 non-null  object 
 2   Cabin1        11565 non-null  object 
 3   Cabin2        11565 non-null  float64
 4   Cabin3        11565 non-null  object 
 5   Destination   11565 non-null  object 
 6   VIP           11565 non-null  object 
 7   RoomService   11565 non-null  float64
 8   FoodCourt     11565 non-null  float64
 9   ShoppingMall  11565 non-null  float64
 10  Spa           11565 non-null  float64
 11  VRDeck        11565 non-null  float64
dtypes: float64(6), object(6)
memory usage: 1.1+ MB


## 원-핫 인코딩

In [None]:
# cabin1, homeplanet
encode = pd.get_dummies(x['Cabin1'])
x = x.drop('Cabin1', axis =1)
x = x.join(encode)
encode = pd.get_dummies(x['HomePlanet'])
x = x.drop('HomePlanet', axis =1)
x = x.join(encode)

In [None]:
x['CryoSleep'].value_counts()

False    7283
True     4282
Name: CryoSleep, dtype: int64

In [None]:
8070 / (8070+2417+1078)

0.6977950713359273

In [None]:
x['Destination'].value_counts()

TRAPPIST-1e      8070
55 Cancri e      2417
PSO J318.5-22    1078
Name: Destination, dtype: int64

In [None]:
x['Cabin3'].replace({'P': True,'S': False}, inplace=True)
x['Destination'] = x['Destination'].map({
    'TRAPPIST-1e':0, '55 Cancri e':1, 'PSO J318.5-22':2})

In [None]:
# boolean으로 변환
x['CryoSleep'] = x['CryoSleep'].astype(bool)
x['VIP'] = x['VIP'].astype(bool)
x['Cabin3'] = x['Cabin3'].astype(bool)
x['Destination'] = x['Destination'].astype(int)

In [None]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11565 entries, 0 to 12969
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CryoSleep     11565 non-null  bool   
 1   Cabin2        11565 non-null  float64
 2   Cabin3        11565 non-null  bool   
 3   Destination   11565 non-null  int32  
 4   VIP           11565 non-null  bool   
 5   RoomService   11565 non-null  float64
 6   FoodCourt     11565 non-null  float64
 7   ShoppingMall  11565 non-null  float64
 8   Spa           11565 non-null  float64
 9   VRDeck        11565 non-null  float64
 10  A             11565 non-null  uint8  
 11  B             11565 non-null  uint8  
 12  C             11565 non-null  uint8  
 13  D             11565 non-null  uint8  
 14  E             11565 non-null  uint8  
 15  F             11565 non-null  uint8  
 16  G             11565 non-null  uint8  
 17  T             11565 non-null  uint8  
 18  Earth         11565 non-nu

In [None]:
x['Destination'].value_counts()

0    8070
1    2417
2    1078
Name: Destination, dtype: int64

## 스케일링

In [None]:
col = ['Cabin2', 'RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

data_scaled(x, col)

Unnamed: 0,CryoSleep,Cabin2,Cabin3,Destination,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,B,C,D,E,F,G,T,Earth,Europa,Mars
0,False,-1.172239,True,0,False,-0.336424,-0.280619,-0.286750,-0.270668,-0.260993,...,1,0,0,0,0,0,0,0,1,0
1,False,-1.172239,False,0,False,-0.167339,-0.275001,-0.244668,0.216287,-0.221524,...,0,0,0,0,1,0,0,1,0,0
2,False,-1.172239,False,0,True,-0.269721,1.951716,-0.286750,5.685434,-0.217039,...,0,0,0,0,0,0,0,0,1,0
3,False,-1.172239,False,0,False,-0.336424,0.520300,0.337736,2.682104,-0.087867,...,0,0,0,0,0,0,0,0,1,0
4,False,-1.170293,False,0,False,0.133602,-0.236921,-0.032579,0.230478,-0.259199,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12963,False,-0.631383,False,0,False,-0.263516,-0.280619,6.195440,-0.270668,-0.260993,...,0,0,1,0,0,0,0,0,0,1
12964,False,2.321923,False,0,False,-0.336424,0.259361,-0.286750,-0.268007,-0.260993,...,0,0,0,0,1,0,0,1,0,0
12965,True,1.738266,False,0,False,-0.336424,-0.280619,-0.286750,-0.270668,-0.260993,...,0,0,0,0,0,1,0,1,0,0
12967,True,-0.596364,True,1,False,-0.336424,-0.280619,-0.286750,-0.270668,-0.260993,...,0,0,1,0,0,0,0,0,0,1


## 복합 샘플링

In [None]:
x_tr = x.drop('Destination', axis =1) 
x_lb = x['Destination']

In [None]:
# 복합 샘플링
X_sample, y_sample = SMOTEENN(random_state =109).fit_resample(x_tr, x_lb)

In [None]:
y_sample.value_counts() # 아직 편향이 좀 있지만, 나아졌네

2    4634
1    3841
0    2406
Name: Destination, dtype: int64

## 세트분리

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, random_state=109)

## 모델링

In [None]:
# hyperopt 적용을 위한 설정 값 담기
xgb_search_space = {'max_depth': hp.quniform('max_depth', 5, 20, 1),
                    'min_child_weight': hp.quniform('min_child_weight', 1, 2, 1),
                    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
                    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 0.95)
               }

In [None]:
# 실행을 위한 함수 정의
def objective_func(search_space):
    xgb_clf = XGBClassifier(n_estimators=100, max_depth=int(search_space['max_depth']),
                            min_child_weight=int(search_space['min_child_weight']),
                            learning_rate=search_space['learning_rate'],
                            colsample_bytree=search_space['colsample_bytree'], 
                            eval_metric='logloss')
    accuracy = cross_val_score(xgb_clf, X_train, y_train, scoring='accuracy', cv=3)
    return {'loss':-1 * np.mean(accuracy), 'status': STATUS_OK}

In [None]:
# 최적의 파라미터 찾기
trial_val = Trials()
best = fmin(fn=objective_func,
            space=xgb_search_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trial_val, rstate=np.random.default_rng(seed=9))
print('best:', best)

100%|███████████████████████████████████████████████| 50/50 [03:51<00:00,  4.63s/trial, best loss: -0.9030637254901962]
best: {'colsample_bytree': 0.8239686401837011, 'learning_rate': 0.18147134837616583, 'max_depth': 17.0, 'min_child_weight': 1.0}


In [None]:
# 모델링
xgb_model = XGBClassifier(n_estimators=200, learning_rate=round(best['learning_rate'], 5), 
                            max_depth=int(best['max_depth']), min_child_weight=int(best['min_child_weight']),
                            colsample_bytree=round(best['colsample_bytree'], 5)
                           )
xgb_model.fit(X_train, y_train)
pred= xgb_model.predict(X_test)
print('정확도 : ', accuracy_score(y_test, pred))
print('혼돈행렬 : \n', confusion_matrix(y_test, pred))

정확도 :  0.9209849320102903
혼돈행렬 : 
 [[ 469   53   56]
 [  38  891   43]
 [  15   10 1146]]


## 예측값 구하기
### 머신용 데이터 정리

In [None]:
df_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    12970 non-null  object 
 1   CryoSleep     12970 non-null  object 
 2   Cabin1        12804 non-null  object 
 3   Cabin2        12804 non-null  float64
 4   Cabin3        12804 non-null  object 
 5   Destination   12704 non-null  object 
 6   VIP           12970 non-null  object 
 7   RoomService   12776 non-null  float64
 8   FoodCourt     12752 non-null  float64
 9   ShoppingMall  12760 non-null  float64
 10  Spa           12754 non-null  float64
 11  VRDeck        12766 non-null  float64
dtypes: float64(6), object(6)
memory usage: 1.2+ MB


In [None]:
y = df_dt[df_dt['Destination'].isnull()]
y.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 47 to 12968
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    266 non-null    object 
 1   CryoSleep     266 non-null    object 
 2   Cabin1        264 non-null    object 
 3   Cabin2        264 non-null    float64
 4   Cabin3        264 non-null    object 
 5   Destination   0 non-null      object 
 6   VIP           266 non-null    object 
 7   RoomService   264 non-null    float64
 8   FoodCourt     262 non-null    float64
 9   ShoppingMall  260 non-null    float64
 10  Spa           261 non-null    float64
 11  VRDeck        259 non-null    float64
dtypes: float64(6), object(6)
memory usage: 27.0+ KB


In [None]:
y_te = y.drop('Destination', axis = 1)

In [None]:
y_te.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 47 to 12968
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    266 non-null    object 
 1   CryoSleep     266 non-null    object 
 2   Cabin1        264 non-null    object 
 3   Cabin2        264 non-null    float64
 4   Cabin3        264 non-null    object 
 5   VIP           266 non-null    object 
 6   RoomService   264 non-null    float64
 7   FoodCourt     262 non-null    float64
 8   ShoppingMall  260 non-null    float64
 9   Spa           261 non-null    float64
 10  VRDeck        259 non-null    float64
dtypes: float64(6), object(5)
memory usage: 24.9+ KB


In [None]:
y_te.head()

Unnamed: 0,CryoSleep,Cabin2,Cabin3,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,A,B,C,D,E,F,G,Earth,Europa,Mars
47,True,-1.178292,True,False,-0.410815,-0.353903,-0.422433,-0.269849,-0.273905,0,0,0,0,0,1,0,0,0,1
128,False,-1.188073,True,False,-0.410815,-0.33037,-0.422433,0.192184,-0.046715,0,0,0,0,1,0,0,1,0,0
139,False,-1.135255,True,False,-0.410815,-0.353903,-0.422433,-0.269849,0.392299,0,0,0,0,0,1,0,1,0,0
347,False,-1.072656,True,False,0.306721,-0.353903,-0.422433,-0.266572,0.129988,0,0,0,0,0,0,1,1,0,0
430,True,-1.066787,True,False,-0.410815,-0.353903,-0.422433,-0.269849,-0.273905,0,0,0,0,0,0,1,1,0,0


In [None]:
# 원-핫 인코딩
# cabin1, homeplanet
encode = pd.get_dummies(df_dt['Cabin1'])
df_dt = df_dt.drop('Cabin1', axis =1)
df_dt = df_dt.join(encode)
encode = pd.get_dummies(df_dt['HomePlanet'])
df_dt = df_dt.drop('HomePlanet', axis =1)
df_dt = df_dt.join(encode)

In [None]:
# boolean으로 변환
df_dt['CryoSleep'] = df_dt['CryoSleep'].astype(bool)
df_dt['VIP'] = df_dt['VIP'].astype(bool)
df_dt['Cabin3'] = df_dt['Cabin3'].astype(bool)

In [None]:
# 스케일링
data_scaled(df_dt, col)

Unnamed: 0,CryoSleep,Cabin2,Cabin3,Destination,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,B,C,D,E,F,G,T,Earth,Europa,Mars
0,False,-1.174864,True,TRAPPIST-1e,False,-0.343152,-0.284403,-0.294957,-0.272138,-0.259273,...,1,0,0,0,0,0,0,0,1,0
1,False,-1.174864,True,TRAPPIST-1e,False,-0.174435,-0.278708,-0.252478,0.214786,-0.221901,...,0,0,0,0,1,0,0,1,0,0
2,False,-1.174864,True,TRAPPIST-1e,True,-0.276594,1.978442,-0.294957,5.683589,-0.217654,...,0,0,0,0,0,0,0,0,1,0
3,False,-1.174864,True,TRAPPIST-1e,False,-0.343152,0.527462,0.335431,2.680448,-0.095344,...,0,0,0,0,0,0,0,0,1,0
4,False,-1.172914,True,TRAPPIST-1e,False,0.125850,-0.240108,-0.038384,0.228976,-0.257575,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,True,1.741991,True,TRAPPIST-1e,False,-0.343152,-0.284403,-0.294957,-0.272138,-0.259273,...,0,0,0,0,0,1,0,1,0,0
12966,False,,True,TRAPPIST-1e,False,-0.343152,0.251567,-0.266071,-0.263269,-0.136963,...,0,0,0,0,0,0,0,1,0,0
12967,True,-0.597732,True,55 Cancri e,False,-0.343152,-0.284403,-0.294957,-0.272138,-0.259273,...,0,0,1,0,0,0,0,0,0,1
12968,False,-0.595783,True,,False,-0.343152,1.411465,-0.294957,-0.272138,0.184950,...,0,0,1,0,0,0,0,0,1,0


In [None]:
# 머신 돌릴 데이터
y_te = df_dt[df_dt['Destination'].isnull()]
y_te.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 47 to 12968
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CryoSleep     266 non-null    bool   
 1   Cabin2        264 non-null    float64
 2   Cabin3        266 non-null    bool   
 3   Destination   0 non-null      object 
 4   VIP           266 non-null    bool   
 5   RoomService   264 non-null    float64
 6   FoodCourt     262 non-null    float64
 7   ShoppingMall  260 non-null    float64
 8   Spa           261 non-null    float64
 9   VRDeck        259 non-null    float64
 10  A             266 non-null    uint8  
 11  B             266 non-null    uint8  
 12  C             266 non-null    uint8  
 13  D             266 non-null    uint8  
 14  E             266 non-null    uint8  
 15  F             266 non-null    uint8  
 16  G             266 non-null    uint8  
 17  T             266 non-null    uint8  
 18  Earth         266 non-null 

In [None]:
# label값 제외
y_te = y_te.drop('Destination', axis =1)

### 예측값 넣을 df 정의

In [None]:
data = pd.read_excel('C:/Users/admin/Desktop/sparta/train_test_filled_hp_cs_vp.xlsx')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12970 non-null  object 
 2   CryoSleep     12970 non-null  bool   
 3   Cabin1        12804 non-null  object 
 4   Cabin2        12804 non-null  float64
 5   Combi         12804 non-null  object 
 6   Cabin3        12804 non-null  object 
 7   Cabin         12804 non-null  object 
 8   Destination   12704 non-null  object 
 9   Age           12700 non-null  float64
 10  VIP           12970 non-null  bool   
 11  RoomService   12776 non-null  float64
 12  FoodCourt     12752 non-null  float64
 13  ShoppingMall  12760 non-null  float64
 14  Spa           12754 non-null  float64
 15  VRDeck        12766 non-null  float64
 16  Name          12676 non-null  object 
 17  Transported   8693 non-null   float64
dtypes: bool(2), float64(8), ob

In [None]:
q = data[data['Destination'].isna()]
q.head() # Destination 값 넣을 df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin1,Cabin2,Combi,Cabin3,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
47,0045_02,Mars,True,F,10.0,F10,P,F/10/P,,19.0,False,0.0,0.0,0.0,0.0,0.0,Mass Chmad,1.0
128,0138_02,Earth,False,E,5.0,E5,P,E/5/P,,34.0,False,0.0,22.0,0.0,564.0,207.0,Monah Gambs,0.0
139,0152_01,Earth,False,F,32.0,F32,P,F/32/P,,41.0,False,0.0,0.0,0.0,0.0,607.0,Andan Estron,0.0
347,0382_01,Earth,False,G,64.0,G64,P,G/64/P,,23.0,False,348.0,0.0,0.0,4.0,368.0,Blanie Floydendley,0.0
430,0462_01,Earth,True,G,67.0,G67,S,G/67/S,,50.0,False,0.0,0.0,0.0,0.0,0.0,Ronia Sosanturney,0.0


In [None]:
q.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 47 to 12968
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   266 non-null    object 
 1   HomePlanet    266 non-null    object 
 2   CryoSleep     266 non-null    bool   
 3   Cabin1        264 non-null    object 
 4   Cabin2        264 non-null    float64
 5   Combi         264 non-null    object 
 6   Cabin3        264 non-null    object 
 7   Cabin         264 non-null    object 
 8   Destination   0 non-null      object 
 9   Age           258 non-null    float64
 10  VIP           266 non-null    bool   
 11  RoomService   264 non-null    float64
 12  FoodCourt     262 non-null    float64
 13  ShoppingMall  260 non-null    float64
 14  Spa           261 non-null    float64
 15  VRDeck        259 non-null    float64
 16  Name          263 non-null    object 
 17  Transported   174 non-null    float64
dtypes: bool(2), float64(8), obj

### 예측값 내보내기

In [None]:
q['predict_dt']= xgb_model.predict(y_te)

In [None]:
q['predict_dt'].value_counts()

0    122
2     76
1     68
Name: predict_dt, dtype: int64

In [None]:
# 'TRAPPIST-1e':0, '55 Cancri e':1, 'PSO J318.5-22':2 로 변환시킨거 되돌리기
q['predict_dt'] = q['predict_dt'].map({0:'TRAPPIST-1e', 1:'55 Cancri e', 2:'PSO J318.5-22' })

In [None]:
q['predict_dt'].value_counts() # 변경 확인

TRAPPIST-1e      122
PSO J318.5-22     76
55 Cancri e       68
Name: predict_dt, dtype: int64

In [None]:
# 예측한 값 엑셀로 내보내기
q.to_excel('C:/Users/admin/Desktop/sparta/Destination_fillna.xlsx')