### 목표
- 2912년 우주항해를 하던 타이타닉호의 승무원,승선인원들을 찾아라!
- 운행중에 우주 먼지를 만나서 시스템 오류로 강제 전송됨
- 누가 전송되었는지 찾아서 복귀를 시켜야한다
- 분류문제=>classify

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
space_model = DecisionTreeClassifier()

In [3]:
train = pd.read_csv("./data/space/train.csv")
test = pd.read_csv("./data/space/test.csv")

#### 모델링 실습
1. 데이터 탐색
    - 전체 데이터 수 확인, 컬럼 수 확인
    - 컬럼 종류 확인(범주형, 수치형)
    - 결측치 처리
    - 인코딩 처리
2. 모델링
    - 모델 정의
    - 모델 학습
        1. 학습에 사용할 컬럼 선택
        2. 문제와 정답으로 분리
        3. 훈련데이터 검증데이터로 분리
    - 모델 예측
        - 검증 데이터 예측
    - 모델 평가
        - 검증데이터 평가(정확도)
    - 모델 최적화(하이퍼파라미터 튜닝)
3. 테스트 데이터 예측 후 kaggle 사이트에 업로드

In [4]:
# 데이터 탐색
# 전체 데이터 수 확인, 컬럼 수 확인
print('훈련용 데이터',train.shape)  
print('평가용 데이터',test.shape)   
train.columns

훈련용 데이터 (8693, 14)
평가용 데이터 (4277, 13)


Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [5]:
train.describe(include='all')

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
count,8693,8492,8476,8494,8511,8514.0,8490,8512.0,8510.0,8485.0,8510.0,8505.0,8493,8693
unique,8693,3,2,6560,3,,2,,,,,,8473,2
top,0001_01,Earth,False,G/734/S,TRAPPIST-1e,,False,,,,,,Gollux Reedall,True
freq,1,4602,5439,8,5915,,8291,,,,,,2,4378
mean,,,,,,28.82793,,224.687617,458.077203,173.729169,311.138778,304.854791,,
std,,,,,,14.489021,,666.717663,1611.48924,604.696458,1136.705535,1145.717189,,
min,,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,,
25%,,,,,,19.0,,0.0,0.0,0.0,0.0,0.0,,
50%,,,,,,27.0,,0.0,0.0,0.0,0.0,0.0,,
75%,,,,,,38.0,,47.0,76.0,27.0,59.0,46.0,,


In [6]:
# 범주형 - PassengerId,HomePlanet,CryoSleep,Cabin,Destination,VIP,Name 
# 수치형 - Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck

In [7]:
# 결측치 처리
# train 결측치 x 컬럼 : PassengerId
# test 결측치 x 컬럼 : PassengerId 

In [8]:
# 결측치 처리
# train 결측치 x 컬럼 : PassengerId
# test 결측치 x 컬럼 : PassengerId 

In [9]:
# 기술통계확인 - 범주형
train[['HomePlanet','CryoSleep','Cabin',
       'Destination','VIP','Name' ]].describe()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,VIP,Name
count,8492,8476,8494,8511,8490,8493
unique,3,2,6560,3,2,8473
top,Earth,False,G/734/S,TRAPPIST-1e,False,Gollux Reedall
freq,4602,5439,8,5915,8291,2


In [10]:
train['HomePlanet'].fillna('Earth',inplace=True)
train['VIP'].fillna(False,inplace=True)
test['HomePlanet'].fillna('Earth',inplace=True)
test['VIP'].fillna(False,inplace=True)

In [11]:
# 기술통계확인 - 수치형
train[['Age','RoomService','FoodCourt',
       'ShoppingMall','Spa','VRDeck']].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [12]:
train['Spa'].fillna(312,inplace=True)
test['Spa'].fillna(312,inplace=True)
train['ShoppingMall'].fillna(174,inplace=True)
test['ShoppingMall'].fillna(174,inplace=True)
train['RoomService'].fillna(225,inplace=True)
test['RoomService'].fillna(225,inplace=True)
train['FoodCourt'].fillna(459,inplace=True)
test['FoodCourt'].fillna(459,inplace=True)
train['VRDeck'].fillna(305,inplace=True)
test['VRDeck'].fillna(305,inplace=True)

In [13]:
train['Age'].fillna(27,inplace=True)

test['Age'].fillna(27,inplace=True)


In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   bool   
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(2), float64(6), object(6)
memory usage: 832.1+ KB


In [15]:
# 결측치 처리
# cryosleep 문자 => boolean타입이라 인코딩할 필요 없음

In [16]:
train['CryoSleep'].fillna(False,inplace=True)

In [17]:
test['CryoSleep'].fillna(False,inplace=True)

In [18]:
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [19]:
train['Destination'].fillna('TRAPPIST-1e',inplace=True)
test['Destination'].fillna('TRAPPIST-1e',inplace=True)

In [20]:
type_onehot=pd.get_dummies(train['Destination'])
type_onehot
type_onehot_test=pd.get_dummies(test['Destination'])
type_onehot_test

Unnamed: 0,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
0,0,0,1
1,0,0,1
2,1,0,0
3,0,0,1
4,0,0,1
...,...,...,...
4272,0,0,1
4273,0,0,1
4274,1,0,0
4275,0,0,1


In [21]:
train = pd.concat([train,type_onehot], axis=1)
train
test= pd.concat([test,type_onehot_test], axis=1)
test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,0,0,1
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,0,0,1
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,1,0,0
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,0,0,1
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,0,0,1
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,0,0,1
4274,9271_01,Mars,True,D/296/P,55 Cancri e,27.0,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,1,0,0
4275,9273_01,Europa,False,D/297/P,TRAPPIST-1e,27.0,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,0,0,1


In [22]:
#  /기준으로 슬라이싱한 후 .A~G 라벨 인코딩 하고 결측치는 3 넣기
train['deck']=train['Cabin'].str[0]
train
test['deck']=test['Cabin'].str[0]
test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,deck
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,0,0,1,G
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,0,0,1,F
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,1,0,0,C
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,0,0,1,C
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,0,0,1,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,0,0,1,G
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,0,0,1,
4274,9271_01,Mars,True,D/296/P,55 Cancri e,27.0,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,1,0,0,D
4275,9273_01,Europa,False,D/297/P,TRAPPIST-1e,27.0,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,0,0,1,D


In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    8693 non-null   object 
 1   HomePlanet     8693 non-null   object 
 2   CryoSleep      8693 non-null   bool   
 3   Cabin          8494 non-null   object 
 4   Destination    8693 non-null   object 
 5   Age            8693 non-null   float64
 6   VIP            8693 non-null   bool   
 7   RoomService    8693 non-null   float64
 8   FoodCourt      8693 non-null   float64
 9   ShoppingMall   8693 non-null   float64
 10  Spa            8693 non-null   float64
 11  VRDeck         8693 non-null   float64
 12  Name           8493 non-null   object 
 13  Transported    8693 non-null   bool   
 14  55 Cancri e    8693 non-null   uint8  
 15  PSO J318.5-22  8693 non-null   uint8  
 16  TRAPPIST-1e    8693 non-null   uint8  
 17  deck           8494 non-null   object 
dtypes: bool(

In [24]:
train['deck'].fillna('F',inplace=True)
test['deck'].fillna('F',inplace=True)

method_dict = {
    'A':0,
    'B':1,
    'C':2,
    'D':3,
    'E':4,
    'F':5,
    'G':6,
    'T':7
}
deck_label=train['deck'].map(method_dict)
deck_label_test=test['deck'].map(method_dict)

deck_label

0       1
1       5
2       0
3       0
4       5
       ..
8688    0
8689    6
8690    6
8691    4
8692    4
Name: deck, Length: 8693, dtype: int64

In [25]:
train['deck_middle']=train.Cabin.str.split('/').str[1]
train['deck_middle']
test['deck_middle']=train.Cabin.str.split('/').str[1]
test['deck_middle']
train['deck_middle'].fillna(427,inplace=True)
test['deck_middle'].fillna(427,inplace=True)

In [26]:
train['deck_middle'].median()


427.0

In [27]:
train[train['deck'].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,deck,deck_middle


In [28]:
train['deck'] = deck_label
train
test['deck'] = deck_label_test
test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,deck,deck_middle
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,0,0,1,6,0
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,0,0,1,5,0
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,1,0,0,2,0
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,0,0,1,2,0
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,0,0,1,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,0,0,1,6,298
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,0,0,1,5,853
4274,9271_01,Mars,True,D/296/P,55 Cancri e,27.0,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,1,0,0,3,937
4275,9273_01,Europa,False,D/297/P,TRAPPIST-1e,27.0,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,0,0,1,3,143


In [29]:
train['deck_num']=train['Cabin'].str[-1]
train
test['deck_num']=test['Cabin'].str[-1]
test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,deck,deck_middle,deck_num
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,0,0,1,6,0,S
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,0,0,1,5,0,S
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,1,0,0,2,0,S
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,0,0,1,2,0,S
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,0,0,1,5,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,0,0,1,6,298,S
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,0,0,1,5,853,
4274,9271_01,Mars,True,D/296/P,55 Cancri e,27.0,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,1,0,0,3,937,P
4275,9273_01,Europa,False,D/297/P,TRAPPIST-1e,27.0,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,0,0,1,3,143,P


In [30]:
train['deck_num'].describe()

count     8494
unique       2
top          S
freq      4288
Name: deck_num, dtype: object

In [31]:
train['deck_num'].fillna('S',inplace=True)
test['deck_num'].fillna('S',inplace=True)

m_dict = {
    'S':0,
    'P':1
}
deck_num=train['deck_num'].map(m_dict)
deck_num
deck_num_test=test['deck_num'].map(m_dict)



In [32]:
train['deck_num'] = deck_num
train
test['deck_num'] = deck_num_test
test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,deck,deck_middle,deck_num
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,0,0,1,6,0,0
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,0,0,1,5,0,0
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,1,0,0,2,0,0
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,0,0,1,2,0,0
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,0,0,1,5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,0,0,1,6,298,0
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,0,0,1,5,853,0
4274,9271_01,Mars,True,D/296/P,55 Cancri e,27.0,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,1,0,0,3,937,1
4275,9273_01,Europa,False,D/297/P,TRAPPIST-1e,27.0,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,0,0,1,3,143,1


In [33]:
train[train['FoodCourt'].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,deck,deck_middle,deck_num


In [34]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    8693 non-null   object 
 1   HomePlanet     8693 non-null   object 
 2   CryoSleep      8693 non-null   bool   
 3   Cabin          8494 non-null   object 
 4   Destination    8693 non-null   object 
 5   Age            8693 non-null   float64
 6   VIP            8693 non-null   bool   
 7   RoomService    8693 non-null   float64
 8   FoodCourt      8693 non-null   float64
 9   ShoppingMall   8693 non-null   float64
 10  Spa            8693 non-null   float64
 11  VRDeck         8693 non-null   float64
 12  Name           8493 non-null   object 
 13  Transported    8693 non-null   bool   
 14  55 Cancri e    8693 non-null   uint8  
 15  PSO J318.5-22  8693 non-null   uint8  
 16  TRAPPIST-1e    8693 non-null   uint8  
 17  deck           8693 non-null   int64  
 18  deck_mid

In [35]:
train['FoodCourt'].astype('int32')
train['ShoppingMall'].astype('int32')
train['VRDeck'].astype('int32')
train['Spa'].astype('int32')
train['deck_num'].astype('int32')
train['deck'].astype('int32')
train['RoomService'].astype('int32')
train['deck_middle'].astype('int32')

0          0
1          0
2          0
3          0
4          1
        ... 
8688      98
8689    1499
8690    1500
8691     608
8692     608
Name: deck_middle, Length: 8693, dtype: int32

In [36]:
a=train['PassengerId'].str[:4].value_counts()
ps=[]
for i in train['PassengerId']:
    if i[:4] in a.index:
        ps.append(a.loc[i[:4]])

In [49]:
train['PassengerId'].str[:4].value_counts()

4498    8
8168    8
8728    8
8796    8
8956    8
       ..
3483    1
3480    1
3478    1
3473    1
4620    1
Name: PassengerId, Length: 6217, dtype: int64

In [37]:
train['Passenger']=ps

In [38]:
a=test['PassengerId'].str[:4].value_counts()
ps=[]
for i in test['PassengerId']:
    if i[:4] in a.index:
        ps.append(a.loc[i[:4]])
test['Passenger']=ps

In [39]:
train['Passenger'].describe()

count    8693.000000
mean        2.035546
std         1.596347
min         1.000000
25%         1.000000
50%         1.000000
75%         3.000000
max         8.000000
Name: Passenger, dtype: float64

In [40]:
train['Passenger'].fillna(2,inplace=True)
test['Passenger'].fillna(2,inplace=True)

In [41]:
X_train= train[['deck_middle','TRAPPIST-1e','deck_num','ShoppingMall','deck','CryoSleep','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']]
y_train= train['Transported']
X_train1,X_val,y_train1,y_val=train_test_split(X_train,y_train,test_size=0.3, random_state=13)                                                                                                                       
space_model.fit(X_train1,y_train1)
pre= space_model.predict(X_val)
pre
score = accuracy_score(y_val,pre)
print("정확도 : ",score)

정확도 :  0.7469325153374233


In [42]:
# 캐글 업로드
X_test= test[['deck_middle','TRAPPIST-1e','deck_num','ShoppingMall','deck','CryoSleep','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']]     


In [43]:
test_pre=space_model.predict(X_test)
test_pre

array([ True, False,  True, ...,  True, False, False])

In [44]:
submission = pd.read_csv("./data/space/sample_submission.csv")
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False


In [45]:
submission['Transported'] = test_pre
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,True
4274,9271_01,True
4275,9273_01,False


In [46]:
# csv파일로 저장 index=False -> 이걸 안하면 인덱스도 컬럼으로 저장된다
submission.to_csv("./data/space/myPrediction.csv", index=False)