In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from catboost import CatBoostClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv').set_index('PassengerId')

In [3]:
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [4]:
def basic_prep(df):
    df[['deck', 'number', 'side']] = pd.DataFrame(
        df.Cabin.apply(lambda x: str(x).split('/')).tolist(),
        index= df.index,
        columns=['deck', 'number', 'side'],
    )
    df.number = df.number.astype(float)
    df['total_spend'] = df[spend_cols].sum(axis=1)
    df[["CryoSleep", "VIP"]] = df[["CryoSleep", "VIP"]].replace({True: 1, False: 0}).astype(str)
    return df

In [5]:
def fill_missing_basic(df, float_cols, object_cols):
    df[spend_cols] = df[spend_cols].fillna(df[spend_cols].mean())
    df[float_cols] = df[float_cols].fillna(df[float_cols].mean().to_dict())
    object_dict = {k:v[0] for k, v in df[object_cols].mode().to_dict().items()}
    df[object_cols] = df[object_cols].fillna(object_dict)
    return df

In [6]:
train = basic_prep(train)
train["Transported"] = train["Transported"].replace({True: 1, False: 0})
test = basic_prep(test)

In [7]:
float_cols = list(train.dtypes[train.dtypes==float].index)
object_cols = list(train.dtypes[train.dtypes==object].index)

In [8]:
float_cols

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'number',
 'total_spend']

In [9]:
object_cols

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Cabin',
 'Destination',
 'VIP',
 'Name',
 'deck',
 'side']

In [10]:
drop_cols = ['Name', 'PassengerId', 'Cabin']
# change_cols = ['VIP', 'CryoSleep']
for i in drop_cols:
    object_cols.remove(i)
# for i in change_cols:
#     object_cols.append(i)
#     float_cols.remove(i)

In [11]:
train = fill_missing_basic(train, float_cols, object_cols)

test = fill_missing_basic(test, float_cols, object_cols)

In [12]:
object_cols

['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'deck', 'side']

In [13]:
train.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
deck              0
number            0
side              0
total_spend       0
dtype: int64

In [14]:
cat_inds = [list(train[object_cols+float_cols]).index(col) for col in object_cols]

In [15]:
train[object_cols].isnull().sum()

HomePlanet     0
CryoSleep      0
Destination    0
VIP            0
deck           0
side           0
dtype: int64

In [16]:
model = CatBoostClassifier()
model.fit(
    np.array(train[object_cols+float_cols]),
    np.array(train.Transported),
    cat_features=cat_inds,
)

Learning rate set to 0.025939
0:	learn: 0.6806432	total: 87.3ms	remaining: 1m 27s
1:	learn: 0.6693036	total: 103ms	remaining: 51.2s
2:	learn: 0.6581168	total: 117ms	remaining: 38.9s
3:	learn: 0.6474228	total: 130ms	remaining: 32.5s
4:	learn: 0.6375722	total: 152ms	remaining: 30.2s
5:	learn: 0.6287083	total: 166ms	remaining: 27.4s
6:	learn: 0.6199155	total: 179ms	remaining: 25.4s
7:	learn: 0.6116425	total: 196ms	remaining: 24.3s
8:	learn: 0.6039731	total: 210ms	remaining: 23.2s
9:	learn: 0.5960455	total: 225ms	remaining: 22.3s
10:	learn: 0.5890541	total: 241ms	remaining: 21.7s
11:	learn: 0.5822282	total: 267ms	remaining: 22s
12:	learn: 0.5764804	total: 287ms	remaining: 21.8s
13:	learn: 0.5718632	total: 301ms	remaining: 21.2s
14:	learn: 0.5661080	total: 318ms	remaining: 20.9s
15:	learn: 0.5611759	total: 330ms	remaining: 20.3s
16:	learn: 0.5563750	total: 342ms	remaining: 19.8s
17:	learn: 0.5519771	total: 354ms	remaining: 19.3s
18:	learn: 0.5475474	total: 364ms	remaining: 18.8s
19:	learn: 

<catboost.core.CatBoostClassifier at 0x7fe6a8956f50>

In [17]:
preds = model.predict(test[object_cols+float_cols])

In [18]:
test['Transported'] = preds

In [19]:
sample = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')

In [30]:
submission = test[["Transported"]].replace({1: "True", 0: "False"})#, inplace=True).reset_index().to_csv('/kaggle/working/submission.csv', index=False)

In [25]:
submission.to_csv('/kaggle/working/submission.csv')

In [31]:
submission.head().values

array([['True'],
       ['False'],
       ['True'],
       ['True'],
       ['True']], dtype=object)

In [328]:
test[["Transported"]].replace({1: 'True', 0: 'False'})

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True


In [322]:
test.reset_index()[['PassengerId', 'Transported']].to_csv('/kaggle/working/submission.csv',index=False)

In [28]:
sample.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False


In [316]:
! ls /kaggle/input/spaceship-titanic/

sample_submission.csv  test.csv  train.csv
