In [1]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

### Load and Preview CSV data

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [10]:
train_data.head(2)
train_data['Cabin'][0].split('/')

['B', '0', 'P']

In [6]:
deck_dict={'T':0, 'nan':1,'A':2, 'D':3, 'E':4, 'C':5, 'B':6, 'F':7, 'G':8}
def perpare_unclean_data(df):
    group_id = []
    member_group_id = []
    [group_id.append(passenger.split('_')[0]) for passenger in df['PassengerId']]
    [member_group_id.append(passenger.split('_')[1]) for passenger in df['PassengerId']]
    df['groupId'] = group_id
    df['member_group_id'] = member_group_id
    df['HomePlanet'] = df['HomePlanet'].replace('Europa', 'Earth')
    clean_cabin_data(df)
    return df

def clean_cabin_data(df):
    # New features - training set
    df['Cabin_deck'] = df['Cabin'].apply(lambda x: x.split('/')[0])
    df['Cabin_number'] = df['Cabin'].apply(lambda x: x.split('/')[1]).astype(int)
    df['Cabin_side'] = df['Cabin'].apply(lambda x: x.split('/')[2])

    df['Cabin_deck']=df['Cabin_deck'].apply(lambda x: deck_dict.get(x))
    return df

train_data = perpare_unclean_data(train_data)
test_data = perpare_unclean_data(test_data)

IndexError: list index out of range

In [479]:
train_data.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,groupId,member_group_id
0,0001_01,Earth,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2,1
2,0003_01,Earth,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,3,1
3,0003_02,Earth,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,3,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,4,1


In [480]:
# Show percent of missing value
train_data.isna().sum()/train_data.shape[0]

PassengerId        0.000000
HomePlanet         0.023122
CryoSleep          0.024963
Cabin              0.022892
Destination        0.020936
Age                0.020591
VIP                0.023352
RoomService        0.020821
FoodCourt          0.021051
ShoppingMall       0.023927
Spa                0.021051
VRDeck             0.021627
Name               0.023007
Transported        0.000000
groupId            0.000000
member_group_id    0.000000
dtype: float64

In [481]:
cols = train_data.columns
cols[:5]

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination'], dtype='object')

In [482]:
# Show unique value of dataframe (with of Na value - nuique())
for col in cols:
    print(f'{col} has {(train_data[col].nunique())} value')

PassengerId has 8693 value
HomePlanet has 2 value
CryoSleep has 2 value
Cabin has 6560 value
Destination has 3 value
Age has 80 value
VIP has 2 value
RoomService has 1273 value
FoodCourt has 1507 value
ShoppingMall has 1115 value
Spa has 1327 value
VRDeck has 1306 value
Name has 8473 value
Transported has 2 value
groupId has 6217 value
member_group_id has 8 value


In [483]:
# Fill Na value with previous data
train_data.fillna(method='ffill', inplace=True)
test_data.fillna(method='ffill', inplace=True)
train_data.isna().sum()

PassengerId        0
HomePlanet         0
CryoSleep          0
Cabin              0
Destination        0
Age                0
VIP                0
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
Name               0
Transported        0
groupId            0
member_group_id    0
dtype: int64

In [484]:
train_data.dtypes

PassengerId         object
HomePlanet          object
CryoSleep             bool
Cabin               object
Destination         object
Age                float64
VIP                   bool
RoomService        float64
FoodCourt          float64
ShoppingMall       float64
Spa                float64
VRDeck             float64
Name                object
Transported           bool
groupId             object
member_group_id     object
dtype: object

In [485]:
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,groupId,member_group_id
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,13,1
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,18,1
2,0019_01,Earth,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,19,1
3,0021_01,Earth,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,21,1
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,23,1


In [486]:
# le = LabelEncoder()
# Encode non-number columns
def feature_encode(df, cols, input_type=None):
    if input_type == 'str':
        for col in cols:
            df[col] = df[col].astype(str)
            df[col] = LabelEncoder().fit_transform(df[col])
            df[col] = df[col].astype(float)
        return df
    for col in cols:
        df[col] = df[col].astype(str)
        df[col] = df[col].astype(float)
    return df
train_data = feature_encode(train_data, ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported'], 'str')
test_data = feature_encode(test_data, ['HomePlanet', 'CryoSleep', 'Destination', 'VIP'], 'str')
train_data.dtypes

PassengerId         object
HomePlanet         float64
CryoSleep          float64
Cabin               object
Destination        float64
Age                float64
VIP                float64
RoomService        float64
FoodCourt          float64
ShoppingMall       float64
Spa                float64
VRDeck             float64
Name                object
Transported        float64
groupId             object
member_group_id     object
dtype: object

In [487]:
# Drop unnecessary data
train_data.drop(['Name', 'Cabin', 'PassengerId'] , axis = 1 ,inplace = True)
test_data.drop(['Name', 'Cabin', 'PassengerId'] , axis = 1 ,inplace = True)

In [488]:
train_data.head(2)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,groupId,member_group_id
0,0.0,0.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
1,0.0,0.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1.0,2,1


In [489]:
# test_data['member_group_id'] = test_data['member_group_id'].astype(float)
# df[col] = df[col].astype(str)
# df[col] = df[col].astype(float)

In [490]:
# Convert number to float 
train_data = feature_encode(train_data, ['groupId', 'member_group_id'])
test_data = feature_encode(test_data, ['groupId', 'member_group_id'])

In [491]:
# Prepare X and y data for model 
X = train_data.drop('Transported' , axis =1 )
y = train_data['Transported']

In [492]:
num_vocal = []
[[num_vocal.append(x) for x in set(train_data[col])] for col in train_data.columns]
[[num_vocal.append(x) for x in set(test_data[col])] for col in test_data.columns]
num_vocal = list(set(num_vocal))
len(num_vocal)

9468

In [493]:
def normalization_data(df, layer):
    for col in df.columns:
        df[col] = layer_normal(df[col])
    return df

X = normalization_data(X, layer_normal)
X.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,groupId,member_group_id
0,-1.434825,-1.434825,-1.434158,-1.421826,-1.434825,-1.434825,-1.434825,-1.434825,-1.434825,-1.434825,-1.434492,-1.434492
1,-1.434825,-1.434825,-1.434158,-1.426826,-1.434825,-1.398495,-1.431825,-1.426493,-1.251844,-1.42016,-1.434158,-1.434492
2,-1.434825,-1.434825,-1.434158,-1.415494,-1.434492,-1.420493,-0.242946,-1.434825,0.803281,-1.418493,-1.433825,-1.434492
3,-1.434825,-1.434825,-1.434158,-1.423826,-1.434825,-1.434825,-1.007202,-1.311171,-0.325271,-1.370498,-1.433825,-1.434158
4,-1.434825,-1.434825,-1.434158,-1.429492,-1.434825,-1.333835,-1.411494,-1.384497,-1.246511,-1.434158,-1.433492,-1.434492


In [494]:
# Split data
X_train , X_val , y_train , y_val = train_test_split(X, y, random_state = 12 , test_size =0.2)

In [495]:
X_train[:5]

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,groupId,member_group_id
7503,-1.434492,-1.434492,-1.434158,-1.425159,-1.434825,-1.434825,-1.434825,-1.434825,-1.434825,-1.434825,1.240236,-1.434492
7300,-1.434492,-1.434492,-1.434158,-1.426159,-1.434825,-1.434825,-1.434825,-1.41716,-1.434825,-1.434825,1.16791,-1.434492
1853,-1.434825,-1.434492,-1.434158,-1.426493,-1.434825,-1.434825,-1.434825,-1.434825,-1.434825,-1.434825,-0.776225,-1.432492
5962,-1.434825,-1.434492,-1.434825,-1.428492,-1.434825,-1.434825,-1.434825,-1.434825,-1.434825,-1.434825,0.671961,-1.433825
4805,-1.434825,-1.434492,-1.434158,-1.425826,-1.434825,-1.434825,-1.434825,-1.434825,-1.434825,-1.434825,0.274668,-1.434492


In [496]:
knn = KNeighborsClassifier(n_neighbors=8)

knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=8)

In [497]:
knn.predict(X_val)

array([0., 1., 0., ..., 1., 1., 0.])

In [467]:
knn.score(X_val, y_val)

0.7515813686026452

In [23]:
# Model with normal data
for neighbor in range(1,10):
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(X_train, y_train)
    print(knn.score(X_val, y_val))

0.6520989074180563
0.6359976998274871
0.6262219666474985
0.6216216216216216
0.6244968372627947
0.61644623346751
0.6060954571592869
0.5928694652098907
0.5853939045428407


In [57]:
# Model score with 
# Separate PassengerId into group and member id
for neighbor in range(1,12):
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(X_train, y_train)
    print(knn.score(X_val, y_val))

0.6900517538815412
0.675100632547441
0.7326049453709028
0.7182288671650374
0.7360552041403106
0.7314548591144335
0.7515813686026452
0.7521564117308798
0.7510063254744106
0.7464059804485337
0.7556066705002875


In [263]:
# Model score 
# with Separate PassengerId into group and member id
# and Change HomePlanet Eu to Earth

# The score are not change from preview experiment
for neighbor in range(1,12):
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(X_train, y_train)
    print(knn.score(X_val, y_val))

0.6883266244968372
0.675100632547441
0.7326049453709028
0.7182288671650374
0.7360552041403106
0.7314548591144335
0.7515813686026452
0.7515813686026452
0.7510063254744106
0.7469810235767682
0.7556066705002875


In [498]:
# Model score 
# with Separate PassengerId into group and member id
# and Change HomePlanet Eu to Earth
# and Convert numberical data with normalization

# The score are not change from preview experiment
for neighbor in range(1,10):
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(X_train, y_train)
    print(knn.score(X_val, y_val))

0.6894767107533065
0.675100632547441
0.7326049453709028
0.7182288671650374
0.7360552041403106
0.7314548591144335
0.7515813686026452
0.7515813686026452
0.7510063254744106


In [535]:
df_result

Unnamed: 0,PassengerId,Transported
0,0013_01,0.0
1,0018_01,0.0
2,0019_01,0.0
3,0021_01,1.0
4,0023_01,1.0
...,...,...
4272,9266_02,1.0
4273,9269_01,1.0
4274,9271_01,1.0
4275,9273_01,0.0


In [500]:
knn.predict(X_val)

array([0., 1., 0., ..., 1., 1., 0.])

In [528]:
result = pd.read_csv('test.csv')

In [531]:
df_result['PassengerId'] = result['PassengerId']

In [470]:
test_data = normalization_data(test_data, layer_normal)
test_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,groupId,member_group_id
0,-1.434825,-1.434492,-1.434158,-1.425826,-1.434825,-1.434825,-1.434825,-1.434825,-1.434825,-1.434825,-1.430492,-1.434492
1,-1.434825,-1.434825,-1.434158,-1.428492,-1.434825,-1.434825,-1.431825,-1.434825,-0.493921,-1.434825,-1.428826,-1.434492
2,-1.434825,-1.434492,-1.434825,-1.424493,-1.434825,-1.434825,-1.434825,-1.434825,-1.434825,-1.434825,-1.428492,-1.434492
3,-1.434825,-1.434825,-1.434158,-1.42216,-1.434825,-1.434825,0.782283,-1.434825,-1.374498,-1.239845,-1.427826,-1.434492
4,-1.434825,-1.434825,-1.434158,-1.428159,-1.434825,-1.431492,-1.434825,-1.22318,-1.434825,-1.434825,-1.427159,-1.434492


In [533]:
pre_test_data = knn.predict(test_data)

In [534]:
df_result['Transported'] = pre_test_data
# df_result['PassengerId'] = result['PassengerId']

In [509]:
pre_test_data

array([0., 0., 0., ..., 1., 0., 1.])

In [505]:
pre_test_data

array([0., 0., 0., ..., 1., 0., 1.])

In [525]:
df_result = pd.DataFrame(columns=['PassengerId', 'Transported'])

In [526]:
df_result

Unnamed: 0,PassengerId,Transported


In [536]:
df_result.to_csv('submission.csv', index=False)