# Set up

## Packages

In [807]:
import pandas as pd
import numpy as np

## Reading Data

In [808]:
train_csv = pd.read_csv("train.csv", index_col="PassengerId")
test_csv = pd.read_csv("test.csv", index_col="PassengerId")

n = train_csv.shape[0]

training_data = train_csv[train_csv.columns.delete([-1])]
Y_train = train_csv["Transported"].astype("int")

test_data = test_csv

training_data

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines
...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther
9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley
9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon
9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre


In [809]:
test_data

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [810]:
Y_train

PassengerId
0001_01    0
0002_01    1
0003_01    0
0003_02    0
0004_01    1
          ..
9276_01    0
9278_01    0
9279_01    1
9280_01    0
9280_02    1
Name: Transported, Length: 8693, dtype: int32

## Formatting Training Data

In [811]:
parameters = []
test_parameters = []

### HomePlanet
As shown, HomePlanet has...

In [812]:
f"{\
    training_data["HomePlanet"].nunique()\
} unique planets"

'3 unique planets'

With such a small number, we can one-hot this column

In [813]:
def format_planet(data):
    return pd.get_dummies(data["HomePlanet"])

p_training = format_planet(training_data)
parameters.append(p_training)

p_test = format_planet(test_data)
test_parameters.append(p_test)



### Cabin
We'll take the letter of the cabin, one-hot it, and the last letter and one-hot it, with a specific category in each for those that are NA.
We'll ignore the numerical part of the cabin, since it doesn't seem to have any implicit meaning.

In [814]:
cabin_data = training_data['Cabin'].astype(str)
t_cabin_data = test_data['Cabin'].astype(str)

def format_cabin(data):
    splitted = data.str.split("/", expand=True)
    cabin_letter = pd.get_dummies(splitted[0])
    cabin_type = pd.get_dummies(splitted[2])

    return (cabin_letter, cabin_type)

cabin_letter, cabin_type = format_cabin(cabin_data)
t_cabin_letter, t_cabin_type = format_cabin(t_cabin_data)

print(cabin_letter)
print(cabin_type)

parameters.append(cabin_letter)
parameters.append(cabin_type)

test_parameters.append(t_cabin_letter)
test_parameters.append(t_cabin_type)

                 A      B      C      D      E      F      G      T    nan
PassengerId                                                               
0001_01      False   True  False  False  False  False  False  False  False
0002_01      False  False  False  False  False   True  False  False  False
0003_01       True  False  False  False  False  False  False  False  False
0003_02       True  False  False  False  False  False  False  False  False
0004_01      False  False  False  False  False   True  False  False  False
...            ...    ...    ...    ...    ...    ...    ...    ...    ...
9276_01       True  False  False  False  False  False  False  False  False
9278_01      False  False  False  False  False  False   True  False  False
9279_01      False  False  False  False  False  False   True  False  False
9280_01      False  False  False  False   True  False  False  False  False
9280_02      False  False  False  False   True  False  False  False  False

[8693 rows x 9 columns]


### Destination
We'll one-hot this as well (since there are only 3, non-numerically related values)

In [815]:
destination_data = training_data['Destination'].astype(str)
t_destination_data = test_data['Destination'].astype(str)

def format_dest(data):
    return pd.get_dummies(data)

dest = format_dest(destination_data)
parameters.append(dest)

t_dest = format_dest(t_destination_data)
test_parameters.append(t_dest)

dest

Unnamed: 0_level_0,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,nan
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0001_01,False,False,True,False
0002_01,False,False,True,False
0003_01,False,False,True,False
0003_02,False,False,True,False
0004_01,False,False,True,False
...,...,...,...,...
9276_01,True,False,False,False
9278_01,False,True,False,False
9279_01,False,False,True,False
9280_01,True,False,False,False


### Age
We'll standardize this

In [816]:
age_data = training_data["Age"]
t_age_data = test_data["Age"]

def format_age(data):
    return (data - data.mean()) / data.std()

parameters.append(
    format_age(age_data)
)

test_parameters.append(
    format_age(t_age_data)
)

age_data

PassengerId
0001_01    39.0
0002_01    24.0
0003_01    58.0
0003_02    33.0
0004_01    16.0
           ... 
9276_01    41.0
9278_01    18.0
9279_01    26.0
9280_01    32.0
9280_02    44.0
Name: Age, Length: 8693, dtype: float64

### Room Service & Shopping Mall & Food Court
Standardize each

In [817]:
rs_data = training_data['RoomService'].astype(float)
t_rs_data = test_data['RoomService'].astype(float)

m_data = training_data['ShoppingMall'].astype(float)
t_m_data = test_data['ShoppingMall'].astype(float)

fc_data = training_data['FoodCourt'].astype(float)
t_fc_data = test_data['FoodCourt'].astype(float)

def format_bill(data):
    return (data - data.mean())/ data.std()

parameters.append(format_bill(m_data))
parameters.append(format_bill(rs_data))
parameters.append(format_bill(fc_data))

test_parameters.append(format_bill(t_rs_data))
test_parameters.append(format_bill(t_m_data))
test_parameters.append(format_bill(t_fc_data))

m_data

PassengerId
0001_01       0.0
0002_01      25.0
0003_01       0.0
0003_02     371.0
0004_01     151.0
            ...  
9276_01       0.0
9278_01       0.0
9279_01    1872.0
9280_01       0.0
9280_02       0.0
Name: ShoppingMall, Length: 8693, dtype: float64

## Further Formatting and Predicting NA Data

### Making DF of non-na parameters

In [818]:
non_na = pd.DataFrame(index=training_data.index.union(test_data.index))
def add_parameter(p, df): 
    return p.merge(df, left_index=True, right_index=True)

# Added testing data since that also contains the appropriate data
for parameter, t_parameter in zip(parameters, test_parameters):
    combined = pd.concat([parameter, t_parameter])

    if type(combined) is pd.Series:
        combined.name = parameter.name
    non_na = add_parameter(non_na, combined)

non_na.shape

(12970, 22)

### CryoSleep
Some are NA...

In [819]:
cryo_sleep = training_data["CryoSleep"]
print(f"% NA: {cryo_sleep.isna().sum() / n * 100}")
print(f"% Awake {(cryo_sleep == False).sum() / n * 100}")
print(f"% Sleeping {(cryo_sleep == True).sum() / n * 100}")

% NA: 2.4962613597147127
% Awake 62.56758311284942
% Sleeping 34.93615552743587


In [820]:
# Add testing data's cryo category
cryo_sleep_combined = pd.concat([cryo_sleep, test_data["CryoSleep"]])
cryo_sleep_combined

PassengerId
0001_01    False
0002_01    False
0003_01    False
0003_02    False
0004_01    False
           ...  
9266_02     True
9269_01    False
9271_01     True
9273_01    False
9277_01     True
Name: CryoSleep, Length: 12970, dtype: object

...however the NA population is much smaller than the avaliable population. We'll try predicting it instead using a random forest.

In [821]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

cryo_y_train = cryo_sleep_combined.loc[~cryo_sleep_combined.isna()].astype(int)\
    .sort_index()
cryo_X_train = non_na.loc[~cryo_sleep_combined.isna()].astype(float)\
    .sort_index()

model = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(model, cryo_X_train, cryo_y_train)

scores

array([0.92496051, 0.93167457, 0.93009479, 0.92812006, 0.92772512])

The accuracy of the tree is... about acceptable, so we'll train it on all of the data, and then predict [vip] for the NA values

In [822]:
cryo_model = model.fit(cryo_X_train, cryo_y_train)

cryo_data = training_data['CryoSleep']
t_cryo_data = test_data['CryoSleep']

pd.options.mode.chained_assignment = None

for i in cryo_data.index:
    if pd.isna(cryo_data[i]):
        cryo_data[i] = \
            cryo_model.predict(non_na.loc[[i], :])

for i in t_cryo_data.index:
    if pd.isna(t_cryo_data[i]):
        t_cryo_data[i] = \
            cryo_model.predict(non_na.loc[[i], :])
        
pd.options.mode.chained_assignment = 'warn'

parameters.append(cryo_data)
test_parameters.append(t_cryo_data)

### VIPs

In [823]:
vip = training_data["VIP"]

print(f"% NA: {vip.isna().sum() / n * 100}")
print(f"% VIP {(vip == False).sum() / n * 100}")
print(f"% Non-VIP {(vip == True).sum() / n * 100}")

% NA: 2.3352122397331185
% VIP 95.37558955481423
% Non-VIP 2.289198205452663


In [824]:
# Add testing data's cryo category
vip_combined = pd.concat([vip, test_data["VIP"]])
vip_combined

PassengerId
0001_01    False
0002_01    False
0003_01     True
0003_02    False
0004_01    False
           ...  
9266_02    False
9269_01    False
9271_01    False
9273_01    False
9277_01    False
Name: VIP, Length: 12970, dtype: object

...however the NA population is much smaller than the avaliable population. We'll try predicting it instead using a random forest.

In [825]:
vip_y_train = vip_combined.loc[~vip_combined.isna()].astype(int)
vip_X_train = non_na.loc[~vip_combined.isna()].astype(float)

model = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(model, vip_X_train, vip_y_train)
scores

array([0.97790927, 0.97475345, 0.97672584, 0.97554241, 0.97395422])

The accuracy of the tree is also good, so we'll train it on all of the data, and then predict [VIP] for the NA values

In [826]:
vip_model = model.fit(vip_X_train, vip_y_train)

vip_data = training_data['VIP']
t_vip_data = test_data['VIP']

pd.options.mode.chained_assignment = None

for i in vip_data.index:
    if pd.isna(vip_data[i]):
        vip_data[i] = \
            vip_model.predict(non_na.loc[[i], :])

for i in t_vip_data.index:
    if pd.isna(t_vip_data[i]):
        t_vip_data[i] = \
            vip_model.predict(non_na.loc[[i], :])

pd.options.mode.chained_assignment = 'warn'

parameters.append(vip_data)
test_parameters.append(t_vip_data)

# Training and Making Predictions

## Adding Parameters into a DF

In [827]:
training_final = pd.DataFrame(index=training_data.index)
testing_final = pd.DataFrame(index=test_data.index)

for parameter in parameters:
    print(parameter)
    training_final = add_parameter(training_final, parameter)

for testing_parameter in test_parameters:
    testing_final = add_parameter(testing_final, testing_parameter)

             Earth  Europa   Mars
PassengerId                      
0001_01      False    True  False
0002_01       True   False  False
0003_01      False    True  False
0003_02      False    True  False
0004_01       True   False  False
...            ...     ...    ...
9276_01      False    True  False
9278_01       True   False  False
9279_01       True   False  False
9280_01      False    True  False
9280_02      False    True  False

[8693 rows x 3 columns]
                 A      B      C      D      E      F      G      T    nan
PassengerId                                                               
0001_01      False   True  False  False  False  False  False  False  False
0002_01      False  False  False  False  False   True  False  False  False
0003_01       True  False  False  False  False  False  False  False  False
0003_02       True  False  False  False  False  False  False  False  False
0004_01      False  False  False  False  False   True  False  False  False
...     

## Training our model

These are the results we get from the classifier on the training data...

In [828]:
final_model = RandomForestClassifier()
scores = cross_val_score(final_model, training_final, Y_train)
scores

array([0.74238068, 0.7492812 , 0.74985624, 0.75604143, 0.7589183 ])

These are acceptable enough to me!
## Making predictions

In [830]:
testing_final = testing_final[training_final.columns]
final_model = final_model.fit(training_final, Y_train)
predictions = final_model.predict(testing_final)

In [831]:
predictions

array([1, 0, 1, ..., 1, 0, 0])

## Outputting to csv

In [834]:
formatted = pd.DataFrame(predictions, index=testing_final.index)

formatted

Unnamed: 0_level_0,0
PassengerId,Unnamed: 1_level_1
0013_01,1
0018_01,0
0019_01,1
0021_01,1
0023_01,1
...,...
9266_02,1
9269_01,0
9271_01,1
9273_01,0


In [841]:
formatted.to_csv("submission.csv", columns=["Transported"])

KeyError: "None of [Index(['PassengerId', 'Transported'], dtype='object')] are in the [columns]"