In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.preprocessing import Binarizer, MultiLabelBinarizer, OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import f_classif

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier, Perceptron
from sklearn.pipeline import Pipeline

from scipy.stats import pearsonr

In [2]:
DF_train = pd.read_csv('train.csv')
print(DF_train.shape)
DF_train.head(5)

(8693, 14)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Booleanize 'Transported' to fix the two string entries, and then tally the results:

In [3]:
DF_train['Transported']=DF_train['Transported'].astype(bool)
DF_train.groupby(['Transported'])['Transported'].count()

Transported
False    4315
True     4378
Name: Transported, dtype: int64

SKIP TO BELOW IF KEEPING NULL-ENTRY ROWS IN TRAINING:

Find all rows with null entries, add tuple per such row: (null values for index, index), and sort descending.

In [4]:
# null-entry row removal cell 1/12

nulllist = []
for i in range(len(DF_train.index)):
    nulllist.append((DF_train.iloc[i].isnull().sum(),i))
nulllist.sort(reverse=True)

In [5]:
# null-entry row removal cell 2/12

for index in [0,100,1000,2000,2500]:
    print(index,': ',nulllist[index])

0 :  (3, 7682)
100 :  (2, 4544)
1000 :  (1, 5026)
2000 :  (1, 407)
2500 :  (0, 8151)


So of the 8700 entries, only about 2000-2500 of them have null values.  The remaining 6000 entries may be sufficient to train.  We'll have to add new categorical labels for the test set null values, but this should be fine.

Let's find out how many entries have null values:

In [6]:
# null-entry row removal cell 3/12

i=0
while nulllist[i][0]!=0:
    i+=1
print(i)

2087


Indeed we can see null values per row drops from 1 to 0 at that index:

In [7]:
# null-entry row removal cell 4/12

print(nulllist[2086])
print(nulllist[2087])

(1, 7)
(0, 8692)


So for the first 2087 indices of nulllist, we will remove row nulllist[index][1] from the data set.

In [8]:
# null-entry row removal cell 5/12

L=[]
for i in range(2087):
    L.append(nulllist[i][1])
DF_train=DF_train.drop(L)
DF_train.shape

(6606, 14)

Recheck 'Transported' distribution:

In [9]:
# null-entry row removal cell 6/12

DF_train.groupby(['Transported'])['Transported'].count()

Transported
False    3279
True     3327
Name: Transported, dtype: int64

Still about 50/50, so there should still be a good representation in the features.

Let's get our features ready, beginning with converting the 'Cabin' string to a tuple.

In [10]:
# null-entry row removal cell 7/12

DF_train['Cabin']=DF_train['Cabin'].str.replace('/','')
#DF_train['Cabin']=DF_train['Cabin'].str.split('/')
DF_train['Cabin']=DF_train['Cabin'].apply(lambda x: tuple(x))
#DF_train['Cabin']=DF_train['Cabin'].apply(lambda t: (t[0],int(t[1]),t[2]))
DF_train.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,"(B, 0, P)",TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,"(F, 0, S)",TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,"(A, 0, S)",TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,"(A, 0, S)",TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,"(F, 1, S)",TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Stringify categorical features.

In [11]:
# null-entry row removal cell 8/12

DF_train['CryoSleep']=DF_train['CryoSleep'].astype(str)
DF_train['VIP']=DF_train['VIP'].astype(str)

In [12]:
# null-entry row removal cell 9/12

ct = ColumnTransformer([
    ('labelEncode',OrdinalEncoder(),[1,2,4,6])
])

categorical_features = ct.fit_transform(DF_train)

mlb = MultiLabelBinarizer()
cabin_features=mlb.fit_transform(DF_train['Cabin'])

In [13]:
# null-entry row removal cell 10/12

categorical_features.shape, cabin_features.shape

((6606, 4), (6606, 20))

Grab 'Age' and Spending columns and force them into column vectors for concatenation.

In [14]:
# null-entry row removal cell 11/12

age=DF_train['Age'].to_numpy()[np.newaxis].transpose()
room=DF_train['RoomService'].to_numpy()[np.newaxis].transpose()
food=DF_train['FoodCourt'].to_numpy()[np.newaxis].transpose()
mall=DF_train['ShoppingMall'].to_numpy()[np.newaxis].transpose()
spa=DF_train['Spa'].to_numpy()[np.newaxis].transpose()
vr=DF_train['VRDeck'].to_numpy()[np.newaxis].transpose()

Concatenate feature numpy arrays and convert 'Transported' from boolean to int.

In [15]:
# null-entry row removal cell 12/12

x_train = np.hstack((categorical_features[:,0:3],age,categorical_features[:,3:4],room,food,mall,spa,vr,cabin_features))
y_train = DF_train['Transported'].astype(int)

RUN NEXT TWO CELLS IF INCLUDING NULL-ENTRY ROWS IN TRAINING:

In [122]:
## keep null rows cell 1/2

DF_train['HomePlanet']=DF_train['HomePlanet'].astype(str)
DF_train['CryoSleep']=DF_train['CryoSleep'].astype(str)
DF_train['Cabin']=DF_train['Cabin'].astype(str)
#DF_train['Cabin']=DF_train['Cabin'].str.split('/')
DF_train['Cabin']=DF_train['Cabin'].str.replace('/','')
DF_train['Cabin']=DF_train['Cabin'].str.replace('nan','')
DF_train['Cabin']=DF_train['Cabin'].apply(lambda x: tuple(x))
DF_train['Destination']=DF_train['Destination'].astype(str)
DF_train['VIP']=DF_train['VIP'].astype(str)

In [244]:
## keep null rows cell 2/2

ct = ColumnTransformer([
    ('labelEncode',OrdinalEncoder(),[1,2,4,6]),
    ('inputer',IterativeImputer(random_state = 5),[5,7,8,9,10,11])
])

categorical_features = ct.fit_transform(DF_train)

mlb = MultiLabelBinarizer()
cabin_features=mlb.fit_transform(DF_train['Cabin'])

x_train = np.hstack((categorical_features[:,0:3], categorical_features[:,4:5], categorical_features[:,3:4], categorical_features[:,5:], cabin_features))
y_train = DF_train['Transported'].astype(int)

Feature-label correlations in case we want to eliminate features that don't correlate with label.

In [16]:
# feature-label correlations
for i in range(x_train.shape[1]):
    print(i,pearsonr(x_train[:,i],y_train))

0 PearsonRResult(statistic=0.11180600546137501, pvalue=7.928303786165701e-20)
1 PearsonRResult(statistic=0.4628034821535721, pvalue=0.0)
2 PearsonRResult(statistic=-0.1237729225637928, pvalue=5.7033127849698774e-24)
3 PearsonRResult(statistic=-0.08255287690970188, pvalue=1.8187751348149044e-11)
4 PearsonRResult(statistic=-0.04226002786876561, pvalue=0.0005910994994036362)
5 PearsonRResult(statistic=-0.2472907818370837, pvalue=1.2773942483633394e-92)
6 PearsonRResult(statistic=0.055024650480644574, pvalue=7.648322367727202e-06)
7 PearsonRResult(statistic=0.011601728883922264, pvalue=0.345777830540326)
8 PearsonRResult(statistic=-0.21985426392033486, pvalue=3.9970291241669983e-73)
9 PearsonRResult(statistic=-0.20794964666038246, pvalue=1.9036144515022045e-65)
10 PearsonRResult(statistic=0.014063066445159254, pvalue=0.25310157181015214)
11 PearsonRResult(statistic=-0.014561338711457475, pvalue=0.23667240294320854)
12 PearsonRResult(statistic=0.028765107339321257, pvalue=0.0193876801187017

In [342]:
# corresponding columns to remove in case where nulls are included and not included
removed_nullcase = [4,7,10,11,12,13,15,16,17,18,20,26,29]
removed_nonullcase = [7,10,11,13,15,16,17,18,20,26,29]
not_features = [] # replace with one of above if this cell is run--USED FOR TEST SET
x_train = np.delete(x_train,removed_nonullcase,axis=1)
x_train.shape

(6606, 19)

In [None]:
# feature-wise correlations
for i in range(x_train.shape[1]):
    for j in range(x_train.shape[1]):
        if i<j:
            z=pearsonr(x_train[:,i],x_train[:,j])
            if z[1]<.05:
                print('(%s,%s)'%(i,j))
                print('     ',pearsonr(x_train[:,i],x_train[:,j]))

In [17]:
LR = LogisticRegressionCV(random_state = 3, max_iter = 10000, verbose = 1, n_jobs = 5, cv=5, fit_intercept=False)

In [18]:
model = LR.fit(x_train,y_train)
model.score(x_train,y_train)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    1.3s remaining:    2.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    1.4s finished


0.796094459582198

Let's do some hyperparameter tuning to see if we can get this higher.

In [19]:
penalties = ['l2','l1','elasticnet']
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

for pen in penalties:
    for solve in solvers:
        if pen=='l1' and solve in ['newton-cg', 'lbfgs', 'sag']:
            continue
        if pen=='elasticnet' and solve!='saga':
            continue
        if pen=='elasticnet' and solve=='saga': #l1_ratios in model without warning message
            LR = LogisticRegressionCV(random_state = 3, max_iter = 10000, cv=5, n_jobs = 5, l1_ratios=[.5], solver=solve, penalty=pen, fit_intercept=False)
            model = LR.fit(x_train,y_train)
            print('Accuracy:  %s     Solver:  %s   Penalty:  %s' % (model.score(x_train,y_train),solve,pen))
            continue
        LR = LogisticRegressionCV(random_state = 3, max_iter = 10000, cv=5, n_jobs = 5, solver=solve, penalty=pen, fit_intercept=False)
        model = LR.fit(x_train,y_train)
        print('Accuracy:  %s     Solver:  %s   Penalty:  %s' % (model.score(x_train,y_train),solve,pen))

Accuracy:  0.7966999697244929     Solver:  newton-cg   Penalty:  l2
Accuracy:  0.796094459582198     Solver:  lbfgs   Penalty:  l2
Accuracy:  0.7968513472600666     Solver:  liblinear   Penalty:  l2
Accuracy:  0.7812594610959733     Solver:  sag   Penalty:  l2
Accuracy:  0.7812594610959733     Solver:  saga   Penalty:  l2
Accuracy:  0.7980623675446564     Solver:  liblinear   Penalty:  l1
Accuracy:  0.7815622161671208     Solver:  saga   Penalty:  l1
Accuracy:  0.7815622161671208     Solver:  saga   Penalty:  elasticnet


In [20]:
loss = ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
for los in loss:
    SGD = SGDClassifier(random_state=3, n_jobs=5, loss=los)
    model = SGD.fit(x_train,y_train)
    print('Accuracy:  %s     Loss:  %s' % (model.score(x_train,y_train),los))

Accuracy:  0.7848925219497427     Loss:  hinge
Accuracy:  0.6631849833484711     Loss:  log_loss
Accuracy:  0.5897668785952165     Loss:  modified_huber
Accuracy:  0.7544656372994247     Loss:  squared_hinge
Accuracy:  0.7833787465940054     Loss:  perceptron
Accuracy:  0.5392067817135937     Loss:  squared_error
Accuracy:  0.598698153194066     Loss:  huber
Accuracy:  0.29125037844383894     Loss:  epsilon_insensitive
Accuracy:  0.5888586133817741     Loss:  squared_epsilon_insensitive


In [21]:
penalties = ['l1','l2','elasticnet']
for pen in penalties:
    perceptron = Perceptron(n_jobs=5, random_state=3, penalty=pen)
    model = perceptron.fit(x_train,y_train)
    print('Accuracy:  %s       Penalty:  %s' % (model.score(x_train,y_train),pen))

Accuracy:  0.7839842567363003       Penalty:  l1
Accuracy:  0.7458371177717227       Penalty:  l2
Accuracy:  0.6780199818346957       Penalty:  elasticnet


Looks like we'll go with Logistic Regression with cross validation, liblinear solver and l1 penalty.

In [22]:
solve = 'liblinear'
pen = 'l1'
LR = LogisticRegressionCV(random_state = 3, max_iter = 10000, cv=5, n_jobs = 5, solver=solve, penalty=pen, fit_intercept=False)
model = LR.fit(x_train,y_train)
print('Accuracy:  %s     Solver:  %s   Penalty:  %s' % (model.score(x_train,y_train),solve,pen))

Accuracy:  0.7980623675446564     Solver:  liblinear   Penalty:  l1


Load in test set for analysis:

In [23]:
DF_test = pd.read_csv('test.csv')
DF_test.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [24]:
DF_test['HomePlanet']=DF_test['HomePlanet'].astype(str)
DF_test['CryoSleep']=DF_test['CryoSleep'].astype(str)
DF_test['Cabin']=DF_test['Cabin'].astype(str)
DF_test['Cabin']=DF_test['Cabin'].str.replace('/','')
DF_test['Cabin']=DF_test['Cabin'].str.replace('nan','')
DF_test['Cabin']=DF_test['Cabin'].apply(lambda x: tuple(x))
DF_test['Destination']=DF_test['Destination'].astype(str)
DF_test['VIP']=DF_test['VIP'].astype(str)


In [25]:
test_ct = ColumnTransformer([
    ('ordinalEncode',OrdinalEncoder(),[1,2,4,6]),
    ('inputer',IterativeImputer(random_state = 5),[5,7,8,9,10,11])
])

test_categorical_features = test_ct.fit_transform(DF_test)

mlb = MultiLabelBinarizer()
test_cabin_features=mlb.fit_transform(DF_test['Cabin'])

In [26]:
x_test = np.hstack((test_categorical_features[:,0:3], test_categorical_features[:,4:5], test_categorical_features[:,3:4], test_categorical_features[:,5:], test_cabin_features))

Feature removal for test set:

In [334]:
x_test = np.delete(x_test,not_features,axis=1)
x_test.shape

(4277, 30)

In [27]:
test_prediction = model.predict(x_test)

In [28]:
DF_submission = pd.concat([DF_test['PassengerId'],pd.DataFrame({'Transported':test_prediction})],axis=1)
DF_submission['Transported']=DF_submission['Transported'].astype(bool)

In [29]:
DF_submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [346]:
DF_submission.to_csv('submission.csv',index=False)

In [30]:
DF_submission.groupby(['Transported']).count()

Unnamed: 0_level_0,PassengerId
Transported,Unnamed: 1_level_1
False,1970
True,2307


In [31]:
# comparison to best entry

DF=pd.read_csv('submission_best.csv')

In [32]:
DF.groupby(['Transported']).count()

Unnamed: 0_level_0,PassengerId
Transported,Unnamed: 1_level_1
False,1874
True,2403
