## Imports

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

## Constants with respect to the file

In [2]:
BASE_DIR = Path.cwd().parent

TRAIN_DATA_FILE_PATH = BASE_DIR / 'datasets/train.csv'


# EDA

In [3]:
train_df=pd.read_csv(TRAIN_DATA_FILE_PATH)

In [4]:
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


- Converting Categorical feature columns into dummy features

In [5]:
CATEGORICAL_COLS=[
    'HomePlanet',
    'Destination'
]

# Replace missing values in categorical columns with 'Unknown'
train_df[CATEGORICAL_COLS] = train_df[CATEGORICAL_COLS].fillna('Unknown')

# Apply one-hot encoding
train_df = pd.get_dummies(train_df, columns=CATEGORICAL_COLS)

train_df.columns

Index(['PassengerId', 'CryoSleep', 'Cabin', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'HomePlanet_Unknown', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
       'Destination_Unknown'],
      dtype='object')

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PassengerId                8693 non-null   object 
 1   CryoSleep                  8476 non-null   object 
 2   Cabin                      8494 non-null   object 
 3   Age                        8514 non-null   float64
 4   VIP                        8490 non-null   object 
 5   RoomService                8512 non-null   float64
 6   FoodCourt                  8510 non-null   float64
 7   ShoppingMall               8485 non-null   float64
 8   Spa                        8510 non-null   float64
 9   VRDeck                     8505 non-null   float64
 10  Name                       8493 non-null   object 
 11  Transported                8693 non-null   bool   
 12  HomePlanet_Earth           8693 non-null   bool   
 13  HomePlanet_Europa          8693 non-null   bool 

In [7]:
train_df.head()

Unnamed: 0,PassengerId,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_Unknown,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_Unknown
0,0001_01,False,B/0/P,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,False,True,False,False,False,False,True,False
1,0002_01,False,F/0/S,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,True,False,False,False,False,False,True,False
2,0003_01,False,A/0/S,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,False,True,False,False,False,False,True,False
3,0003_02,False,A/0/S,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,False,True,False,False,False,False,True,False
4,0004_01,False,F/1/S,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,True,False,False,False,False,False,True,False


In [8]:
BOOLEAN_FEATURES=[
    # 'CryoSleep',
    # 'VIP',
    'HomePlanet_Earth',
    'HomePlanet_Europa',
    'HomePlanet_Mars',
    'HomePlanet_Unknown',

    'Destination_55 Cancri e',
    'Destination_PSO J318.5-22',
    'Destination_TRAPPIST-1e',
    'Destination_Unknown',

    'Transported',
]

train_df[BOOLEAN_FEATURES]=train_df[BOOLEAN_FEATURES].astype(int)
train_df.isnull().sum()

PassengerId                    0
CryoSleep                    217
Cabin                        199
Age                          179
VIP                          203
RoomService                  181
FoodCourt                    183
ShoppingMall                 208
Spa                          183
VRDeck                       188
Name                         200
Transported                    0
HomePlanet_Earth               0
HomePlanet_Europa              0
HomePlanet_Mars                0
HomePlanet_Unknown             0
Destination_55 Cancri e        0
Destination_PSO J318.5-22      0
Destination_TRAPPIST-1e        0
Destination_Unknown            0
dtype: int64

In [9]:
COLS_TO_DROP=[
    'PassengerId',
    'Name',
    'Cabin'
]

train_df.drop(columns=COLS_TO_DROP, axis=1, inplace=True)
train_df.columns

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'Transported', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'HomePlanet_Unknown', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
       'Destination_Unknown'],
      dtype='object')

In [10]:
X=train_df.drop('Transported', axis=1)
y=train_df['Transported']

In [11]:
logreg_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),  
    LogisticRegression() 
)


In [12]:
logreg_pipeline.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Preparing Testing Data

In [13]:
TEST_DATA_FILE_PATH = BASE_DIR / 'datasets/test.csv'

In [14]:
test_df=pd.read_csv(TEST_DATA_FILE_PATH)

In [15]:
test_df.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name'],
      dtype='object')

In [16]:
# Replace missing values in categorical columns with 'Unknown'
test_df[CATEGORICAL_COLS] = test_df[CATEGORICAL_COLS].fillna('Unknown')

# Apply one-hot encoding
test_df = pd.get_dummies(test_df, columns=CATEGORICAL_COLS)

TEST_BOOLEAN_FEATURES=[
    'HomePlanet_Earth',
    'HomePlanet_Europa',
    'HomePlanet_Mars',
    'HomePlanet_Unknown',

    'Destination_55 Cancri e',
    'Destination_PSO J318.5-22',
    'Destination_TRAPPIST-1e',
    'Destination_Unknown',
]

test_df[TEST_BOOLEAN_FEATURES]=test_df[TEST_BOOLEAN_FEATURES].astype(int)

Passenger_Ids=test_df['PassengerId']

test_df.drop(columns=COLS_TO_DROP, axis=1, inplace=True)

In [17]:
test_df.columns

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'HomePlanet_Unknown', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
       'Destination_Unknown'],
      dtype='object')

In [18]:
predictions=logreg_pipeline.predict(test_df)

In [19]:
Transported=pd.Series(predictions, name='Transported')
Transported.count()

np.int64(4277)

In [20]:
Passenger_Ids.count()

np.int64(4277)

In [21]:
submission = pd.concat([Passenger_Ids, Transported], axis=1)

In [22]:
submission['Transported']=submission['Transported'].astype(bool)
submission.head()

In [23]:
OUTPUT_FILE_PATH = BASE_DIR / 'output/submission1.csv'
submission.to_csv(OUTPUT_FILE_PATH, index=False)