In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("C:\\Users\\u725561\\machine-learning\\spaceship-titanic\\train.csv")

In [3]:
# Split the PassengerId into 2 columns 
df[['Group','NumInGroup']] = df["PassengerId"].str.split("_",expand=True)

In [4]:
# Split the Cabin column into 3 columns
df[['CabinDeck','CabinNum','CabinSide']] = df["Cabin"].str.split("/",expand=True)

In [5]:
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,NumInGroup,CabinDeck,CabinNum,CabinSide
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001,01,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002,01,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003,01,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003,02,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004,01,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276,01,A,98,P
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,9278,01,G,1499,S
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,9279,01,G,1500,S
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,9280,01,E,608,S


In [6]:
# drop the original column PassengerId and Cabin .
# also drop name , does not have significance
df.drop("Cabin", axis = 1 ,inplace=True)
df.drop("Name",axis=1,inplace=True)
df.drop("PassengerId",axis=1,inplace=True)

# seperate out Transported column
tran_y= df.pop("Transported")

In [7]:
df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Group,NumInGroup,CabinDeck,CabinNum,CabinSide
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0001,01,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,0002,01,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0003,01,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0003,02,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,0004,01,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,9276,01,A,98,P
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,9278,01,G,1499,S
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,9279,01,G,1500,S
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,9280,01,E,608,S


In [8]:
type(tran_y)

pandas.core.series.Series

In [9]:
y = pd.DataFrame(tran_y)

In [10]:
type(y)

pandas.core.frame.DataFrame

In [11]:
y.columns

Index(['Transported'], dtype='object')

In [12]:
y['Transported'] = y['Transported'].astype(int)

In [14]:
y = y.values
y = y.ravel()

In [15]:
y

array([0, 1, 0, ..., 1, 0, 1])

In [16]:
type(y)

numpy.ndarray

Check columns which has null . 
we will do *preprocessing* on the dataframe .

In [17]:
null_columns = df.columns[df.isna().any()].tolist()
print(null_columns)

['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'CabinDeck', 'CabinNum', 'CabinSide']


preprocessing data for catrgorical and numerical columns


In [18]:
# Separate features into numerical and categorical
numerical_features = df.select_dtypes(include=['float64']).columns
categorical_features = df.select_dtypes(include=['object']).columns
print(numerical_features)
print(categorical_features)

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')
Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Group', 'NumInGroup',
       'CabinDeck', 'CabinNum', 'CabinSide'],
      dtype='object')


In [19]:
# Create transformers for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit and transform the data using the preprocessor
X_preprocessed = preprocessor.fit_transform(df)
# size of X_preprocessed
X_preprocessed.shape
# get output feature names
feature_names = preprocessor.get_feature_names_out()
feature_names


array(['num__Age', 'num__RoomService', 'num__FoodCourt', ...,
       'cat__CabinNum_999', 'cat__CabinSide_P', 'cat__CabinSide_S'],
      dtype=object)

In [20]:
X_preprocessed.shape

(8693, 8068)

In [21]:
type(X_preprocessed)

scipy.sparse._csr.csr_matrix

In [22]:
type(X_preprocessed)
X_preprocessed = X_preprocessed.toarray()

In [23]:
X_preprocessed.shape

(8693, 8068)

In [24]:
type(X_preprocessed)

numpy.ndarray

In [25]:
X_preprocessed_df = pd.DataFrame(data=X_preprocessed, columns=feature_names)

# Display the preprocessed DataFrame
print(X_preprocessed_df)

      num__Age  num__RoomService  num__FoodCourt  num__ShoppingMall  num__Spa  \
0     0.709437         -0.340590       -0.287314          -0.290817 -0.276663   
1    -0.336717         -0.175364       -0.281669          -0.248968  0.211505   
2     2.034566         -0.275409        1.955616          -0.290817  5.694289   
3     0.290975         -0.340590        0.517406           0.330225  2.683471   
4    -0.894666          0.118709       -0.243409          -0.038048  0.225732   
...        ...               ...             ...                ...       ...   
8688  0.848924         -0.340590        3.989682          -0.290817  1.184286   
8689 -0.755179         -0.340590       -0.287314          -0.290817 -0.276663   
8690 -0.197230         -0.340590       -0.287314           2.842851 -0.275774   
8691  0.221232         -0.340590        0.370637          -0.290817  0.037223   
8692  1.058155         -0.149594        2.653082          -0.290817 -0.276663   

      num__VRDeck  cat__Hom

In [26]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

In [29]:
# Create a Logistic Regression model
model = LogisticRegression(random_state=42,solver='liblinear')

In [30]:
# Train the model on the training set
model.fit(X_train, y_train)

In [31]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [32]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [33]:
# Print the results
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.7866589994249569

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.77      0.78       861
           1       0.78      0.81      0.79       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739

