In [35]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [36]:
def replace_nan_with_existing_val(df, column):
    nan_indices = df.index[df[column].isna()].tolist()
    num_nans = len(nan_indices)
    # check where values are not Nan values , that will make a replacement_list
    value_index = df.index[df[column].notna()].tolist()
    replacement_list = df.loc[value_index,column].values.tolist()
    random_values = np.random.choice(replacement_list, size=num_nans)
    df.loc[nan_indices, column] = random_values
    

In [37]:
def preprocess_dataframe(numerical_features, categorical_features, df):
    """
    This functions takes a dataframe and takes a list of numerical columns and categorical columns 
    and replaces NAN values and scales the values 
    
    """
    # Create transformers for numerical and categorical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine transformers using ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Fit and transform the data using the preprocessor
    X_preprocessed = preprocessor.fit_transform(df)
    # get output features name 
    feature_names = preprocessor.get_feature_names_out()
    
    return ( X_preprocessed ,feature_names)



In [38]:
def extract_target_column(df,target_col):
    """
      function extracts target column from dataframe .
      converts it to integer and returns post flattening.
    """
    tran_y= df.pop(target_col)
    # type would be pandas.core.series.Series
    type(tran_y)
    y = pd.DataFrame(tran_y)
    y[target_col] = y[target_col].astype(int)
    y = y.values
    y = y.ravel()
    return y

In [39]:
df = pd.read_csv("C:\\Users\\u725561\\machine-learning\\spaceship-titanic\\train.csv")

Check how many unique Homeplanets are present

In [40]:
df.HomePlanet.unique()

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [41]:
# check count of unique home planets
df.HomePlanet.nunique()

3

In [42]:
# Split the PassengerId into 2 columns 
df[['Group','NumInGroup']] = df["PassengerId"].str.split("_",expand=True)

In [43]:
# Split the Cabin column into 3 columns
df[['CabinDeck','CabinNum','CabinSide']] = df["Cabin"].str.split("/",expand=True)

In [44]:
# check the distribution of Cabin Deck . Seems F , G are more frequent
df.CabinDeck.value_counts()

CabinDeck
F    2794
G    2559
E     876
B     779
C     747
D     478
A     256
T       5
Name: count, dtype: int64

In [11]:
df.CabinNum.info()

<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: CabinNum
Non-Null Count  Dtype 
--------------  ----- 
8494 non-null   object
dtypes: object(1)
memory usage: 68.0+ KB


In [45]:
df.CabinNum.value_counts()

CabinNum
82      28
86      22
19      22
56      21
176     21
        ..
1644     1
1515     1
1639     1
1277     1
1894     1
Name: count, Length: 1817, dtype: int64

In [46]:
# find total number of NaN values in the column CabinNum
df.CabinNum.isna().sum()

199

In [47]:
# find out indices for NaN values for Cabin Num
cabinNum_nan = df.index[df.CabinNum.isna()].tolist()
len(cabinNum_nan)

199

In [48]:
# Observe CabinNum is object , which is string , need to convert to int
df.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
Group            object
NumInGroup       object
CabinDeck        object
CabinNum         object
CabinSide        object
dtype: object

In [49]:
#CabinNum is a object
df["CabinNum"].dtype

dtype('O')

In [50]:
replace_nan_with_existing_val(df,"CabinNum")

In [51]:
df["CabinNum"] = df.CabinNum.astype('int')

In [52]:
df.CabinNum.dtype

dtype('int32')

In [53]:
# check the distribution of CabinSide . It has only 2 unique values S and P . Probably should drop ?
df.CabinSide.value_counts()

CabinSide
S    4288
P    4206
Name: count, dtype: int64

In [54]:
# Check whether the target variable is distributed uniformly . would observe that almost evenly distributed
df.Transported.value_counts()

Transported
True     4378
False    4315
Name: count, dtype: int64

In [55]:
# print entire dataframe with new columns
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,NumInGroup,CabinDeck,CabinNum,CabinSide
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001,01,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002,01,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003,01,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003,02,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004,01,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276,01,A,98,P
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,9278,01,G,1499,S
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,9279,01,G,1500,S
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,9280,01,E,608,S


In [56]:
# drop the original column PassengerId and Cabin .
# also drop name , does not have significance
df.drop( ["Cabin", "name", "PassengerId"], axis = 1 ,inplace=True)


KeyError: "['name'] not found in axis"

In [57]:
# extract target column into a numpy array . This would go into fit as argument 
y = extract_target_column(df,"Transported")
y

array([0, 1, 0, ..., 1, 0, 1])

In [58]:
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Group,NumInGroup,CabinDeck,CabinNum,CabinSide
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0001,01,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,0002,01,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0003,01,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0003,02,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,0004,01,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,9276,01,A,98,P
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,9278,01,G,1499,S
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,9279,01,G,1500,S
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,9280,01,E,608,S


In [59]:
df.shape

(8693, 15)

In [62]:
df.Destination.unique()

array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', nan], dtype=object)

In [32]:
type(y)

numpy.ndarray

Check columns which has null . 
we will do *preprocessing* on the dataframe .

In [33]:
null_columns = df.columns[df.isna().any()].tolist()
print(null_columns)

['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'CabinDeck', 'CabinNum', 'CabinSide']


preprocessing data for catrgorical and numerical columns


In [34]:
# Separate features into numerical and categorical
numerical_features = df.select_dtypes(include=['float64']).columns
categorical_features = df.select_dtypes(include=['object']).columns
print(numerical_features)
print(categorical_features)

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')
Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Group', 'NumInGroup',
       'CabinDeck', 'CabinNum', 'CabinSide'],
      dtype='object')


In [35]:
# Create transformers for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit and transform the data using the preprocessor
X_preprocessed = preprocessor.fit_transform(df)
# size of X_preprocessed
X_preprocessed.shape
# get output feature names
feature_names = preprocessor.get_feature_names_out()
feature_names


array(['num__Age', 'num__RoomService', 'num__FoodCourt', ...,
       'cat__CabinNum_999', 'cat__CabinSide_P', 'cat__CabinSide_S'],
      dtype=object)

In [36]:
# check that preprocessing has increased the number of coluns
X_preprocessed.shape

(8693, 8068)

In [37]:
type(X_preprocessed)

scipy.sparse._csr.csr_matrix

In [38]:
# convert to numpy array from scipy array
type(X_preprocessed)
X_preprocessed = X_preprocessed.toarray()

In [39]:
X_preprocessed.shape

(8693, 8068)

In [40]:
type(X_preprocessed)

numpy.ndarray

In [41]:
X_preprocessed_df = pd.DataFrame(data=X_preprocessed, columns=feature_names)

# Display the preprocessed DataFrame
print(X_preprocessed_df)

      num__Age  num__RoomService  num__FoodCourt  num__ShoppingMall  num__Spa  \
0     0.709437         -0.340590       -0.287314          -0.290817 -0.276663   
1    -0.336717         -0.175364       -0.281669          -0.248968  0.211505   
2     2.034566         -0.275409        1.955616          -0.290817  5.694289   
3     0.290975         -0.340590        0.517406           0.330225  2.683471   
4    -0.894666          0.118709       -0.243409          -0.038048  0.225732   
...        ...               ...             ...                ...       ...   
8688  0.848924         -0.340590        3.989682          -0.290817  1.184286   
8689 -0.755179         -0.340590       -0.287314          -0.290817 -0.276663   
8690 -0.197230         -0.340590       -0.287314           2.842851 -0.275774   
8691  0.221232         -0.340590        0.370637          -0.290817  0.037223   
8692  1.058155         -0.149594        2.653082          -0.290817 -0.276663   

      num__VRDeck  cat__Hom

In [42]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

In [43]:
# Create a Logistic Regression model
model = LogisticRegression(random_state=42,solver='liblinear')

In [44]:
# Train the model on the training set
model.fit(X_train, y_train)

In [45]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [46]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [47]:
# Print the results
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.7866589994249569

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.77      0.78       861
           1       0.78      0.81      0.79       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



In [48]:
# perform evaluation on test data 
df_test = pd.read_csv("C:\\Users\\u725561\\machine-learning\\spaceship-titanic\\test.csv")

In [49]:
# As you would observe that it is test data and it does not have "Transported" column
df_test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [50]:
# observe tha test data also has lot of nulls
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [52]:
# Split the PassengerId into 2 columns 
df_test[['Group','NumInGroup']] = df_test["PassengerId"].str.split("_",expand=True)

In [53]:
# Split the Cabin column into 3 columns
df_test[['CabinDeck','CabinNum','CabinSide']] = df_test["Cabin"].str.split("/",expand=True)

In [55]:
# expect 5 new columns to get added , total = 18
df_test.shape

(4277, 18)

In [56]:
# drop the original column PassengerId and Cabin .
# also drop name , does not have significance
df_test.drop("Cabin", axis = 1 ,inplace=True)
df_test.drop("Name",axis=1,inplace=True)
df_test.drop("PassengerId",axis=1,inplace=True)

In [57]:
# columns dropped to 15 
df_test.shape

(4277, 15)

In [58]:
# Separate features into numerical and categorical
numerical_features = df_test.select_dtypes(include=['float64']).columns
categorical_features = df_test.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit and transform the data using the preprocessor
X_preprocessed_test = preprocessor.fit_transform(df_test)
# size of X_preprocessed
X_preprocessed_test.shape

(4277, 4602)

In [71]:
df_test.CabinNum.info()

<class 'pandas.core.series.Series'>
RangeIndex: 4277 entries, 0 to 4276
Series name: CabinNum
Non-Null Count  Dtype 
--------------  ----- 
4177 non-null   object
dtypes: object(1)
memory usage: 33.5+ KB


In [73]:
# Cabin as coming as object 
df_test.CabinNum

0          3
1          4
2          0
3          1
4          5
        ... 
4272    1496
4273     NaN
4274     296
4275     297
4276    1498
Name: CabinNum, Length: 4277, dtype: object

In [70]:
categorical_features

Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Group', 'NumInGroup',
       'CabinDeck', 'CabinNum', 'CabinSide'],
      dtype='object')

In [60]:
df_test

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Group,NumInGroup,CabinDeck,CabinNum,CabinSide
0,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,0013,01,G,3,S
1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,0018,01,F,4,S
2,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,0019,01,C,0,S
3,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,0021,01,C,1,S
4,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,0023,01,F,5,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,Earth,True,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,9266,02,G,1496,S
4273,Earth,False,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,9269,01,,,
4274,Mars,True,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,9271,01,D,296,P
4275,Europa,False,,,False,0.0,2680.0,0.0,0.0,523.0,9273,01,D,297,P


In [63]:
df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Group,NumInGroup,CabinDeck,CabinNum,CabinSide
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0001,01,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,0002,01,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0003,01,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0003,02,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,0004,01,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,9276,01,A,98,P
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,9278,01,G,1499,S
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,9279,01,G,1500,S
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,9280,01,E,608,S


In [61]:
# this column may be different between test data and train data
df_test.Destination.unique()

array(['TRAPPIST-1e', '55 Cancri e', 'PSO J318.5-22', nan], dtype=object)

In [64]:
df.CabinDeck.unique()

array(['B', 'F', 'A', 'G', nan, 'E', 'D', 'C', 'T'], dtype=object)

In [66]:
df.CabinDeck.value_counts()

CabinDeck
F    2794
G    2559
E     876
B     779
C     747
D     478
A     256
T       5
Name: count, dtype: int64

In [67]:
df_test.CabinDeck.value_counts()

CabinDeck
F    1445
G    1222
E     447
B     362
C     355
D     242
A      98
T       6
Name: count, dtype: int64

In [68]:
df.CabinNum.value_counts()

CabinNum
82      28
86      22
19      22
56      21
176     21
        ..
1644     1
1515     1
1639     1
1277     1
1894     1
Name: count, Length: 1817, dtype: int64

In [69]:
df_test.CabinNum.value_counts()

CabinNum
4       21
31      18
197     16
294     16
228     14
        ..
1170     1
904      1
1174     1
356      1
1503     1
Name: count, Length: 1505, dtype: int64