In [22]:
# importing step
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler



In [23]:
# Read the labelled training set from disk into a DataFrame.
train = pd.read_csv(filepath_or_buffer='train.csv')


In [24]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [25]:
# Count the missing values in each column of the training frame.
train.isnull().sum(axis=0)

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [26]:
## Relevant features and target
# NOTE(review): judging by the head() preview, Name and Cabin look close to
# unique per passenger; one-hot encoding them may add many sparse columns —
# confirm they are worth keeping as-is.
features = [
    "HomePlanet",
    "CryoSleep",
    "Cabin",
    "Destination",
    "Age",
    "VIP",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
    "Name",
]
# Kept as a one-element list, so selecting with it yields a single-column frame.
target = ["Transported"]

In [27]:
# Separate the feature matrix from the label column(s).
# `target` is a list, so `y` is a single-column DataFrame at this point.
X = train.loc[:, features]
y = train.loc[:, target]

In [28]:
# Collapse the single-column label frame into a 1-D Series
# (re-running this cell on an existing Series leaves it unchanged).
y = y.squeeze(axis=None)

In [29]:
# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Numeric columns: select every numeric dtype rather than only int64/float64.
# A ColumnTransformer built without `remainder=` drops any column that is not
# listed for a transformer, so an int32/float32/nullable-int column would
# otherwise vanish from the model without any warning.
numeric_features = X.select_dtypes(include='number').columns

# Preprocessing applied to the numeric columns, in order.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # fill missing values with the column median
    ('scaler', StandardScaler()),                   # standardise each column
])


In [31]:
# Categorical columns: everything that is not numeric. Selecting only 'object'
# would miss bool, category and dedicated string dtypes (e.g. a True/False
# column that happens to contain no missing values is read as bool), and a
# ColumnTransformer built without `remainder=` silently drops unlisted columns.
categorical_features = X.select_dtypes(exclude='number').columns

# Preprocessing applied to the categorical columns, in order.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # fill missing values with the mode
    ('onehot', OneHotEncoder(handle_unknown='ignore')),    # unseen categories encode as all zeros
])

In [32]:
# Route each column group through its own preprocessing sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [33]:
# Choose the estimator used as the final pipeline step: a logistic regression
# with a fixed seed and a raised iteration cap.
# Alternative kept for experimentation: classifier = XGBClassifier()
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(
    max_iter=1000,
    random_state=0,
)

In [34]:
# Chain preprocessing and the estimator so they are fitted and applied together.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

In [35]:

# Train the preprocessing steps and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [36]:
# Count the missing values per feature in the training split.
X_train.isnull().sum(axis=0)

HomePlanet      168
CryoSleep       177
Cabin           158
Destination     139
Age             148
VIP             162
RoomService     126
FoodCourt       140
ShoppingMall    165
Spa             134
VRDeck          151
Name            159
dtype: int64

In [37]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

# Predict labels for the held-out split with the fitted pipeline.
y_pred = pipeline.predict(X_test)

# Score the predictions with each metric in turn, then build the confusion matrix.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_mat = confusion_matrix(y_test, y_pred)

# Report the scalar metrics first, the confusion matrix last.
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix: \n{conf_mat}")

Accuracy: 0.7734330074755607
Precision: 0.765934065934066
Recall: 0.7938496583143508
F1 Score: 0.7796420581655481
Confusion Matrix: 
[[648 213]
 [181 697]]


In [38]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the whole pipeline on the training split.
accuracies = cross_val_score(pipeline, X_train, y_train, cv=10)

# Summarise the per-fold scores as percentages.
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))

Accuracy: 79.03 %
Standard Deviation: 1.41 %


In [39]:
# Read the unlabelled competition data.
test_data = pd.read_csv('test.csv')

# Predict with the already-fitted pipeline, using the same feature columns.
# NOTE(review): the pipeline was fitted on X_train only; confirm whether it
# should be refitted on all labelled rows before producing the submission.
test_predictions = pipeline.predict(test_data.loc[:, features])

# Pair each passenger ID with its predicted Transported value.
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': test_predictions,
})

# Write the submission file without the DataFrame index.
output.to_csv('my_submission3.csv', index=False)
