In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV



In [2]:
 #Load the Titanic dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
gender_submission_df = pd.read_csv('gender_submission.csv')


In [3]:
combined_df = pd.concat([train_df.drop('Survived', axis=1), test_df], axis=0)


In [4]:
# Check for missing data
print(combined_df.isnull().sum())

PassengerId       0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64


In [5]:
# Separate numerical and categorical features
numeric_features = combined_df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = combined_df.select_dtypes(include=['object']).columns

# Separate features and target variable for the train set
X_train = train_df.drop("Survived", axis=1)
y_train = train_df["Survived"]

# Separate features for the test set
X_test = test_df.copy()

# Separate numerical features
X_train_numeric = X_train[numeric_features]
X_test_numeric = X_test[numeric_features]

# Separate categorical features
X_train_categorical = X_train[categorical_features]
X_test_categorical = X_test[categorical_features]

# Preprocess numerical features
scaler = StandardScaler()
X_train_numeric_preprocessed = scaler.fit_transform(X_train_numeric)
X_test_numeric_preprocessed = scaler.transform(X_test_numeric)

# Preprocess categorical features
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_categorical_preprocessed = encoder.fit_transform(X_train_categorical).toarray()
X_test_categorical_preprocessed = encoder.transform(X_test_categorical).toarray()

# Concatenate preprocessed numerical and categorical features
X_train_preprocessed = pd.concat([pd.DataFrame(X_train_numeric_preprocessed, columns=numeric_features),
                                  pd.DataFrame(X_train_categorical_preprocessed, columns=encoder.get_feature_names_out(categorical_features))],
                                  axis=1)

X_test_preprocessed = pd.concat([pd.DataFrame(X_test_numeric_preprocessed, columns=numeric_features),
                                 pd.DataFrame(X_test_categorical_preprocessed, columns=encoder.get_feature_names_out(categorical_features))],
                                 axis=1)

In [6]:
# Impute missing values for numerical features
numeric_imputer = SimpleImputer(strategy='mean')
X_train_numeric_imputed = pd.DataFrame(numeric_imputer.fit_transform(X_train_numeric), columns=numeric_features)
X_test_numeric_imputed = pd.DataFrame(numeric_imputer.transform(X_test_numeric), columns=numeric_features)

# Impute missing values for categorical features
categorical_imputer = SimpleImputer(strategy='most_frequent')
X_train_categorical_imputed = pd.DataFrame(categorical_imputer.fit_transform(X_train_categorical), columns=categorical_features)
X_test_categorical_imputed = pd.DataFrame(categorical_imputer.transform(X_test_categorical), columns=categorical_features)

# Preprocess numerical features
scaler = StandardScaler()
X_train_numeric_preprocessed = scaler.fit_transform(X_train_numeric_imputed)
X_test_numeric_preprocessed = scaler.transform(X_test_numeric_imputed)

# Preprocess categorical features
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_categorical_preprocessed = encoder.fit_transform(X_train_categorical_imputed).toarray()
X_test_categorical_preprocessed = encoder.transform(X_test_categorical_imputed).toarray()

# Concatenate preprocessed numerical and categorical features
X_train_preprocessed = pd.concat([pd.DataFrame(X_train_numeric_preprocessed, columns=numeric_features),
                                  pd.DataFrame(X_train_categorical_preprocessed, columns=encoder.get_feature_names_out(categorical_features))],
                                  axis=1)

X_test_preprocessed = pd.concat([pd.DataFrame(X_test_numeric_preprocessed, columns=numeric_features),
                                 pd.DataFrame(X_test_categorical_preprocessed, columns=encoder.get_feature_names_out(categorical_features))],
                                 axis=1)



In [7]:
X_train.head

<bound method NDFrame.head of      PassengerId  Pclass                                               Name  \
0              1       3                            Braund, Mr. Owen Harris   
1              2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2              3       3                             Heikkinen, Miss. Laina   
3              4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4              5       3                           Allen, Mr. William Henry   
..           ...     ...                                                ...   
886          887       2                              Montvila, Rev. Juozas   
887          888       1                       Graham, Miss. Margaret Edith   
888          889       3           Johnston, Miss. Catherine Helen "Carrie"   
889          890       1                              Behr, Mr. Karl Howell   
890          891       3                                Dooley, Mr. Patrick   

        Sex   Age  Si

In [8]:
logistic_regression_model = LogisticRegression(random_state=42)

logistic_regression_model.fit(X_train_preprocessed, y_train)


In [9]:
#Model evaluation

y_test_true = gender_submission_df['Survived']
y_test_pred = logistic_regression_model.predict(X_test_preprocessed)


In [10]:
#evaluate
print("Accuracy:", accuracy_score(y_test_true, y_test_pred))
print("Precision:", precision_score(y_test_true, y_test_pred))
print("Recall:", recall_score(y_test_true, y_test_pred))
print("F1 Score:", f1_score(y_test_true, y_test_pred))

Accuracy: 0.9354066985645934
Precision: 0.8881987577639752
Recall: 0.9407894736842105
F1 Score: 0.9137380191693291


In [12]:
#(you can choose any row)
passenger_to_predict = X_test_preprocessed.iloc[[12]]  


survival_probability = logistic_regression_model.predict_proba(passenger_to_predict)[:, 1][0]


print(f"The predicted probability of survival for the selected passenger is: {survival_probability:.4f}")


survival_prediction = logistic_regression_model.predict(passenger_to_predict)[0]


print(f"The  prediction for survival is: {survival_prediction}")


The predicted probability of survival for the selected passenger is: 0.9547
The  prediction for survival is: 1


In [None]:
#Documantation
# I started with importing the libaries then exploraing the data which was challenging but i enjoyed it the cleaning proccess is what took the most time but i got the hung of it in the end
#modeling i choose the logistic regression because it was easy for an binary classification program
#after evaluation i did a prediction program to check if the program is doing good after i got an accuracy of 93% 
#thank you , it really was fun and helped me to kearn new things 
