In [29]:
import pandas as pd

In [30]:
# Change the working directory to the folder that holds the Titanic CSV files.
# The location can be overridden with the TITANIC_DATA_DIR environment variable so
# the notebook is not tied to one machine; the default is the original desktop folder.
import os
DATA_DIR = os.environ.get('TITANIC_DATA_DIR', '/Users/willhughes/Desktop/Kaggle  Competitions/titanic')
os.chdir(DATA_DIR)


In [31]:
# Read the Kaggle Titanic training and test CSV files from the working directory
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Keep a handle on the test-set passenger identifiers
test_ids = test_data.loc[:, "PassengerId"]

In [32]:
# Show the first five training rows as plain text
print(train_data.head(n=5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [33]:
def _title_from_name(name):
    """Return the title found in one passenger name, or None when it cannot be parsed."""
    # Missing names reach this function as NaN/None rather than as strings.
    if not isinstance(name, str):
        return None
    pieces = name.split(',')
    # Without a comma there is no "Surname, Title. Given names" structure to read.
    if len(pieces) < 2:
        return None
    return pieces[1].split('.')[0].strip()


def extract_title(df):
    """Add a 'Title' column derived from the 'Name' column.

    The title is the text after the first comma of the name, cut at the first
    period and stripped of surrounding whitespace (for example
    'Braund, Mr. Owen Harris' gives 'Mr').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Name' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, now carrying the 'Title' column. Rows whose
        name is missing or contains no comma get a missing title instead of
        raising an error.
    """
    df['Title'] = df['Name'].apply(_title_from_name)
    return df

# Add the 'Title' column to the training frame first, then to the test frame
train_data, test_data = extract_title(train_data), extract_title(test_data)

In [34]:
# Print the first five training rows to confirm that the 'Title' column is present
print(train_data.head(n=5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked Title  
0      0         A/5 21171   7.2500   NaN        S    Mr  
1      0          PC 17599  71.2833   C85        C   Mrs  
2      0  STON/O2. 3101282   7.9250   NaN        S  Miss  
3      0            113803  53.1000  C123        S   Mrs  
4      0            37345

In [35]:
# One-hot encode the 'Sex', 'Embarked', and 'Title' columns for both training and test datasets
categorical_cols = ['Sex', 'Embarked', 'Title']
train_data = pd.get_dummies(train_data, columns=categorical_cols)
test_data = pd.get_dummies(test_data, columns=categorical_cols)

# The two frames can end up with different dummy columns when a category level
# occurs in only one of them. Align the test frame to the training feature
# columns in one step: levels missing from the test set are added as all-zero
# columns, levels seen only in the test set are discarded, and the column order
# follows the training frame. The target 'Survived' is left out of the column
# list, so it is never introduced into the test features.
feature_cols = train_data.columns.drop('Survived')
test_data = test_data.reindex(columns=feature_cols, fill_value=0)

# Let's take a look at the first few rows of the training dataset to verify the changes
print(train_data.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0      1      0   
2                             Heikkinen, Miss. Laina  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0      1      0   
4                           Allen, Mr. William Henry  35.0      0      0   

             Ticket     Fare Cabin  ...  Title_Master  Title_Miss  Title_Mlle  \
0         A/5 21171   7.2500   NaN  ...         False       False       False   
1          PC 17599  71.2833   C85  ...         False       False       False   
2  STON/O2. 3101282   7.9250   NaN  ...         False        True       False   
3 

In [36]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Fill missing ages with the median age learned from the training set,
# and reuse that same fitted value for the test set.
age_imputer = SimpleImputer(strategy='median').fit(train_data[['Age']])
train_data['Age'] = age_imputer.transform(train_data[['Age']])
test_data['Age'] = age_imputer.transform(test_data[['Age']])

# 'Cabin' is mostly missing, so keep only a 0/1 flag saying whether a cabin
# value was recorded, then discard the sparse original column.
for frame in (train_data, test_data):
    frame['Cabin_known'] = frame['Cabin'].notna().astype('int64')
    frame.drop(columns='Cabin', inplace=True)

# Report the remaining missing-value counts, training frame first
for frame in (train_data, test_data):
    print(frame.isnull().sum())

PassengerId           0
Survived              0
Pclass                0
Name                  0
Age                   0
SibSp                 0
Parch                 0
Ticket                0
Fare                  0
Sex_female            0
Sex_male              0
Embarked_C            0
Embarked_Q            0
Embarked_S            0
Title_Capt            0
Title_Col             0
Title_Don             0
Title_Dr              0
Title_Jonkheer        0
Title_Lady            0
Title_Major           0
Title_Master          0
Title_Miss            0
Title_Mlle            0
Title_Mme             0
Title_Mr              0
Title_Mrs             0
Title_Ms              0
Title_Rev             0
Title_Sir             0
Title_the Countess    0
Cabin_known           0
dtype: int64
PassengerId           0
Pclass                0
Name                  0
Age                   0
SibSp                 0
Parch                 0
Ticket                0
Fare                  1
Sex_female            0
Sex

In [37]:
# Remove identifier-like columns that are not used as model features.
# PassengerId stays in the test frame because the submission file needs it.
train_data.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)
test_data.drop(columns=['Name', 'Ticket'], inplace=True)

# Rank every remaining column by its correlation with the target, largest first
correlation_matrix = train_data.corr()
survived_corr = correlation_matrix["Survived"]
print(survived_corr.sort_values(ascending=False))

# The ranking above can guide which further features, if any, to drop

Survived              1.000000
Sex_female            0.543351
Title_Mrs             0.339040
Title_Miss            0.327093
Cabin_known           0.316912
Fare                  0.257307
Embarked_C            0.168240
Title_Master          0.085221
Parch                 0.081629
Title_Mlle            0.060095
Title_Lady            0.042470
Title_Mme             0.042470
Title_the Countess    0.042470
Title_Sir             0.042470
Title_Ms              0.042470
Title_Major           0.011329
Title_Col             0.011329
Title_Dr              0.008185
Embarked_Q            0.003650
Title_Don            -0.026456
Title_Jonkheer       -0.026456
Title_Capt           -0.026456
SibSp                -0.035322
Age                  -0.064910
Title_Rev            -0.064988
Embarked_S           -0.155660
Pclass               -0.338481
Sex_male             -0.543351
Title_Mr             -0.549199
Name: Survived, dtype: float64


In [38]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Target column on one side, every other column as features on the other
y = train_data['Survived']
X = train_data.drop(columns='Survived')

# Hold out 20% of the rows for validation; the seed fixes the split
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the logistic regression model and fit it on the training portion
logreg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = logreg.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Per-class precision, recall and F1 for a closer look at the errors
print(classification_report(y_val, y_pred))

Validation Accuracy: 0.8268156424581006
              precision    recall  f1-score   support

           0       0.86      0.85      0.85       105
           1       0.79      0.80      0.79        74

    accuracy                           0.83       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179



In [40]:
# Count the missing values left in each test-set column
print(test_data.isna().sum())

PassengerId           0
Pclass                0
Age                   0
SibSp                 0
Parch                 0
Fare                  1
Sex_female            0
Sex_male              0
Embarked_C            0
Embarked_Q            0
Embarked_S            0
Title_Capt            0
Title_Col             0
Title_Don             0
Title_Dr              0
Title_Jonkheer        0
Title_Lady            0
Title_Major           0
Title_Master          0
Title_Miss            0
Title_Mlle            0
Title_Mme             0
Title_Mr              0
Title_Mrs             0
Title_Ms              0
Title_Rev             0
Title_Sir             0
Title_the Countess    0
Cabin_known           0
dtype: int64


In [41]:
from sklearn.impute import SimpleImputer

# Fill the missing test-set 'Fare' with the median fare of the TRAINING data.
# Fitting the imputer on the training frame keeps the preprocessing statistic
# independent of the test set.
fare_imputer = SimpleImputer(strategy='median')
fare_imputer.fit(train_data[['Fare']])
test_data['Fare'] = fare_imputer.transform(test_data[['Fare']])

# Predict survival for the test passengers; PassengerId is an identifier, not a feature
test_predictions = logreg.predict(test_data.drop('PassengerId', axis=1))

In [42]:
# Predict survival for the test passengers, leaving the identifier column out of the features
test_predictions = logreg.predict(test_data.drop(columns='PassengerId'))

In [43]:
# Pair each test passenger id with its predicted label in the submission layout
submission_columns = {
    "PassengerId": test_data['PassengerId'],
    "Survived": test_predictions,
}
submission = pd.DataFrame(submission_columns)

In [44]:
# Write the submission to a CSV file in the working directory, without the index column
submission.to_csv(path_or_buf='titanic_submission.csv', index=False)