## **Data Loading and Exploration**

In [1]:
import pandas as pd

# Load the Titanic dataset from the CSV file
titanic = pd.read_csv('Titanic.csv')

# Display the first few rows of the dataset
print(titanic.head())

# Get a summary of the dataset
print(titanic.info())

# Get the statistics of the dataset
print(titanic.describe())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

## **Data Cleaning and Preprocessing**

In [2]:
# Handling missing values
# Fill missing values in the 'Age' column with the median
titanic['Age'].fillna(titanic['Age'].median(), inplace=True)

# Fill missing values in the 'Embarked' column with the mode
titanic['Embarked'].fillna(titanic['Embarked'].mode()[0], inplace=True)

# Drop the 'Cabin' column due to many missing values
titanic.drop('Cabin', axis=1, inplace=True)

# Drop rows with missing values in 'Fare' column
titanic.dropna(subset=['Fare'], inplace=True)

# Convert categorical variables to numerical ones
titanic = pd.get_dummies(titanic, columns=['Sex', 'Embarked'], drop_first=True)

# Drop columns that won't be used
titanic.drop(['Name', 'Ticket'], axis=1, inplace=True)

# Display the cleaned dataset
print(titanic.head())


   PassengerId  Survived  Pclass   Age  SibSp  Parch     Fare  Sex_male  \
0            1         0       3  22.0      1      0   7.2500      True   
1            2         1       1  38.0      1      0  71.2833     False   
2            3         1       3  26.0      0      0   7.9250     False   
3            4         1       1  35.0      1      0  53.1000     False   
4            5         0       3  35.0      0      0   8.0500      True   

   Embarked_Q  Embarked_S  
0       False        True  
1       False       False  
2       False        True  
3       False        True  
4       False        True  


## **Feature Engineering**

In [3]:
# Feature engineering: Create a new feature 'family_size'
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1

# Drop the 'SibSp' and 'Parch' columns
titanic.drop(['SibSp', 'Parch'], axis=1, inplace=True)

# Display the updated dataset
print(titanic.head())


   PassengerId  Survived  Pclass   Age     Fare  Sex_male  Embarked_Q  \
0            1         0       3  22.0   7.2500      True       False   
1            2         1       1  38.0  71.2833     False       False   
2            3         1       3  26.0   7.9250     False       False   
3            4         1       1  35.0  53.1000     False       False   
4            5         0       3  35.0   8.0500      True       False   

   Embarked_S  FamilySize  
0        True           2  
1       False           2  
2        True           1  
3        True           2  
4        True           1  


## **Model Building**

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Split the data into features and target
X = titanic.drop('Survived', axis=1)
y = titanic['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')


Accuracy: 0.8100558659217877
Confusion Matrix:
[[90 15]
 [19 55]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



## **Model Evaluation**

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the Titanic dataset
titanic = pd.read_csv('Titanic.csv')

# Display the first few rows of the dataset
print(titanic.head())

# Get a summary of the dataset
print(titanic.info())

# Get the statistics of the dataset
print(titanic.describe())

# Handling missing values
titanic['Age'].fillna(titanic['Age'].median(), inplace=True)
titanic['Embarked'].fillna(titanic['Embarked'].mode()[0], inplace=True)
titanic.drop('Cabin', axis=1, inplace=True)
titanic.dropna(subset=['Fare'], inplace=True)

# Convert categorical variables to numerical ones
titanic = pd.get_dummies(titanic, columns=['Sex', 'Embarked'], drop_first=True)

# Drop columns that won't be used
titanic.drop(['Name', 'Ticket'], axis=1, inplace=True)

# Feature engineering: Create a new feature 'family_size'
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1
titanic.drop(['SibSp', 'Parch'], axis=1, inplace=True)

# Display the cleaned dataset
print(titanic.head())

# Split the data into features and target
X = titanic.drop('Survived', axis=1)
y = titanic['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c