<a href="https://colab.research.google.com/github/youssefwael397/titanic-decision-tree/blob/main/regression-decition-tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import csv

def generate_new_test_dataset(
    test_file,
    submission_file,
    output_file='/content/drive/MyDrive/titanic-dataset/test_with_survived.csv',
):
    """Write a copy of ``test_file`` with a ``Survived`` column appended.

    The ``Survived`` value of each row is looked up by ``PassengerId`` in
    ``submission_file``. Rows whose ``PassengerId`` is absent from the
    submission file get an empty string.

    Args:
        test_file: Path of the CSV to copy; must have a ``PassengerId`` column.
        submission_file: Path of a CSV with ``PassengerId`` and ``Survived``
            columns.
        output_file: Path of the CSV to create (overwritten if it exists).
            Defaults to the Drive location this function always wrote to.

    Raises:
        KeyError: If a required column is missing from either input file.
        OSError: If any of the files cannot be opened.
    """
    # The csv module expects files opened with newline="" so that line breaks
    # embedded in quoted fields are parsed correctly.
    with open(submission_file, "r", newline="") as submission_csv:
        survived_dict = {
            row["PassengerId"]: row["Survived"]
            for row in csv.DictReader(submission_csv)
        }

    with open(test_file, "r", newline="") as test_csv, \
            open(output_file, "w", newline="") as output_csv:
        reader = csv.DictReader(test_csv)

        # fieldnames is None for an empty input file; also avoid emitting a
        # duplicate header when the input already carries a Survived column.
        fieldnames = list(reader.fieldnames or [])
        if "Survived" not in fieldnames:
            fieldnames.append("Survived")

        writer = csv.DictWriter(output_csv, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            # Passengers missing from the submission file get an empty value.
            row["Survived"] = survived_dict.get(row["PassengerId"], "")
            writer.writerow(row)

    print("Column 'Survived' added successfully to the file.")


In [45]:
# Preparing the Titanic dataset

# Imports needed for the script
import numpy as np
import pandas as pd
import re
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score



%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont

# Google Drive must be mounted before any of the dataset paths below resolve.
drive.mount('/content/drive/')

# Single place that names the dataset folder on the mounted Drive.
DATA_DIR = '/content/drive/MyDrive/titanic-dataset'

# Inputs: the labelled training frame plus the two files used to rebuild a
# labelled test file.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test_without_survived = f'{DATA_DIR}/test.csv'
submission_file = f'{DATA_DIR}/gender_submission.csv'

# Build test_with_survived.csv (test.csv plus the Survived column taken from
# the submission file), then load it as the test frame.
generate_new_test_dataset(test_without_survived, submission_file)
test = pd.read_csv(f'{DATA_DIR}/test_with_survived.csv')

# Quick look at the first rows of the training data
train.head(3)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Column 'Survived' added successfully to the file.


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [46]:
# Keep an untouched copy of the training data as loaded
original_train = train.copy()

# Every step below is applied to both frames, which are modified in place
full_data = [train, test]

for dataset in full_data:
    # Has_Cabin: 1 when a cabin value is present, 0 when it is missing
    dataset['Has_Cabin'] = dataset['Cabin'].notnull().astype(int)

    # FamilySize: siblings/spouses + parents/children + the passenger
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # IsAlone: 1 when the passenger travels with no family members
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype(int)

    # Fill missing values by assigning the filled column back.
    # `dataset[col].fillna(..., inplace=True)` operates on a column selection:
    # it warns on pandas 2.x and leaves the frame unchanged under Copy-on-Write.
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    # Fare gaps use the training median for both frames ...
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
    # ... while Age gaps use the mean of the frame being processed.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].mean())

# Extract title from Name
def get_title(name):
    """Return the title embedded in a passenger name, or "" if none is found.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Mr" in
    "Braund, Mr. Owen Harris".

    Args:
        name: The passenger name. Non-string values (such as NaN for a
            missing name) yield "".

    Returns:
        The matched title without the trailing period, or "".
    """
    if not isinstance(name, str):
        return ""
    # Raw string: '\.' in a normal string literal is an invalid escape
    # sequence (SyntaxWarning on recent Python versions).
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

for dataset in full_data:
    # Title extracted from the Name column
    dataset['Title'] = dataset['Name'].apply(get_title)

    # Group non-common titles into 'Rare' and fold spelling variants together.
    # The result is assigned back instead of using
    # `dataset['Title'].replace(..., inplace=True)`, which acts on a column
    # selection: it warns on pandas 2.x and is a no-op under Copy-on-Write.
    dataset['Title'] = dataset['Title'].replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
         'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace(['Mlle', 'Ms'], 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

    # Mapping categorical features to integer codes
    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)
    # Titles outside the mapping become 0
    dataset['Title'] = dataset['Title'].map({'Mr': 1, 'Master': 2, 'Mrs': 3, 'Miss': 4, 'Rare': 5}).fillna(0).astype(int)
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    # Mapping Fare and Age into bins; labels=False stores the bin index
    dataset['Fare'] = pd.cut(dataset['Fare'], bins=[-np.inf, 7.91, 14.454, 31, np.inf], labels=False)
    dataset['Age'] = pd.cut(dataset['Age'], bins=[-np.inf, 16, 32, 48, 64, np.inf], labels=False)

# Remove columns that are not used as features. The frames are dropped in
# place so that `train`, `test` and the entries of `full_data` keep referring
# to the same objects.
drop_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train.drop(columns=drop_columns, inplace=True)
test.drop(columns=drop_columns, inplace=True)

train.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Has_Cabin,FamilySize,IsAlone,Title
0,0,3,1,1,1,0,0,0,0,2,0,1
1,1,1,0,2,1,0,3,1,1,2,0,3
2,1,3,0,1,0,0,1,0,0,1,1,4


In [47]:
# Target vector and feature matrix taken from the engineered training frame.
# The separately prepared `test` frame is not used in this cell; evaluation
# runs on a hold-out split of `train`.
y = train['Survived']
X = train.drop(columns=['Survived']).to_numpy()

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a regression tree on the training part and predict the hold-out part
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

print(f"y_pred: {y_pred}")

# Predictions of 0.5 or more count as class 1, everything else as class 0
y_pred_class = (y_pred >= 0.5).astype(int)

# Error of the raw predictions, then classification metrics of the
# thresholded labels
mae = mean_absolute_error(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred_class)
precision = precision_score(y_test, y_pred_class)
recall = recall_score(y_test, y_pred_class)

for label, value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("Accuracy:", accuracy),
    ("Mean Absolute Error (MAE):", mae),
):
    print(label, value)


y_pred: [0.5        0.08695652 0.125      1.         0.         1.
 0.71428571 0.         1.         1.         0.25       0.15384615
 0.         0.2        0.125      1.         0.25       0.71428571
 0.         0.         0.12280702 1.         0.66666667 0.125
 0.         0.         0.         0.08695652 0.         0.66666667
 0.12280702 0.66666667 0.         0.66666667 0.125      0.
 0.2        0.71428571 1.         0.12280702 0.         0.2
 0.125      0.11111111 0.66666667 0.         0.125      0.125
 0.12280702 1.         1.         1.         0.         1.
 0.         1.         0.08695652 1.         1.         0.71428571
 0.12280702 1.         0.8        0.5        0.11111111 1.
 0.         0.12280702 0.         1.         1.         1.
 0.5        1.         0.12280702 0.         0.71428571 1.
 1.         1.         0.         1.         1.         0.11111111
 0.         0.         1.         1.         0.         0.12280702
 1.         0.25       0.         0.11111111 0.12280