<a href="https://colab.research.google.com/github/wasifullah7/prediction_survival_on_titanic_dataset/blob/main/prediction_survival_on_titanic_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import pickle

from sklearn import set_config
set_config(display='diagram')
import warnings
warnings.filterwarnings("ignore")

In [40]:
df= pd.read_csv("train.csv")

In [41]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [42]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['Fare'] = pd.qcut(df['Fare'], q=4, labels=['Low', 'Medium', 'High', 'Very High'])

df.drop(['PassengerId', 'Cabin','Ticket','Name','SibSp','Parch'], axis=1, inplace=True)

In [43]:
df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,0,3,male,22.0,Low,S,2
1,1,1,female,38.0,Very High,C,2
2,1,3,female,26.0,Medium,S,1
3,1,1,female,35.0,Very High,S,2
4,0,3,male,35.0,Medium,S,1
5,0,3,male,,Medium,Q,1
6,0,1,male,54.0,Very High,S,1
7,0,3,male,2.0,High,S,5
8,1,3,female,27.0,Medium,S,3
9,1,2,female,14.0,High,C,2


In [44]:
df.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
Fare,0
Embarked,2
FamilySize,0


In [45]:
X = df.drop("Survived", axis=1)
y = df["Survived"]

In [46]:
#trf1: Handling Missing Values

trf1 = ColumnTransformer(
    transformers=[
        ("imputer_age", SimpleImputer(strategy="most_frequent"), [2]),
        ("imputer_embark", SimpleImputer(strategy="most_frequent"), [4])
    ],
    remainder='passthrough'
)

In [47]:
# trf2: Handling Categorical Values

trf2 = ColumnTransformer(
    transformers=[
        ("ohe",OneHotEncoder(dtype=np.int32, drop='first',handle_unknown="ignore"),[1,3,4]),
    ],
    remainder='passthrough'
)

In [48]:
# trf3: Feature Scaling

trf3 = ColumnTransformer(
    transformers=[
        ("scale", StandardScaler(), slice(0,15))
    ])

In [49]:
# trf4 Feature Selection
trf4 = RFE(estimator= RandomForestClassifier(), n_features_to_select=3)

In [50]:
# trf5: Model
trf5 = RandomForestClassifier()

In [51]:
# Creating Pipeline
pipe = Pipeline([
    ("trf1",trf1),
    ("trf2",trf2),
    ("trf3",trf3),
    ("trf4",trf4),
    ("trf5",trf5)
])

In [52]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)


In [53]:
X_train


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize
793,1,male,,High,C,1
374,3,female,3.0,High,S,5
2,3,female,26.0,Medium,S,1
392,3,male,28.0,Medium,S,3
663,3,male,36.0,Low,S,1
...,...,...,...,...,...,...
42,3,male,,Low,C,1
296,3,male,23.5,Low,C,1
79,3,female,30.0,Medium,S,1
562,2,male,28.0,Medium,S,1


In [54]:
pipe.fit(X_train, y_train)

In [55]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('imputer_age',
                                  SimpleImputer(strategy='most_frequent'), [2]),
                                 ('imputer_embark',
                                  SimpleImputer(strategy='most_frequent'),
                                  [4])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe',
                                  OneHotEncoder(drop='first',
                                                dtype=<class 'numpy.int32'>,
                                                handle_unknown='ignore'),
                                  [1, 3, 4])]),
 'trf3': ColumnTransformer(transformers=[('scale', StandardScaler(),
                                  slice(0, 15, None))]),
 'trf4': RFE(estimator=RandomForestClassifier(), n_features_to_select=3),
 'trf5': RandomForestClassifier()}

In [56]:
y_pred = pipe.predict(X_test)

In [57]:
accuracy_score(y_test,y_pred)

0.8324022346368715

In [58]:
cross_val_score(pipe, X_train, y_train, cv=5, scoring="accuracy").mean()


0.7822810991825075

In [59]:
params = {
    'trf5__max_depth':[1,2,3,4,5,None]
}

In [60]:
grid = GridSearchCV(pipe, params, cv=5, scoring="accuracy")
grid.fit(X_train, y_train)

In [61]:
grid.best_score_

0.8019895597360387

In [62]:
!pip install streamlit pandas scikit-learn




In [63]:
import pickle

# Assuming `pipe` is your trained model
pickle.dump(pipe, open("pipe_data.pkl", "wb"))

In [64]:
sex_mapping = {'Male': 0, 'Female': 1}
fare_mapping = {'Low': 1, 'Medium': 2, 'High': 3, 'Very High': 4}

In [78]:
code = """
import streamlit as st
import pickle
import pandas as pd

# Load the trained pipeline
pipeline = pickle.load(open("pipe_data.pkl", "rb"))

# Function to make predictions
def predict_survival(pclass, sex, age, fare, embarked, family_size):
    # Encode categorical values
    sex_mapping = {'Male': 0, 'Female': 1}
    fare_mapping = {'Low': 1, 'Medium': 2, 'High': 3, 'Very High': 4}

    # Convert input data
    sex = sex_mapping[sex]
    fare = fare_mapping[fare]

    # Create DataFrame
    data = pd.DataFrame({
        'Pclass': [pclass],
        'Sex': [sex],
        'Age': [age],
        'Fare': [fare],
        'Embarked': [embarked],
        'FamilySize': [family_size]
    })

    # Make predictions
    prediction = pipeline.predict(data)

    return "Survived" if prediction[0] == 1 else "Did Not Survive"

# Streamlit UI
def main():
    st.title('Titanic Survival Prediction')
    st.write('Enter the passenger details to predict survival.')

    # User inputs
    pclass = st.selectbox('Pclass', [1, 2, 3])
    sex = st.radio('Sex', ['Male', 'Female'])
    age = st.number_input('Age', min_value=0, max_value=100, value=25)
    fare = st.selectbox('Fare', ['Low', 'Medium', 'High', 'Very High'])
    embarked = st.selectbox('Embarked', ['C', 'Q', 'S'])
    family_size = st.number_input('Family Size', min_value=0, max_value=10, value=1)

    # Predict button
    st.write("Developed by Wasif Ullah")

    if st.button('Predict'):
        result = predict_survival(pclass, sex, age, fare, embarked, family_size)
        st.success(f'The model predicts: **{result}**')

if __name__ == '__main__':
    main()
"""

# Save as a Python file
with open("app.py", "w") as file:
    file.write(code)


In [79]:
from google.colab import files
uploaded = files.upload()

In [80]:
!pip install streamlit pyngrok




In [71]:
!pkill -f ngrok
!pkill -f streamlit


In [73]:

!ngrok authtoken 2tcfH1CkMIZOxrNXqFB4ATvs4qy_5fz99JmNbRk9HN3opb776


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [81]:
%%writefile app.py
import streamlit as st
import pickle
import pandas as pd

# Load the trained pipeline
pipeline = pickle.load(open("pipe_data.pkl", "rb"))

def predict_survival(pclass, sex, age, fare, embarked, family_size):
    sex_mapping = {'Male': 0, 'Female': 1}
    fare_mapping = {'Low': 1, 'Medium': 2, 'High': 3, 'Very High': 4}

    # Convert input data
    sex = sex_mapping[sex]
    fare = fare_mapping[fare]

    # Create DataFrame
    data = pd.DataFrame({
        'Pclass': [pclass],
        'Sex': [sex],
        'Age': [age],
        'Fare': [fare],
        'Embarked': [embarked],
        'FamilySize': [family_size]
    })

    # Make predictions
    prediction = pipeline.predict(data)

    return "Survived" if prediction[0] == 1 else "Did Not Survive"

# Streamlit UI
def main():
    st.title('Titanic Survival Prediction')
    st.write('Enter the passenger details to predict survival.')

    pclass = st.selectbox('Pclass', [1, 2, 3])
    sex = st.radio('Sex', ['Male', 'Female'])
    age = st.number_input('Age', min_value=0, max_value=100, value=25)
    fare = st.selectbox('Fare', ['Low', 'Medium', 'High', 'Very High'])
    embarked = st.selectbox('Embarked', ['C', 'Q', 'S'])
    family_size = st.number_input('Family Size', min_value=0, max_value=10, value=1)
    st.write("🚀 Developed by Wasif Ullah")


    if st.button('Predict'):
        result = predict_survival(pclass, sex, age, fare, embarked, family_size)
        st.success(f'The model predicts: **{result}**')

if __name__ == '__main__':
    main()


Overwriting app.py


In [82]:
import os
import time
from pyngrok import ngrok

# Kill any running processes on port 8501
os.system("fuser -k 8501/tcp")

# Start Streamlit in the background
!nohup streamlit run app.py &

# Wait for Streamlit to start
time.sleep(5)

# Expose the Streamlit app to the internet
public_url = ngrok.connect(8501)
print(f"🚀 Streamlit app is running at: {public_url}")


nohup: appending output to 'nohup.out'




🚀 Streamlit app is running at: NgrokTunnel: "https://d937-35-197-35-238.ngrok-free.app" -> "http://localhost:8501"
