In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

In [2]:
train_data = pd.read_csv('Titanic_train.csv')
test_data = pd.read_csv('Titanic_test.csv')

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
test_data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [7]:
train_data.drop(columns = ['Survived' , 'Cabin'], inplace = True)
test_data.drop(columns = ['Cabin'], inplace = True)

In [8]:
# Check for missing values
print("Missing values in training data:")
print(train_data.isnull().sum())
print("Missing values in testing data:")
print(test_data.isnull().sum())

Missing values in training data:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64
Missing values in testing data:
PassengerId     0
Pclass          0
Name            0
Sex             0
Age            86
SibSp           0
Parch           0
Ticket          0
Fare            1
Embarked        0
dtype: int64


In [9]:
# Impute missing values
imputer = SimpleImputer(strategy='median')
train_data['Age'] = imputer.fit_transform(train_data[['Age']])
test_data['Age'] = imputer.fit_transform(test_data[['Age']])

In [10]:
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)

In [11]:
# Check for missing values
print("Missing values in training data:")
print(train_data.isnull().sum())
print("Missing values in testing data:")
print(test_data.isnull().sum())

Missing values in training data:
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64
Missing values in testing data:
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [12]:
# Encode categorical variables
train_data = pd.get_dummies(train_data, columns=['Sex', 'Embarked'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Sex', 'Embarked'], drop_first=True)

In [13]:
print(train_data.shape)
print(test_data.shape)

(891, 11)
(418, 11)


In [14]:
# Drop columns that are not useful for the model
train_data.drop(columns=['Name', 'Ticket'], inplace=True)
test_data.drop(columns=['Name', 'Ticket'], inplace=True)

In [15]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Age          891 non-null    float64
 3   SibSp        891 non-null    int64  
 4   Parch        891 non-null    int64  
 5   Fare         891 non-null    float64
 6   Sex_male     891 non-null    uint8  
 7   Embarked_Q   891 non-null    uint8  
 8   Embarked_S   891 non-null    uint8  
dtypes: float64(2), int64(4), uint8(3)
memory usage: 44.5 KB


In [16]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Age          418 non-null    float64
 3   SibSp        418 non-null    int64  
 4   Parch        418 non-null    int64  
 5   Fare         418 non-null    float64
 6   Sex_male     418 non-null    uint8  
 7   Embarked_Q   418 non-null    uint8  
 8   Embarked_S   418 non-null    uint8  
dtypes: float64(2), int64(4), uint8(3)
memory usage: 20.9 KB


In [17]:
# Separate the target variable from the features
X_train = train_data.drop(columns=['Pclass'])
y_train = train_data['Pclass']

X_test = test_data.drop(columns=['Pclass'])
y_test = test_data['Pclass']


In [18]:
#Logistic regression and fit the model
classifier = LogisticRegression()
classifier.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
#Predict for X dataset
y_pred = classifier.predict(X_test)
y_pred

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 3, 1, 2, 3, 3, 3, 3, 1, 3,
       1, 1, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 1, 3, 2, 3, 2,
       1, 3, 1, 3, 1, 3, 1, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 1, 2,
       3, 1, 1, 1, 3, 3, 3, 1, 1, 1, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 2, 3, 1, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 2, 3, 3, 3, 3,
       3, 3, 1, 3, 1, 3, 3, 3, 1, 2, 2, 3, 1, 3, 3, 2, 3, 3, 3, 3, 3, 1,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 2, 2, 3, 1, 3, 2, 3, 1, 3, 3, 3,
       3, 3, 1, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 3, 1, 3, 3, 3, 3, 3, 3, 2,
       1, 1, 2, 1, 3, 1, 1, 3, 1, 3, 2, 3, 3, 3, 3, 2, 3, 3, 2, 3, 1, 3,
       3, 3, 3, 3, 1, 3, 3, 1, 3, 3, 1, 3, 2, 3, 1, 3, 3, 1, 3, 1, 1, 3,
       3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 1, 2, 3, 3, 1, 3, 1, 3, 3, 1, 1, 2,
       1, 3, 3, 1, 2, 3, 2, 3, 3, 3, 1, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 1, 3, 1, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 1, 3, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3,

In [20]:
len(y_pred)

418

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [22]:
print(f'Accuracy: {accuracy}')

Accuracy: 0.7751196172248804


In [23]:
print(f'Precision: {precision}')

Precision: 0.7588507050993052


In [24]:
print(f'Recall: {recall}')

Recall: 0.7751196172248804


In [25]:
print('Confusion Matrix:')
print(conf_matrix)

Confusion Matrix:
[[ 88  14   5]
 [  4  24  65]
 [  2   4 212]]


In [26]:
print('Classification Report:')
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           1       0.94      0.82      0.88       107
           2       0.57      0.26      0.36        93
           3       0.75      0.97      0.85       218

    accuracy                           0.78       418
   macro avg       0.75      0.68      0.69       418
weighted avg       0.76      0.78      0.75       418



In [27]:
import numpy as np

# Coefficients of the logistic regression model
coefficients = classifier.coef_[0]
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': coefficients
}).sort_values(by='Importance', ascending=False)

In [28]:
print('Feature Importance:')
print(feature_importance)

Feature Importance:
       Feature  Importance
4         Fare    0.121219
0  PassengerId   -0.000865
1          Age   -0.013486
6   Embarked_Q   -0.210927
5     Sex_male   -0.354858
7   Embarked_S   -0.674414
3        Parch   -0.957150
2        SibSp   -1.614695


# Deployment

In [29]:
#pip install streamlit

In [30]:
import streamlit as st

# Define a function for prediction
def predict_class(input_data):
    input_df = pd.DataFrame([input_data])
    prediction = classifier.predict(input_df)
    return prediction[0]

In [31]:
# Streamlit App
st.title('Titanic Passenger Class Prediction')

2024-06-09 14:23:55.727 
  command:

    streamlit run E:\Anaconda\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [32]:
# Input fields for the features
age = st.number_input('Age', min_value=0, max_value=100, value=25)
sib_sp = st.number_input('SibSp', min_value=0, max_value=10, value=0)
parch = st.number_input('Parch', min_value=0, max_value=10, value=0)
fare = st.number_input('Fare', min_value=0.0, max_value=1000.0, value=30.0)
sex_male = st.selectbox('Sex', ['male', 'female']) == 'male'
embarked_Q = st.selectbox('Embarked', ['Q', 'S', 'C']) == 'Q'
embarked_S = st.selectbox('Embarked', ['Q', 'S', 'C']) == 'S'

2024-06-09 14:23:57.105 Session state does not function when running a script without `streamlit run`


In [33]:
# Convert categorical inputs to numeric
sex_male = 1 if sex_male else 0
embarked_Q = 1 if embarked_Q else 0
embarked_S = 1 if embarked_S else 0

In [34]:
# Create a dictionary for the input data
input_data = {
    'Age': age,
    'SibSp': sib_sp,
    'Parch': parch,
    'Fare': fare,
    'Sex_male': sex_male,
    'Embarked_Q': embarked_Q,
    'Embarked_S': embarked_S
}

In [35]:
# Prediction button
if st.button('Predict Class'):
    prediction = predict_class(input_data)
    st.write(f'The predicted class is: {prediction}')