In [None]:
import streamlit as st
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt


In [4]:

# Location of the Titanic CSV, relative to the notebook's working directory
TITANIC_CSV = "./datasets/titanic.csv"

# Load the raw table into a DataFrame
train_df = pd.read_csv(TITANIC_CSV)

# Bare last expression so the notebook renders the frame as the cell output
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
def manipulate_df(df):
    """Engineer the modelling columns on ``df`` and return the selected subset.

    The passed frame is modified in place: ``Sex`` is overwritten with a 0/1
    encoding, missing ``Age`` values are filled, and ``FirstClass``,
    ``SecondClass`` and ``ThirdClass`` indicator columns are added. Code later
    in the notebook reads these columns from the original frame, so this side
    effect is intentional.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Sex``, ``Age``, ``Pclass`` and ``Survived``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``Sex``, ``Age``, ``FirstClass``,
        ``SecondClass``, ``ThirdClass`` and ``Survived``, in that order.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    # Encode sex numerically: 'male' -> 0, anything else -> 1. A value that is
    # already 0 stays 0, so re-running the cell on an already-encoded frame
    # does not flip every passenger to 1.
    df['Sex'] = df['Sex'].map(lambda x: 0 if x in ('male', 0) else 1)

    # Fill missing ages with the mean age. The result is assigned back to the
    # column because a chained ``df['Age'].fillna(..., inplace=True)`` works on
    # a temporary object and does not update ``df`` under pandas Copy-on-Write.
    df['Age'] = df['Age'].fillna(df['Age'].mean())

    # One 0/1 indicator column per passenger class (first, second, third)
    class_columns = (('FirstClass', 1), ('SecondClass', 2), ('ThirdClass', 3))
    for column, pclass in class_columns:
        # ``pclass=pclass`` binds the current class value to this lambda
        df[column] = df['Pclass'].map(lambda x, pclass=pclass: 1 if x == pclass else 0)

    # Select the desired features together with the target column
    return df[['Sex', 'Age', 'FirstClass', 'SecondClass', 'ThirdClass', 'Survived']]


# Run the feature engineering on the loaded frame and keep the returned subset
manipulated_df = manipulate_df(df=train_df)

# Plain-text dump of the engineered frame
print(manipulated_df)

     Sex        Age  FirstClass  SecondClass  ThirdClass  Survived
0      0  22.000000           0            0           1         0
1      1  38.000000           1            0           0         1
2      1  26.000000           0            0           1         1
3      1  35.000000           1            0           0         1
4      0  35.000000           0            0           1         0
..   ...        ...         ...          ...         ...       ...
886    0  27.000000           0            1           0         0
887    1  19.000000           1            0           0         1
888    1  29.699118           0            0           1         0
889    0  26.000000           1            0           0         1
890    0  32.000000           0            0           1         0

[891 rows x 6 columns]


In [6]:
# Model inputs and target are taken from manipulated_df, the frame returned by
# manipulate_df, so this cell does not depend on columns being added to
# train_df as a side effect.
FEATURE_COLUMNS = ['Sex', 'Age', 'FirstClass', 'SecondClass', 'ThirdClass']
features = manipulated_df[FEATURE_COLUMNS]
survival = manipulated_df['Survived']

# Hold out 30% of the rows for testing. The fixed random_state makes the split,
# and therefore the scores reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    survival,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Learn the scaling statistics from the training rows only, so nothing about
# the test rows leaks into the scaler
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaling to both splits
train_features = scaler.transform(X_train)
test_features = scaler.transform(X_test)

# Show the standardized test matrix
print(test_features)

[[-0.74145212  1.46343199 -0.58043938 -0.52347744  0.936219  ]
 [-0.74145212 -0.01127375 -0.58043938  1.910302   -1.06812615]
 [ 1.34870476  1.38702586  1.7228328  -0.52347744 -1.06812615]
 ...
 [-0.74145212 -0.67593966 -0.58043938 -0.52347744  0.936219  ]
 [ 1.34870476 -2.20406227 -0.58043938 -0.52347744  0.936219  ]
 [-0.74145212  0.16452777 -0.58043938 -0.52347744  0.936219  ]]


In [8]:
# Fit a logistic-regression classifier on the scaled training features
# (fit returns the estimator itself, so the call can be chained)
model = LogisticRegression().fit(train_features, y_train)

# Class predictions for the held-out rows
y_predict = model.predict(test_features)

# Score the fitted model on the data it was trained on and on the held-out data
test_score = model.score(test_features, y_test)
train_score = model.score(train_features, y_train)

print("Training Score: ", train_score)
print("Testing Score: ", test_score)

Training Score:  0.8009630818619583
Testing Score:  0.7873134328358209
