<h1>Standardizing Without Pipeline</h1>

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv("titanic.csv")
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
data = data[["Age", "Fare", "Survived"]]
data1 = data[["Age", "Fare", "Survived"]]
data.head(3)

Unnamed: 0,Age,Fare,Survived
0,22.0,7.25,0
1,38.0,71.2833,1
2,26.0,7.925,1


In [4]:
data.describe()

Unnamed: 0,Age,Fare,Survived
count,714.0,891.0,891.0
mean,29.699118,32.204208,0.383838
std,14.526497,49.693429,0.486592
min,0.42,0.0,0.0
25%,20.125,7.9104,0.0
50%,28.0,14.4542,0.0
75%,38.0,31.0,1.0
max,80.0,512.3292,1.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Fare      891 non-null    float64
 2   Survived  891 non-null    int64  
dtypes: float64(2), int64(1)
memory usage: 21.0 KB


In [6]:
data.count()

Age         714
Fare        891
Survived    891
dtype: int64

In [7]:
data.isnull().sum()

Age         177
Fare          0
Survived      0
dtype: int64

<h1>Filling Missing Values</h1>

In [8]:
data["Age"] = data["Age"].fillna(data["Age"].mean())

In [9]:
data.isnull().sum()

Age         0
Fare        0
Survived    0
dtype: int64

In [10]:
data.head(5)

Unnamed: 0,Age,Fare,Survived
0,22.0,7.25,0
1,38.0,71.2833,1
2,26.0,7.925,1
3,35.0,53.1,1
4,35.0,8.05,0


<h1>Spliting Features</h1>

In [11]:
x = data[["Age", "Fare"]]
y = data["Survived"]

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = None)

<h1>Standardizing/Scaling_features</h1>

In [13]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

<h1>Training Model</h1>

In [14]:
model = LogisticRegression()

model.fit(x_train_scaled, y_train)

y_pred = model.predict(x_test_scaled)

accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.65


<h1>Training Model Removing Empty Rows Insted Of Filling Values With Mean()</h1>

In [15]:
data1.head()

Unnamed: 0,Age,Fare,Survived
0,22.0,7.25,0
1,38.0,71.2833,1
2,26.0,7.925,1
3,35.0,53.1,1
4,35.0,8.05,0


In [16]:
data1.isnull().sum()

Age         177
Fare          0
Survived      0
dtype: int64

In [17]:
# Drop rows with any NaN value in specific columns (e.g., 'Age', 'Fare', 'Survived')
data1 = data1.dropna(subset=['Age', 'Fare', 'Survived'])

# Or to drop rows with any NaN value in the whole DataFrame:
##data1 = df.dropna()

In [18]:
data1.isnull().sum()

Age         0
Fare        0
Survived    0
dtype: int64

In [19]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 714 entries, 0 to 890
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Fare      714 non-null    float64
 2   Survived  714 non-null    int64  
dtypes: float64(2), int64(1)
memory usage: 22.3 KB


In [20]:
x = data1[["Age", "Fare"]]
y = data1["Survived"]

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=None)

In [22]:
# Scaling Features:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [23]:
# Training the model:
model = LogisticRegression()
model.fit(x_train_scaled, y_train)
y_pred = model.predict(x_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy: {accuracy:.2f}")

accuracy: 0.67


<h1>Saving The Trained Model</h1>

In [24]:
import joblib

In [25]:
joblib.dump(model, "logistic_regression_titanic_model2")

['logistic_regression_titanic_model2']

<h1>Using A Saved Model</h1>

In [28]:
loaded_model = joblib.load("logistic_regression_titanic_model2.pkl")

<h1>Getting Data From User To Predict</h1>

In [43]:
age = float(input("Enter Age: "))
fare = float(input("Enter Fare: "))

new_data = pd.DataFrame([[age, fare]], columns = ["Age", "Fare"])

new_data_scaled = scaler.transform(new_data)

prediction = loaded_model.predict(new_data_scaled)

print(f"Prediction: Will Be Found {"Alive" if prediction[0] == 1 else "Dead"}.")

Enter Age:  65
Enter Fare:  87


Prediction: Will Be Found Dead.
