In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


In [2]:
# Directory that holds the raw train/test CSV files.
# Set the TITANIC_DATA_DIR environment variable to run this notebook on another
# machine; when it is unset, the original local directory is used as a fallback.
DATA_DIR = Path(os.environ.get('TITANIC_DATA_DIR', r'C:\Users\yossr\Documents\MlOps proj\data\raw'))

# Labelled training data and the test data to predict on.
train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test.csv')

In [3]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Target vector: the 'Survived' column of the training frame.
y_train = train_df['Survived']

# Feature frames: drop the target (training frame only) together with the
# 'Name', 'Ticket', 'Cabin' and 'PassengerId' columns.
X_train = train_df.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin', 'PassengerId'])
X_test = test_df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

In [5]:
# Columns handled as numeric inputs.
numeric_features = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

# Columns handled as categorical inputs.
categorical_features = [
    'Pclass',
    'Sex',
    'Embarked',
]

In [6]:
# Numeric branch: a single step that fills missing values with the column median.
numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])

In [7]:
# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps transform from raising on
# categories that were not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [8]:
# Route each column group to its own branch. No `remainder` is given, so the
# ColumnTransformer default applies to columns outside the two lists.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [9]:
# End-to-end estimator: preprocessing followed by a random forest with
# 10 trees and a fixed random_state so repeated runs give the same model.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=10)),
])


In [10]:
# Fit the preprocessing steps and the classifier on the training data.
model.fit(X_train, y=y_train)

In [11]:
# Predict a label for every row of the test features with the fitted pipeline.
predictions = model.predict(X=X_test)

In [12]:
# Pair each test passenger's id with its predicted label.
test_predictions = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Predicted_Survival': predictions
})

# Emit the heading (preceded by a blank line) and then the table as plain text.
print("\nPredictions for test data:", test_predictions, sep="\n")



Predictions for test data:
     PassengerId  Predicted_Survival
0            892                   0
1            893                   0
2            894                   0
3            895                   1
4            896                   1
..           ...                 ...
413         1305                   0
414         1306                   1
415         1307                   0
416         1308                   0
417         1309                   1

[418 rows x 2 columns]
