In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Load the data
df = pd.read_csv('tested.csv')

# Explore initial data
print(df.head())
print(df.info())
print(df.isnull().sum())

   PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (

In [35]:
print("Columns in DataFrame:", df.columns.tolist())

columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
columns_to_drop = [col for col in columns_to_drop if col in df.columns]

if columns_to_drop: 
    df.drop(columns_to_drop, axis=1, inplace=True)

Columns in DataFrame: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [37]:
df = df.assign(
    Age=df['Age'].fillna(df['Age'].median()),
    Fare=df['Fare'].fillna(df['Fare'].median()),
    Embarked=df['Embarked'].fillna(df['Embarked'].mode()[0])
)

print("After handling missing values:")
print(df[['Age', 'Fare', 'Embarked']].head())

df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

print("\nAfter feature engineering:")
print(df[['FamilySize', 'IsAlone']].head())

label_encoders = {}
for col in ['Sex', 'Embarked']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

print("\nAfter encoding categorical variables:")
print(df[['Sex', 'Embarked']].head())

df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, 100], 
                      labels=['Child', 'Teen', 'Young Adult', 'Adult', 'Senior'])
df['AgeGroup'] = LabelEncoder().fit_transform(df['AgeGroup'])

print("\nAfter creating age groups:")
print(df[['Age', 'AgeGroup']].head())

df['FareGroup'] = pd.qcut(df['Fare'], 4, labels=[0, 1, 2, 3])

print("\nAfter creating fare groups:")
print(df[['Fare', 'FareGroup']].head())

print("\nFinal DataFrame info:")
df.info()

After handling missing values:
    Age     Fare Embarked
0  34.5   7.8292        Q
1  47.0   7.0000        S
2  62.0   9.6875        Q
3  27.0   8.6625        S
4  22.0  12.2875        S

After feature engineering:
   FamilySize  IsAlone
0           1        1
1           2        0
2           1        1
3           1        1
4           3        0

After encoding categorical variables:
   Sex  Embarked
0    1         1
1    0         2
2    1         1
3    1         2
4    0         2

After creating age groups:
    Age  AgeGroup
0  34.5         4
1  47.0         0
2  62.0         2
3  27.0         4
4  22.0         4

After creating fare groups:
      Fare FareGroup
0   7.8292         0
1   7.0000         0
2   9.6875         1
3   8.6625         1
4  12.2875         1

Final DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0 

In [39]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 
            'FamilySize', 'IsAlone', 'AgeGroup', 'FareGroup']
X = df[features]
y = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("\nFirst 5 rows of training set:")
print(X_train.head())

scaler = StandardScaler()
num_cols = ['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize']
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

print("\nAfter scaling - first 5 rows of training set:")
print(X_train.head())

Training set shape: (334, 11)
Test set shape: (84, 11)

First 5 rows of training set:
     Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  FamilySize  IsAlone  \
336       2    1  32.0      0      0  13.0000         2           1        1   
31        2    1  24.0      2      0  31.5000         2           3        0   
84        2    1  27.0      0      0  10.7083         1           1        1   
287       1    1  24.0      1      0  82.2667         2           2        0   
317       2    1  19.0      0      0  10.5000         2           1        1   

     AgeGroup FareGroup  
336         4         1  
31          4         3  
84          4         1  
287         4         3  
317         4         1  

After scaling - first 5 rows of training set:
     Pclass  Sex       Age     SibSp     Parch      Fare  Embarked  \
336       2    1  0.196324 -0.498312 -0.431963 -0.411823         2   
31        2    1 -0.418174  1.621895 -0.431963 -0.092271         2   
84        2    1 -0.

In [43]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [45]:
importances = model.feature_importances_
feature_importance = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importance)


Feature Importances:
       Feature  Importance
1          Sex    0.812931
5         Fare    0.046856
2          Age    0.036645
10   FareGroup    0.021303
7   FamilySize    0.019238
6     Embarked    0.015761
4        Parch    0.011694
3        SibSp    0.010818
8      IsAlone    0.010037
9     AgeGroup    0.009081
0       Pclass    0.005634
