In [4]:
import pandas as pd

file_path = '/kaggle/input/titanic/Titanic.csv'

# загружаем датасет
df = pd.read_csv(file_path)

# выводим информацию о датасете
df.info()

# и 5 первых строчек
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# проверим количество пропущенных значений
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:

# обработаем пропущенные значения
# заполним пропущенные значения 'Age' медианным значением
df['Age'].fillna(df['Age'].median(), inplace=True)

# сбросим столбец 'Cabin', т.к. в нем слишком много пропусков
df.drop(columns='Cabin', inplace=True)

# заполним пропущенные 'Embarked' самым частым значением
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# конвертируем категориальные значения 'Sex' и 'Embarked' в числовые
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})  # М -> 0, Ж -> 1
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

# сбросим колонки которые не влияют на предсказание
df.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

# еще раз выведем часть подготовленного датасета
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,False,True
1,1,1,1,38.0,1,0,71.2833,False,False
2,1,3,1,26.0,0,0,7.925,False,True
3,1,1,1,35.0,1,0,53.1,False,True
4,0,3,0,35.0,0,0,8.05,False,True


In [7]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Survived'])
y = df['Survived']

# разделим датасет 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# выведем краткую информацию про полученные наборы
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((712, 8), (179, 8), (712,), (179,))

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# инициализируем модель Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# обучим модель
rf_model.fit(X_train, y_train)

# сделаем предсказания
y_pred = rf_model.predict(X_test)

# посчитаем точность
accuracy = accuracy_score(y_test, y_pred)
accuracy


0.7988826815642458

In [9]:
# построим матрицу несоответствий 
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

print(f"Model Accuracy: {accuracy * 100:.2f}%")


Confusion Matrix:
 [[88 17]
 [19 55]]
Model Accuracy: 79.89%


In [11]:
import joblib
model_filename = '/kaggle/working/titanic_random_forest_model.pkl'
joblib.dump(rf_model, model_filename)

['/kaggle/working/titanic_random_forest_model.pkl']