In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Load the training table; the first CSV column becomes the index.
X = pd.read_csv('/content/drive/MyDrive/MISSP/train.csv', index_col=0)
print(X.head())
# Keep the target as its own Series, then remove it from the feature frame.
y = X['Survived']
print(y.head())
X.drop(columns='Survived', inplace=True)
print(X.head())
# NOTE(review): train_size + test_size = 0.90, so 10% of the rows end up in
# neither split — confirm this is intentional.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.15,
    random_state=1,
)

             Survived  Pclass  ... Cabin Embarked
PassengerId                    ...               
1                   0       3  ...   NaN        S
2                   1       1  ...   C85        C
3                   1       3  ...   NaN        S
4                   1       1  ...  C123        S
5                   0       3  ...   NaN        S

[5 rows x 11 columns]
PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64
             Pclass  ... Embarked
PassengerId          ...         
1                 3  ...        S
2                 1  ...        C
3                 3  ...        S
4                 1  ...        S
5                 3  ...        S

[5 rows x 10 columns]


In [3]:
# Fill in missing ages with the column median
def fill_age(age_col):
    """Return a copy of ``age_col`` whose missing entries are set to its median.

    The median is taken from ``age_col`` itself, so each frame passed through
    this helper is filled with its own median.
    """
    median_age = age_col.median()
    filled = age_col.fillna(median_age)
    return filled
# Encode sex as a flag
def enum_sex(sex_col):
    """Return a boolean Series that is True where ``sex_col`` equals 'female'.

    Any other value (including a missing one) maps to False.
    """
    is_female = sex_col.eq('female')
    return is_female
# Select the numeric columns
def select_num(pd_frame):
    """Return a new frame holding only the numeric columns of ``pd_frame``.

    Column order is preserved. Every integer and floating-point width is
    accepted (int32, int64, float32, ...), not only the platform's default
    ``int``/``float``; boolean and object (string) columns are left out.

    Parameters
    ----------
    pd_frame : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        The numeric subset of ``pd_frame``.
    """
    # Comparing ``dtype == int`` only matches the platform default integer
    # width, which silently drops e.g. int32 or float32 columns.
    return pd_frame.select_dtypes(include='number')
# Data-processing function
def get_enumed_data(data):
    """Build the all-numeric feature frame from a raw passenger frame.

    Steps:
      * ``Age``: missing values replaced by the median of ``data.Age``.
      * ``Sex``: converted to a boolean column (True for 'female').
      * every other numeric column: kept, with missing values replaced by
        that column's median; non-numeric columns are dropped.

    All medians are computed from ``data`` itself. ``data`` is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that has at least the columns ``Age`` and ``Sex``.

    Returns
    -------
    pandas.DataFrame
        The remaining numeric columns followed by ``Age`` and ``Sex``.
    """
    age_col = fill_age(data.Age)
    sex_col = enum_sex(data.Sex)
    num_data = select_num(data.drop(['Age', 'Sex'], axis='columns'))
    # Fill every column with its own median in one call. The previous
    # ``num_data[col].fillna(..., inplace=True)`` was chained assignment,
    # which newer pandas warns about and which does not update ``num_data``
    # once Copy-on-Write is active.
    num_data = num_data.fillna(num_data.median())
    return pd.concat([num_data, age_col, sex_col], axis='columns')
# Sector assigned to passengers without a usable Cabin value. cabin_to_sector
# looks this module-level name up at call time, so reassigning it changes the
# result of later calls.
default_sector = 'A'
# Integer code for each sector letter (first character of the cabin string);
# 'nieznana' (Polish for "unknown") is an extra category with code 9.
sector_enum = {
    'nieznana': 9,
    **{letter: code for code, letter in enumerate('ABCDEFGT', start=1)},
}
# Data-processing function (with the cabin sector added)
def get_enumed_data_with_sector(data):
    """Return the frame from ``get_enumed_data`` plus a sector-code column.

    The extra column is ``data.Cabin`` passed element-wise through
    ``cabin_to_sector``; it is appended as the last column and keeps the
    name ``Cabin``.
    """
    base_features = get_enumed_data(data)
    sector_codes = data.Cabin.map(cabin_to_sector)
    return pd.concat([base_features, sector_codes], axis='columns')
# Several cabins on one ticket are not handled separately:
# only the first letter of the value is used.
def cabin_to_sector(name):
    """Map one Cabin value to its integer sector code.

    Parameters
    ----------
    name : object
        A cabin string such as ``'C85'``, or a missing value (e.g. NaN).

    Returns
    -------
    int
        ``sector_enum`` code of the first character of ``name``. Non-string
        or empty values, and strings whose first character is not a key of
        ``sector_enum``, get the code of the current ``default_sector``.
    """
    if isinstance(name, str) and name:
        first_letter = name[0]
        # An unrecognised letter used to raise KeyError and abort the whole
        # ``.map`` call; treat it like a missing cabin instead.
        if first_letter in sector_enum:
            return sector_enum[first_letter]
    # ``default_sector`` is read here, at call time, so callers may change it
    # between calls.
    return sector_enum[default_sector]

In [4]:
# Baseline run: non-numeric columns are skipped
X_train_num = get_enumed_data(X_train)
X_valid_num = get_enumed_data(X_valid)
forest = RandomForestRegressor(random_state=1).fit(X_train_num, y_train)
# The regressor's output is thresholded at 0.5 to get a boolean prediction.
y_predict = forest.predict(X_valid_num) >= 0.5
# NOTE: despite its name, `error` is the share of validation rows predicted
# correctly (higher is better).
n_correct = (y_predict == y_valid).sum()
error = n_correct / y_valid.count()
print(error)

0.7761194029850746


In [5]:
# Try every sector code as the stand-in for a missing cabin and re-score.
for sect in sector_enum:
    # cabin_to_sector reads this global at call time; after the loop it stays
    # set to the last key tried.
    default_sector = sect
    # Non-numeric columns are skipped
    X_train_num = get_enumed_data_with_sector(X_train)
    X_valid_num = get_enumed_data_with_sector(X_valid)
    forest = RandomForestRegressor(random_state=1).fit(X_train_num, y_train)
    y_predict = forest.predict(X_valid_num) >= 0.5
    # As in the baseline cell, `error` holds the share of correct predictions.
    n_correct = (y_predict == y_valid).sum()
    error = n_correct / y_valid.count()
    print(sect, error)
# No default value improves on the run without the sector column; most make it
# worse ('E' only ties it). Only part of the information in this column is used.
# In addition, many cells are empty.
# Based on the results below, this approach is ruled out.

nieznana 0.753731343283582
A 0.753731343283582
B 0.7686567164179104
C 0.7686567164179104
D 0.7686567164179104
E 0.7761194029850746
F 0.753731343283582
G 0.746268656716418
T 0.753731343283582


In [8]:
# Train on the full training table and write predictions for the test table.
X_test = pd.read_csv('/content/drive/MyDrive/MISSP/test.csv', index_col=0)
# Non-numeric columns are skipped.
# The processed frames get their own names so that the raw X / X_test stay
# intact: re-running this cell on an already processed X would compare the
# boolean Sex column with 'female' again and turn it into all False.
X_full_num = get_enumed_data(X)
X_test_num = get_enumed_data(X_test)
forest = RandomForestRegressor(random_state=1)
forest.fit(X_full_num, y)
# Sanity check: number of non-missing values per feature column.
print(X_test_num.notna().sum())
# Threshold the regressor's output at 0.5 and store it as 0/1 integers.
y_file = pd.Series(
    forest.predict(X_test_num) >= 0.5,
    index=X_test_num.index,
    name='Survived',
).astype(int)
y_file.to_csv('/content/drive/MyDrive/MISSP/result.csv')
y_file.head()

Pclass    418
SibSp     418
Parch     418
Fare      418
Age       418
Sex       418
dtype: int64


PassengerId
892    0
893    0
894    1
895    1
896    1
Name: Survived, dtype: int64

In [1]:
# Mount Google Drive at /content/drive (Google Colab only).
# NOTE(review): this cell has execution count 1 but sits last in the file; the
# read_csv calls use paths under /content/drive/MyDrive, so it presumably has
# to run before them — consider moving it to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
