<a href="https://colab.research.google.com/github/sunlight2018/hands_on_ml3_notebooks/blob/main/notebooks/03_exe_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Tackle the Titanic dataset. A great place to start is on Kaggle. Alternatively, you can download the data from https://homl.info/titanic.tgz and unzip this tarball like you did for the housing data in Chapter 2. This will give you two CSV files, train.csv and test.csv, which you can load using pandas.read_csv(). The goal is to train a classifier that can predict the Survived column based on the other columns.

In [131]:

from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_titanic_data():
    """Fetch, unpack and load the Titanic training and test sets.

    Work is only done when it is needed: if both CSV files are already
    present under ``datasets/titanic`` nothing is downloaded or extracted.
    Otherwise the tarball is downloaded to ``datasets/titanic.tgz`` (unless it
    is already there) and unpacked into ``datasets``.

    Returns:
        list[pandas.DataFrame]: ``[train, test]``, read from ``train.csv`` and
        ``test.csv`` in that order.
    """
    datasets_dir = Path("datasets")
    tarball_path = datasets_dir / "titanic.tgz"
    csv_paths = [datasets_dir / "titanic" / filename
                 for filename in ("train.csv", "test.csv")]

    # Decide on extraction from the CSVs themselves rather than from the
    # tarball: a tarball left behind by an interrupted run (or a deleted
    # datasets/titanic folder) must still be unpacked.
    if not all(csv_path.is_file() for csv_path in csv_paths):
        if not tarball_path.is_file():
            datasets_dir.mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/titanic.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        # Use the "data" extraction filter where the running Python supports
        # it, so archive members cannot be written outside the target folder.
        extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        with tarfile.open(tarball_path) as titanic_tarball:
            titanic_tarball.extractall(path=datasets_dir, **extract_kwargs)

    return [pd.read_csv(csv_path) for csv_path in csv_paths]

In [132]:
# Load both CSVs; the helper returns them in [train, test] order.
train_data, test_data = load_titanic_data()

In [133]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

# Numeric branch: median-impute missing values, standardise, then apply PCA
# with n_components=0.95 (a float in (0, 1) is a target share of explained
# variance, per the scikit-learn PCA documentation).
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.95)),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array, ignoring categories not seen during fit.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Create new features to make better use of the raw columns
def add_engineered_features(data_raw):
    """Return a copy of ``data_raw`` with extra engineered columns.

    The input frame is not modified. It must provide the ``SibSp``, ``Parch``,
    ``Name``, ``Cabin`` and ``Age`` columns.

    Added columns:
        family_size: ``SibSp + Parch``.
        Title: the word preceding the first period in ``Name``; French
            variants are mapped to Miss/Mrs and a fixed list of uncommon
            titles is collapsed into ``'Rare'``. Names with no match yield NaN.
        cabin_letter: first character of ``Cabin``, or ``'U'`` when missing.
        age_bucket: ``Age`` floored to a multiple of 15, or ``-1`` when missing.
        is_alone: ``1`` when ``family_size`` is 0, else ``0``.

    Args:
        data_raw (pandas.DataFrame): raw passenger records.

    Returns:
        pandas.DataFrame: the copy with the five columns above added.
    """
    df = data_raw.copy()

    # Family size
    df["family_size"] = df["SibSp"] + df["Parch"]

    # Title from name. The pattern is a raw string so that the escaped period
    # is not an invalid string escape (a SyntaxWarning on recent Pythons).
    df["Title"] = df["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df["Title"] = df["Title"].replace(['Mlle', 'Ms'], 'Miss')
    df["Title"] = df["Title"].replace(['Mme'], 'Mrs')
    df["Title"] = df["Title"].replace(
        ['Dr', 'Rev', 'Col', 'Major', 'Jonkheer', 'Sir', 'Lady', 'Countess', 'Don', 'Dona', 'Capt'],
        'Rare'
    )

    # Cabin first letter ("U" stands in for a missing cabin)
    df["cabin_letter"] = df['Cabin'].fillna("U").str[0]

    # Age bucket: 15-year bins, with -1 marking a missing age
    df["age_bucket"] = (df["Age"] // 15 * 15).fillna(-1)

    # is_alone (binary)
    df["is_alone"] = (df["family_size"] == 0).astype(int)

    return df


# Route each column to its branch and join the two pipelines. Columns listed
# in neither group are not passed to either branch.
num_att = ["Fare", "Age"]
cat_att = [
    "Pclass", "Sex", "Embarked",
    "age_bucket", "family_size",
    "Title", "cabin_letter", "is_alone",
]

preprocessed_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_att),
    ("cat", cat_pipeline, cat_att),
])

# Drop the raw columns that are not fed to the model
def drop_useless_columns(df):
    """Return a new frame without the Name, SibSp, Parch, Cabin and Ticket columns.

    Raises:
        KeyError: if any of those columns is absent from ``df``.
    """
    unused_columns = ["Name", "SibSp", "Parch", "Cabin", "Ticket"]
    return df.drop(columns=unused_columns)


In [134]:
# Index the training rows by PassengerId. The guard keeps this cell re-runnable:
# once the column has become the index, a second set_index call would raise
# KeyError.
if "PassengerId" in train_data.columns:
    train_data = train_data.set_index("PassengerId")
# test_data is intentionally left un-indexed: the submission cell reads
# test_data.PassengerId as a regular column.
train_data_feature = add_engineered_features(train_data)
train_data_final = drop_useless_columns(train_data_feature)

In [135]:
# Preview the first rows of the engineered training frame.
train_data_final.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked,family_size,Title,cabin_letter,age_bucket,is_alone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,male,22.0,7.25,S,1,Mr,U,15.0,0
2,1,1,female,38.0,71.2833,C,1,Mrs,C,30.0,0
3,1,3,female,26.0,7.925,S,0,Miss,U,15.0,1
4,1,1,female,35.0,53.1,S,1,Mrs,C,30.0,0
5,0,3,male,35.0,8.05,S,0,Mr,U,30.0,1


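The training data is indexed by `PassengerId`, so the passenger ID is kept as a row label rather than a feature column.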

In [136]:
# Check dtypes and non-null counts to see which columns still need imputing.
train_data_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Survived      891 non-null    int64  
 1   Pclass        891 non-null    int64  
 2   Sex           891 non-null    object 
 3   Age           714 non-null    float64
 4   Fare          891 non-null    float64
 5   Embarked      889 non-null    object 
 6   family_size   891 non-null    int64  
 7   Title         891 non-null    object 
 8   cabin_letter  891 non-null    object 
 9   age_bucket    891 non-null    float64
 10  is_alone      891 non-null    int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 83.5+ KB


In [137]:
# Summary statistics for the numeric columns.
train_data_final.describe()

Unnamed: 0,Survived,Pclass,Age,Fare,family_size,age_bucket,is_alone
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699113,32.204208,0.904602,18.454545,0.602694
std,0.486592,0.836071,14.526507,49.693429,1.613459,16.247002,0.489615
min,0.0,1.0,0.4167,0.0,0.0,-1.0,0.0
25%,0.0,2.0,20.125,7.9104,0.0,0.0,0.0
50%,0.0,3.0,28.0,14.4542,0.0,15.0,1.0
75%,1.0,3.0,38.0,31.0,1.0,30.0,1.0
max,1.0,3.0,80.0,512.3292,10.0,75.0,1.0


In [138]:
#preprocess x and get x
# x_train_preprocessd = preprocessed_pipeline.fit_transform(train_data_final)
# x_train_preprocessd

In [139]:
# forest_clf = RandomForestClassifier(n_estimators= 100, max_depth= 9, random_state = 42)
# forest_clf.fit(train_data_final, y_train)

In [140]:
# x_test = preprocessed_pipeline.fit_transform(test_data)
# predictions = forest_clf.predict(x_test)

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Add the engineered features and drop the unused raw columns
train_data_featured = add_engineered_features(train_data)
train_data_final = drop_useless_columns(train_data_featured)

# Split into features X and target y
X = train_data_final.drop("Survived", axis=1)
y = train_data_final['Survived']

# Hold out 20% of the labelled rows; the grid search below never sees them
x_svc, x_target, y_svc, y_target = train_test_split(
    X, y,
    test_size= 0.2, random_state= 42)

# Preprocessing lives inside the pipeline so it is re-fit on each CV fold
full_pipeline = Pipeline([
    ('preprocessing', preprocessed_pipeline),
    ('svc', SVC())
])

param_grid_svc = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': ['scale', 0.001, 0.01, 0.1, 1, 'auto'],
    'svc__kernel': ['rbf']
}

grid_search_svc = GridSearchCV(full_pipeline, param_grid_svc, cv= 5, n_jobs= -1, verbose= 2)
grid_search_svc.fit(x_svc, y_svc)
print("best para: ", grid_search_svc.best_params_)
print(grid_search_svc.best_score_)
final_model = grid_search_svc.best_estimator_

# Score the selected pipeline on the held-out split. This uses the split and
# the accuracy_score import that were otherwise never used, and gives an
# estimate that is independent of the cross-validation score printed above.
holdout_accuracy = accuracy_score(y_target, final_model.predict(x_target))
print("hold-out accuracy: ", holdout_accuracy)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
best para:  {'svc__C': 10, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}
0.8370333891460652


In [141]:
# Run the test set through the same feature engineering as the training data.
# PassengerId is still a regular column here, so it is removed before predicting.
test_data_featured = add_engineered_features(test_data)
test_data_final = drop_useless_columns(test_data_featured)
test_data_final = test_data_final.drop(columns="PassengerId")

# Predict with the tuned pipeline (preprocessing + SVC)
predictions = final_model.predict(test_data_final)

In [142]:
# Assemble the two-column submission file: passenger id and predicted label.
output = pd.DataFrame(
    {
        "PassengerId": test_data["PassengerId"],
        "Survived": predictions,
    }
)
output.to_csv("submission.csv", index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
