## 安装依赖

In [1]:
# !pip install mlbox
# !pip install requests
# !pip install pandas
# !pip install mxnet
# !pip install matplotlib
# !pip install d2l

## 读取数据

In [2]:
import pandas as pd

train_data = pd.read_csv('./kaggle_Titanic/train_data.csv')
test_data = pd.read_csv('./kaggle_Titanic/test_data.csv')

In [3]:
print(train_data.shape, test_data.shape)

(891, 12) (418, 11)


In [4]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 数据预处理

In [5]:
id_name = 'PassengerId'
label_name = 'Survived'

In [6]:
from mxnet import autograd, gluon, init, np, npx
from mxnet.gluon import nn
from d2l import mxnet as d2l

npx.set_np()

### 索引（非特征）

In [7]:
# 去除Id列
all_features = pd.concat((train_data.iloc[:, 1:], test_data.iloc[:, 1:]))
# 去除标签列
del all_features[label_name]
# 去除无用列
del all_features['Name']

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [8]:
all_features.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Parch,Pclass,Sex,SibSp,Ticket
0,22.0,,S,7.25,0,3,male,1,A/5 21171
1,38.0,C85,C,71.2833,0,1,female,1,PC 17599
2,26.0,,S,7.925,0,3,female,0,STON/O2. 3101282
3,35.0,C123,S,53.1,0,1,female,1,113803
4,35.0,,S,8.05,0,3,male,0,373450


### 数字特征

In [9]:
# 对非类别变量（数字），重新缩放到零均值和单位方差来标准化数据
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
# 在标准化数据之后，所有数据都意味着消失，因此我们可以将缺失值设置为0
all_features[numeric_features] = all_features[numeric_features].fillna(0)

In [10]:
all_features.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Parch,Pclass,Sex,SibSp,Ticket
0,-0.546789,,S,-0.50321,-0.444829,0.841595,male,0.481104,A/5 21171
1,0.563282,C85,C,0.733941,-0.444829,-1.545507,female,0.481104,PC 17599
2,-0.269271,,S,-0.490169,-0.444829,0.841595,female,-0.478904,STON/O2. 3101282
3,0.355144,C123,S,0.382632,-0.444829,-1.545507,female,0.481104,113803
4,0.355144,,S,-0.487754,-0.444829,0.841595,male,-0.478904,373450


### 离散特征

In [11]:
# `Dummy_na=True` 将“na”（缺失值）视为有效的特征值，并为其创建指示符特征。
all_features = pd.get_dummies(all_features, dummy_na=True)
all_features.shape

(1309, 1129)

In [12]:
all_features.head()

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,Cabin_A10,Cabin_A11,Cabin_A14,Cabin_A16,Cabin_A18,...,Ticket_W./C. 14260,Ticket_W./C. 14263,Ticket_W./C. 14266,Ticket_W./C. 6607,Ticket_W./C. 6608,Ticket_W./C. 6609,Ticket_W.E.P. 5734,Ticket_W/C 14208,Ticket_WE/P 5735,Ticket_nan
0,-0.546789,-0.50321,-0.444829,0.841595,0.481104,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.563282,0.733941,-0.444829,-1.545507,0.481104,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.269271,-0.490169,-0.444829,0.841595,-0.478904,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.355144,0.382632,-0.444829,-1.545507,0.481104,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.355144,-0.487754,-0.444829,0.841595,-0.478904,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 特征矩阵转换为张量

In [13]:
n_train = train_data.shape[0]
train_features = np.array(all_features[:n_train].values, dtype=d2l.float32)
test_features = np.array(all_features[n_train:].values, dtype=d2l.float32)
train_labels = np.array(train_data[label_name].values.reshape(-1, 1), dtype=d2l.float32)

In [14]:
print(train_features.shape, test_features.shape)
print(train_labels.shape)

(891, 1129) (418, 1129)
(891, 1)


In [15]:
train_csv = all_features[:n_train]
train_csv[label_name] = train_labels
train_csv.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(891, 1130)

In [16]:
test_csv = all_features[n_train:]
test_csv.shape

(418, 1129)

In [17]:
train_csv.to_csv('./tmp/train.csv', index=False)
test_csv.to_csv('./tmp/test.csv', index=False)

## 训练

In [18]:
from mlbox.preprocessing import *
from mlbox.optimisation import *
from mlbox.prediction import *

paths = ["./tmp/train.csv","./tmp/test.csv"]
target_name = "Survived"

rd = Reader(sep = ",")
df = rd.train_test_split(paths, target_name)

dft = Drift_thresholder()
df = dft.fit_transform(df)   #removing non-stable features (like ID,...)

opt = Optimiser(scoring = "accuracy", n_folds = 5)

space = {
    
        'est__strategy':{"search":"choice",
                                  "space":["LightGBM"]},    
        'est__n_estimators':{"search":"choice",
                                  "space":[150]},    
        'est__colsample_bytree':{"search":"uniform",
                                  "space":[0.8,0.95]},
        'est__subsample':{"search":"uniform",
                                  "space":[0.8,0.95]},
        'est__max_depth':{"search":"choice",
                                  "space":[5,6,7,8,9]},
        'est__learning_rate':{"search":"choice",
                                  "space":[0.07]} 
    
        }

params = opt.optimise(space, df,15)

prd = Predictor()
prd.fit_predict(params, df)


reading csv : train.csv ...
cleaning data ...
CPU time: 3.9145121574401855 seconds

reading csv : test.csv ...
cleaning data ...
CPU time: 1.964428186416626 seconds

> Number of common features : 1129

gathering and crunching for train and test datasets ...
reindexing for train and test datasets ...
dropping training duplicates ...
dropping constant variables on training set ...

> Number of categorical features: 0
> Number of numerical features: 840
> Number of training samples : 876
> Number of test samples : 418

> You have no missing values on train set...

> Task : classification
(array(0.),)    540
(array(1.),)    336
Name: Survived, dtype: int64

encoding target ...

computing drifts ...
CPU time: 22.063166856765747 seconds

> Top 10 drifts

('Embarked_S', 0.0766697253719606)
('Embarked_C', 0.05566297437241907)
('Parch', 0.043908806886456375)
('Fare', 0.04357562648838775)
('SibSp', 0.0365296803652968)
('Cabin_nan', 0.03202355203076168)
('Age', 0.02457888182473611)
('Embarked_Q'

  +str(self.to_path)+"/joblib'. Please clear it regularly.")


MEAN SCORE : accuracy = 0.8253636363636364            
VARIANCE : 0.017316495664388587 (fold 1 = 0.8068181818181818, fold 2 = 0.8514285714285714, fold 3 = 0.8285714285714286, fold 4 = 0.8057142857142857, fold 5 = 0.8342857142857143)
CPU time: 0.9440979957580566 seconds                  
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': '<NULL>'} 
>>> CA ENCODER :{'strategy': 'label_encoding'}                                   
>>> ESTIMATOR :{'strategy': 'LightGBM', 'colsample_bytree': 0.9185549708194477, 'learning_rate': 0.07, 'max_depth': 8, 'n_estimators': 150, 'subsample': 0.9159371241105856, 'boosting_type': 'gbdt', 'class_weight': None, 'importance_type': 'split', 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0

VARIANCE : 0.012869101089839822 (fold 1 = 0.8295454545454546, fold 2 = 0.8457142857142858, fold 3 = 0.8285714285714286, fold 4 = 0.8057142857142857, fold 5 = 0.8228571428571428)
CPU time: 0.756127119064331 seconds                                              
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': '<NULL>'} 
>>> CA ENCODER :{'strategy': 'label_encoding'}                                   
>>> ESTIMATOR :{'strategy': 'LightGBM', 'colsample_bytree': 0.8190587147935668, 'learning_rate': 0.07, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9056771338392877, 'boosting_type': 'gbdt', 'class_weight': None, 'importance_type': 'split', 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent

VARIANCE : 0.01854188914033737 (fold 1 = 0.8181818181818182, fold 2 = 0.8571428571428571, fold 3 = 0.8285714285714286, fold 4 = 0.8, fold 5 = 0.8285714285714286)
CPU time: 0.7104301452636719 seconds                                              
100%|██████████| 15/15 [00:11<00:00,  1.31trial/s, best loss: -0.8344935064935065]


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ BEST HYPER-PARAMETERS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

{'est__colsample_bytree': 0.8439809667877305, 'est__learning_rate': 0.07, 'est__max_depth': 5, 'est__n_estimators': 150, 'est__strategy': 'LightGBM', 'est__subsample': 0.8185122257639762}

fitting the pipeline ...
CPU time: 0.15535640716552734 seconds

> 

<mlbox.prediction.predictor.Predictor at 0x21603f233c8>

In [19]:
submit = pd.read_csv("./kaggle_Titanic/gender_submission.csv",sep=',')
preds = pd.read_csv("save/"+target_name+"_predictions.csv")

submit[target_name] =  preds[target_name+"_predicted"].values

submit.to_csv("./tmp/mlbox.csv", index=False)