## Basic workflow

In [None]:
import pandas as pd

# path of the data to read
path = 'train.csv'

data = pd.read_csv(path)

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
y = data.target
X = data.drop(['target'], axis=1)
X = X.drop(['id'], axis=1)

In [None]:
del data

In [None]:
from sklearn.feature_selection import VarianceThreshold
# 使用VarianceThreshold来过滤掉方差在0.01以下的特征
selector = VarianceThreshold(threshold=0.01)
X = selector.fit_transform(X)

In [None]:
x_col = X.columns[X.var()<=0.01]  # 提取方差小于等于的字段
x_var = X.loc[:,[i for i in X.columns if i not in x_col]] # 列表表达式过滤方差为0.01的字段

In [None]:
X.var()<=0.01

In [None]:
selected_feature = x_var.columns.tolist()

In [None]:
len(x_var.columns.tolist())

In [None]:
from sklearn.model_selection import train_test_split
# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(x_var, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

In [None]:
del X
del y

In [None]:
print(type(X_train_full))

In [None]:
X_train_full = pd.DataFrame(X_train_full)
X_valid_full = pd.DataFrame(X_valid_full)
y_train = pd.DataFrame(y_train)
y_valid = pd.DataFrame(y_valid)

In [None]:
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [None]:
X_train.head()

### Define preprocesser

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='median')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

### Define model
use XGBoost here

In [None]:
from xgboost import XGBRegressor
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)

In [None]:
from sklearn.metrics import roc_auc_score

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', my_model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = roc_auc_score(y_valid, preds)
print('AUC:', score)

In [None]:
del preds
del X_train
del y_train

In [None]:
import joblib
#save model
joblib.dump(my_pipeline, 'my_pipeline.pkl')

In [None]:
import joblib
#save model
joblib.dump(my_model, 'my_model.pkl')

In [105]:
path = 'test.csv'
test_data = pd.read_csv(path)

In [None]:
test_data_selected = test_data[selected_feature]

In [None]:
test_data_selected.head

In [None]:
test_data = pd.DataFrame(test_data)

In [None]:
del test_data

In [None]:
X_train.head

In [None]:
order = X_train.columns.tolist()

In [96]:
test_data_selected = test_data_selected[order]

In [98]:
test_data_selected.columns

Index(['f1', 'f2', 'f4', 'f6', 'f8', 'f9', 'f10', 'f22', 'f23', 'f32',
       ...
       'f275', 'f276', 'f277', 'f278', 'f279', 'f280', 'f281', 'f282', 'f283',
       'f284'],
      dtype='object', length=129)

In [102]:
my_model.get_booster().feature_names

['f0',
 'f1',
 'f2',
 'f3',
 'f4',
 'f5',
 'f6',
 'f7',
 'f8',
 'f9',
 'f10',
 'f11',
 'f12',
 'f13',
 'f14',
 'f15',
 'f16',
 'f17',
 'f18',
 'f19',
 'f20',
 'f21',
 'f22',
 'f23',
 'f24',
 'f25',
 'f26',
 'f27',
 'f28',
 'f29',
 'f30',
 'f31',
 'f32',
 'f33',
 'f34',
 'f35',
 'f36',
 'f37',
 'f38',
 'f39',
 'f40',
 'f41',
 'f42',
 'f43',
 'f44',
 'f45',
 'f46',
 'f47',
 'f48',
 'f49',
 'f50',
 'f51',
 'f52',
 'f53',
 'f54',
 'f55',
 'f56',
 'f57',
 'f58',
 'f59',
 'f60',
 'f61',
 'f62',
 'f63',
 'f64',
 'f65',
 'f66',
 'f67',
 'f68',
 'f69',
 'f70',
 'f71',
 'f72',
 'f73',
 'f74',
 'f75',
 'f76',
 'f77',
 'f78',
 'f79',
 'f80',
 'f81',
 'f82',
 'f83',
 'f84',
 'f85',
 'f86',
 'f87',
 'f88',
 'f89',
 'f90',
 'f91',
 'f92',
 'f93',
 'f94',
 'f95',
 'f96',
 'f97',
 'f98',
 'f99',
 'f100',
 'f101',
 'f102',
 'f103',
 'f104',
 'f105',
 'f106',
 'f107',
 'f108',
 'f109',
 'f110',
 'f111',
 'f112',
 'f113',
 'f114',
 'f115',
 'f116',
 'f117',
 'f118',
 'f119',
 'f120',
 'f121',
 'f122',
 'f

In [103]:
test_preds = my_model.predict(test_data_selected.values,validate_features=False)

In [107]:
type(test_preds)

numpy.ndarray

In [108]:
type(test_data)

pandas.core.frame.DataFrame

In [109]:
test_data_id = test_data.values


In [113]:
test_data.head

<bound method NDFrame.head of              id        f0        f1        f2        f3        f4        f5  \
0       1000000  0.178216  0.435617  0.010230  0.202074  0.390170  0.324221   
1       1000001  0.181250  0.476455  0.022413  0.283146  0.598020  0.349508   
2       1000002  0.159721  0.451202  0.259649  0.365274  0.594634  0.413502   
3       1000003  0.182424  0.520976  0.095344  0.327742  0.741830  0.358711   
4       1000004  0.229329  0.336513  0.023511  0.300913  0.668738  0.481586   
...         ...       ...       ...       ...       ...       ...       ...   
499995  1499995  0.185473  0.525338  0.014945  0.204029  0.498957  0.431933   
499996  1499996  0.183194  0.553266  0.008055  0.280651  0.636591  0.478092   
499997  1499997  0.184650  0.533643  0.011218  0.201262  0.720698  0.472888   
499998  1499998  0.227731  0.513247  0.178603  0.313778  0.665656  0.401365   
499999  1499999  0.185561  0.406225  0.097598  0.327129  0.655168  0.384097   

              f6     

In [114]:


output = pd.DataFrame({'Id': test_data.id,
                       'target': test_preds})
output.to_csv('submission.csv', index=False)

发现这题是二分类。。。之前的输出都是连续值。。。

In [1]:
import pandas as pd
path = 'submission.csv'
preds_data = pd.read_csv(path)

In [2]:
preds_data

Unnamed: 0,id,target
0,1000000,0.673872
1,1000001,0.370516
2,1000002,0.873733
3,1000003,0.974947
4,1000004,0.172119
...,...,...
499995,1499995,1.090563
499996,1499996,0.812192
499997,1499997,0.423530
499998,1499998,0.837297


In [6]:
target = preds_data.target.tolist()

In [7]:
target

[0.6738716,
 0.37051573,
 1.0,
 1.0,
 0.17211878,
 0.15579236,
 -0.023308465,
 0.30613187,
 1.0,
 0.70023745,
 0.7625517,
 0.48157328,
 0.29143247,
 0.41463357,
 0.40566832,
 0.26467663,
 0.34873998,
 0.43680778,
 0.19884855,
 1.0,
 0.13086101,
 0.14126964,
 0.7659106,
 0.25051492,
 0.12294753,
 0.6832532,
 0.32243252,
 0.14244536,
 1.0,
 1.0,
 0.48428765,
 0.1859971,
 0.3282411,
 0.77299947,
 0.7879226,
 0.31318697,
 0.79546857,
 0.14536591,
 0.68908143,
 0.554945,
 0.6198994,
 0.31100184,
 0.7857468,
 1.0,
 0.32356927,
 0.30479804,
 0.2837769,
 1.0,
 0.78299606,
 0.27590021,
 1.0,
 0.5213563,
 0.06460572,
 0.3842411,
 0.27913317,
 1.0,
 0.7206481,
 0.20037839,
 0.45614654,
 1.0,
 0.39267775,
 1.0,
 0.40924856,
 1.0,
 0.1774267,
 1.0,
 1.0,
 0.79108757,
 0.08631733,
 0.26756135,
 0.718564,
 0.07431234,
 0.57877576,
 0.5657011,
 0.29985842,
 0.28631288,
 0.37831193,
 0.5830167,
 0.2622846,
 1.0,
 1.0,
 0.29107237,
 0.67498934,
 0.23422182,
 0.15620619,
 0.08852683,
 0.19930477,
 0.3175

In [13]:
for i in range(len(target)):
    if target[i] > 0.75:
        target[i] = 1
    elif target[i] < 0.3:
        target[i] = 0
            
            

In [14]:
target

[0.6738716,
 0.37051573,
 1,
 1,
 0,
 0,
 0,
 0.30613187,
 1,
 0.70023745,
 1,
 0.48157328,
 0,
 0.41463357,
 0.40566832,
 0,
 0.34873998,
 0.43680778,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0.6832532,
 0.32243252,
 0,
 1,
 1,
 0.48428765,
 0,
 0.3282411,
 1,
 1,
 0.31318697,
 1,
 0,
 0.68908143,
 0.554945,
 0.6198994,
 0.31100184,
 1,
 1,
 0.32356927,
 0.30479804,
 0,
 1,
 1,
 0,
 1,
 0.5213563,
 0,
 0.3842411,
 0,
 1,
 0.7206481,
 0,
 0.45614654,
 1,
 0.39267775,
 1,
 0.40924856,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0.718564,
 0,
 0.57877576,
 0.5657011,
 0,
 0,
 0.37831193,
 0.5830167,
 0,
 1,
 1,
 0,
 0.67498934,
 0,
 0,
 0,
 0,
 0.31757867,
 0,
 1,
 0.72639936,
 0.5692356,
 0.3289419,
 0.7105276,
 0,
 0,
 0.6261464,
 0.5451734,
 0.34023497,
 1,
 0,
 1,
 0,
 0.5202022,
 1,
 1,
 0.37083265,
 0.3609733,
 0,
 0.55433273,
 0.70581174,
 1,
 0.35199937,
 0.60602546,
 1,
 0,
 0,
 0,
 0,
 0.45585093,
 0.44276765,
 0.3593418,
 0.62284636,
 1,
 0,
 0,
 1,
 0.5945578,
 0,
 0.5330788,
 1,
 0,
 0,
 0.3447785,
 0

In [15]:
preds_data.target = target

In [16]:
type(preds_data)

pandas.core.frame.DataFrame

In [17]:
preds_data

Unnamed: 0,id,target
0,1000000,0.673872
1,1000001,0.370516
2,1000002,1.000000
3,1000003,1.000000
4,1000004,0.000000
...,...,...
499995,1499995,1.000000
499996,1499996,1.000000
499997,1499997,0.423530
499998,1499998,1.000000


In [18]:
output = preds_data
output.to_csv('submission.csv', index=False)

## Feature engineering using sweetviz(Auto EDA)

In [2]:
import pandas as pd
import sweetviz

In [26]:
train = pd.read_csv("train.csv")
# test = pd.read_csv("test.csv")

In [14]:
del test

In [4]:
#check the 1st 5 rows
train.head()
#check the last 5 rows
train.tail()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f276,f277,f278,f279,f280,f281,f282,f283,f284,target
999995,999995,0.204312,0.344754,0.262267,0.228333,0.610727,0.357463,0.490586,0.613655,0.509203,...,0,0,0,1,0,0,1,0,0,1
999996,999996,0.182004,0.564019,0.242564,0.241178,0.453623,0.469513,0.477518,0.659226,0.519219,...,0,0,0,0,0,0,0,0,1,0
999997,999997,0.250304,0.491553,0.098547,0.235656,0.771272,0.368018,0.531642,0.598111,0.618474,...,0,0,0,0,0,0,0,0,0,0
999998,999998,0.203572,0.534923,0.180118,0.213109,0.654544,0.535152,0.316271,0.652522,0.398026,...,0,0,0,0,0,0,0,0,0,1
999999,999999,0.160972,0.596308,0.013061,0.280355,0.579849,0.401904,0.49387,0.611947,0.53137,...,0,0,0,0,0,0,0,0,0,0


In [8]:
#sv可视化
#设置跳过passengerID
feature_config = sweetviz.FeatureConfig(skip="id")
sv_report = sweetviz.analyze(train ,target_feat = 'target',feat_cfg = feature_config,pairwise_analysis='off')
sv_report.show_html()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, layout=Layout(flex='2'), max=287.0), HTML(value='')), …


Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [9]:
type(train)

pandas.core.frame.DataFrame

In [12]:
train = train.drop(['id'], axis=1)

In [11]:
y = train.target
train.drop(['target'], axis=1)

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284
0,0,0.205979,0.410993,0.176775,0.223581,0.423543,0.476140,0.413590,0.612021,0.534873,...,0,0,1,0,0,0,0,0,0,0
1,1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,...,1,0,1,0,0,0,0,0,0,0
2,2,0.182583,0.307431,0.325950,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,...,0,0,0,0,1,1,0,0,0,0
3,3,0.180240,0.494592,0.008367,0.223580,0.760618,0.439211,0.432055,0.776147,0.483958,...,1,0,0,0,0,1,0,0,0,0
4,4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,...,0,0,1,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,999995,0.204312,0.344754,0.262267,0.228333,0.610727,0.357463,0.490586,0.613655,0.509203,...,0,0,0,0,1,0,0,1,0,0
999996,999996,0.182004,0.564019,0.242564,0.241178,0.453623,0.469513,0.477518,0.659226,0.519219,...,0,0,0,0,0,0,0,0,0,1
999997,999997,0.250304,0.491553,0.098547,0.235656,0.771272,0.368018,0.531642,0.598111,0.618474,...,0,0,0,0,0,0,0,0,0,0
999998,999998,0.203572,0.534923,0.180118,0.213109,0.654544,0.535152,0.316271,0.652522,0.398026,...,1,0,0,0,0,0,0,0,0,0


PCA数据降维

In [13]:
import sklearn.decomposition as sk_decomposition
pca = sk_decomposition.PCA(n_components='mle',whiten=False,svd_solver='auto')
pca.fit(train)
train = pca.transform(train)
#reduced_X为降维后的数据
print('PCA:')
print ('降维后的各主成分的方差值占总方差值的比例',pca.explained_variance_ratio_)
print ('降维后的各主成分的方差值',pca.explained_variance_)
print ('降维后的特征数',pca.n_components_)

PCA:
降维后的各主成分的方差值占总方差值的比例 [3.12162638e-02 2.07585666e-02 2.06857222e-02 2.06328849e-02
 2.05178203e-02 2.03945490e-02 2.02966810e-02 2.02508922e-02
 2.01819524e-02 1.99580523e-02 1.99487861e-02 1.98884221e-02
 1.96754767e-02 1.96018686e-02 1.88690371e-02 1.86974278e-02
 1.86355549e-02 1.84920207e-02 1.83032760e-02 1.79484022e-02
 1.76947210e-02 1.73850792e-02 1.67215751e-02 1.66260445e-02
 1.65357515e-02 1.61253144e-02 1.59626188e-02 1.57443645e-02
 1.54635904e-02 1.52553853e-02 1.48765635e-02 1.38426561e-02
 1.33849134e-02 1.31706296e-02 1.29945502e-02 1.26726813e-02
 1.25438072e-02 1.23684682e-02 1.18534308e-02 1.16006929e-02
 1.08670390e-02 1.07904608e-02 1.02331788e-02 9.98005816e-03
 9.84944456e-03 9.74422636e-03 9.36141546e-03 7.51028552e-03
 6.98030657e-03 6.40857458e-03 6.34364010e-03 6.06583832e-03
 5.73866136e-03 5.70954594e-03 4.84772294e-03 4.53951065e-03
 4.52924159e-03 4.02331344e-03 3.94492130e-03 3.10835604e-03
 2.87841157e-03 2.45529242e-03 2.43718880e-03 2.36769838e-0

In [19]:
feature_selected = ['f22', 'f179', 'f69', 'f58', 'f78', 'f44', 'f138', 'f139', 'f144', 'f146', 'f157', 'f158', 'f160']
feature_selected.sort()

In [33]:
print(feature_selected)
type(feature_selected)

['f138', 'f139', 'f144', 'f146', 'f157', 'f158', 'f160', 'f179', 'f22', 'f44', 'f58', 'f69', 'f78']


list

In [34]:
X_train = train[feature_selected]

In [35]:
X_train.head()

Unnamed: 0,f138,f139,f144,f146,f157,f158,f160,f179,f22,f44,f58,f69,f78
0,0.024197,0.044097,0.406843,0.799434,0.595877,0.542969,0.020373,0.112764,1,0.19343,0.075109,0.199588,0.198157
1,0.861636,0.53655,0.034981,0.772612,0.041795,0.63295,0.013255,0.008115,1,0.821982,0.241071,0.228739,0.23261
2,0.029521,0.573076,0.738622,0.749761,0.68611,0.497687,0.02834,0.011306,0,0.162094,0.078051,0.164643,0.261242
3,0.731664,0.651212,0.692899,0.132668,0.602061,0.023516,0.699298,0.012912,1,0.834834,0.177084,0.179141,0.17846
4,0.019616,0.541864,0.02739,0.798959,0.56418,0.023965,0.010334,0.188779,0,0.844187,0.085606,0.149717,0.189286


In [36]:
del train

In [37]:
from sklearn.model_selection import train_test_split
# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_train, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

In [38]:
X_train_full = pd.DataFrame(X_train_full)
X_valid_full = pd.DataFrame(X_valid_full)
y_train = pd.DataFrame(y_train)
y_valid = pd.DataFrame(y_valid)

In [39]:
from xgboost import XGBRegressor
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)

In [40]:
my_model.fit(X_train_full, y_train,
             early_stopping_rounds=5, 
             eval_set=[(X_valid_full, y_valid)], 
             verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [42]:
from sklearn.metrics import roc_auc_score
# Preprocessing of validation data, get predictions
preds = my_model.predict(X_valid_full)

# Evaluate the model
score = roc_auc_score(y_valid, preds)
print('AUC:', score)

AUC: 0.8223778163762234


In [43]:
del X_train_full
del X_valid_full
del y_train
del y_valid

In [52]:
path = 'test.csv'
test_data = pd.read_csv(path)

In [45]:
import joblib
#save model
joblib.dump(my_model, 'my_model.pkl')

['my_model.pkl']

In [46]:
test_data_selected = test_data[feature_selected]

In [47]:
test_data_selected.head

<bound method NDFrame.head of             f138      f139      f144      f146      f157      f158      f160  \
0       0.855089  0.533881  0.022663  0.747389  0.548862  0.009272  0.022934   
1       0.042258  0.661955  0.700828  0.662376  0.035432  0.012739  0.008219   
2       0.982384  0.687878  0.543080  0.773109  0.548459  0.021391  0.016886   
3       0.931131  0.560195  0.694926  0.501801  0.501765  0.026940  0.025861   
4       0.748340  0.649976  0.024980  0.625432  0.549611  0.010037  0.017138   
...          ...       ...       ...       ...       ...       ...       ...   
499995  0.872092  0.550266  0.727438  0.614350  0.699455  0.026255  0.845976   
499996  0.025690  0.506743  0.773885  0.601497  0.055352  0.020191  0.012396   
499997  0.034115  0.650110  0.789440  0.479053  0.666943  0.015226  0.013166   
499998  0.021315  0.641473  0.298837  0.630786  0.025128  0.030213  0.022161   
499999  0.833447  0.612956  0.801556  0.132130  0.608079  0.652875  0.007127   

         

In [49]:
test_preds = my_model.predict(test_data_selected)

In [50]:
test_preds

array([0.8701999 , 0.2685362 , 0.8720852 , ..., 0.37698418, 0.68977726,
       0.34703073], dtype=float32)

In [53]:
output = pd.DataFrame({'id': test_data.id,
                       'target': test_preds})
output.to_csv('submission.csv', index=False)

### 下一步就是对XGBoost进行调参