In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import math
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# machine learning
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
path = 'data/allPredictFill.csv'
data = pd.read_csv(path, encoding = 'utf8')
data.head(5)

Unnamed: 0.1,Unnamed: 0,ssy,szy,xqdmddb,xqgmddb,xqgysz,100005,100007,31,315,316,317,319,33,34,37,39
0,0,165.0,100.0,2.08,1.29,3.24,12.9,0.26,4.82,85.4,28.2,330.0,255.0,2.0,0.2,65.6,2.6
1,1,141.0,97.0,2.64,1.36,4.75,13.0,0.16,7.1,90.2,26.4,293.0,158.0,2.7,0.7,58.0,8.6
2,2,120.0,80.0,1.37,1.25,2.66,22.238487,0.26,5.71,88.6,30.8,348.0,168.0,1.7,0.4,58.6,7.4
3,3,100.0,70.0,1.27,2.21,1.73,17.972003,0.26,4.13,93.0,33.2,354.0,152.0,2.0,0.1,45.4,3.6
4,4,110.0,80.0,0.8,1.87,2.21,23.059563,0.2,4.87,92.6,29.6,319.0,225.0,1.7,0.3,50.2,8.4


In [3]:
def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    return multiple_outliers   

# detect outliers from Age, SibSp , Parch and Fare
detect_cols = ['ssy', 'szy', 'xqdmddb', 'xqgmddb', 'xqgysz', '100005', '100007', '31', '315', '316', '317', '319', '33', '34', '37', '39']

Outliers_to_drop = detect_outliers(data, 2, detect_cols)


In [4]:
data.loc[Outliers_to_drop]

Unnamed: 0.1,Unnamed: 0,ssy,szy,xqdmddb,xqgmddb,xqgysz,100005,100007,31,315,316,317,319,33,34,37,39
11,17,98.0,54.0,1.18,1.46,3.10,44.100000,0.348000,4.30,96.7,31.6,327.0,274.0,3.800000,0.900000,52.00,9.00
46,72,128.0,71.0,0.56,1.43,2.02,42.300000,0.320000,5.22,66.9,22.0,330.0,260.0,1.900000,0.800000,64.60,10.80
65,98,144.0,96.0,1.69,0.96,3.02,12.500000,0.180000,6.18,87.3,25.5,293.0,206.0,2.700000,1.000000,55.30,12.40
74,114,115.0,72.0,3.15,1.20,3.38,42.300000,0.291000,5.18,83.6,29.3,351.0,288.0,3.700000,1.000000,49.30,10.70
80,122,129.0,82.0,3.31,0.98,3.41,42.300000,0.359000,5.97,78.7,27.5,349.0,224.0,3.800000,0.900000,48.90,9.30
110,173,120.0,76.0,3.80,0.95,2.41,38.900000,0.286000,5.32,87.6,29.9,341.0,347.0,1.700000,1.200000,58.00,17.00
182,303,109.0,60.0,2.30,1.18,2.69,12.300000,0.250000,4.88,89.2,25.6,287.0,233.0,4.200000,0.800000,57.20,6.60
254,404,140.0,90.0,2.87,1.04,2.40,15.000000,2.679921,4.40,74.0,24.0,308.0,193.0,1.200000,0.200000,49.50,8.90
256,406,187.0,107.0,3.66,1.70,2.50,13.100000,0.175000,3.83,98.3,35.5,345.0,229.0,2.000000,0.200000,76.60,2.20
327,511,136.0,92.0,3.49,1.53,2.26,10.900000,0.200000,5.13,100.3,29.2,291.0,211.0,4.100000,1.200000,50.20,11.20


In [5]:
data.shape

(17712, 17)

In [6]:
data = data.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)
data.shape

(16819, 17)

In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,ssy,szy,xqdmddb,xqgmddb,xqgysz,100005,100007,31,315,316,317,319,33,34,37,39
0,0,165.0,100.0,2.08,1.29,3.24,12.9,0.26,4.82,85.4,28.2,330.0,255.0,2.0,0.2,65.6,2.6
1,1,141.0,97.0,2.64,1.36,4.75,13.0,0.16,7.1,90.2,26.4,293.0,158.0,2.7,0.7,58.0,8.6
2,2,120.0,80.0,1.37,1.25,2.66,22.238487,0.26,5.71,88.6,30.8,348.0,168.0,1.7,0.4,58.6,7.4
3,3,100.0,70.0,1.27,2.21,1.73,17.972003,0.26,4.13,93.0,33.2,354.0,152.0,2.0,0.1,45.4,3.6
4,4,110.0,80.0,0.8,1.87,2.21,23.059563,0.2,4.87,92.6,29.6,319.0,225.0,1.7,0.3,50.2,8.4


In [8]:
X = data.iloc[:, 6:]
y_1 = data.iloc[:, 1]
y_2 = data.iloc[:, 2]
y_3 = data.iloc[:, 3]
y_4 = data.iloc[:, 4]
y_5 = data.iloc[:, 5]

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y_1, test_size=0.3, random_state=33)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y_2, test_size=0.3, random_state=33)
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X, y_3, test_size=0.3, random_state=33)
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X, y_4, test_size=0.3, random_state=33)
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(X, y_5, test_size=0.3, random_state=33)

In [9]:
# create dataset for lightgbm
lgb_train_1 = lgb.Dataset(X_train_1, y_train_1)
lgb_eval_1 = lgb.Dataset(X_test_1, y_test_1, reference=lgb_train_1)
# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'max_depth': 6,
    'num_leaves': 50,
    'max_bin': 100,
    'min_data_in_leaf': 200,
    'metric': {'l2', 'auc'},
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [10]:
# train
print('Start training...')
gbm = lgb.train(params,
                lgb_train_1,
                valid_sets=lgb_eval_1,
                num_boost_round=2000,
                early_stopping_rounds=50)

Start training...
[1]	valid_0's auc: 1	valid_0's l2: 15679.4
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 1	valid_0's l2: 15679.4
[3]	valid_0's auc: 1	valid_0's l2: 15679.4
[4]	valid_0's auc: 1	valid_0's l2: 15679.4
[5]	valid_0's auc: 1	valid_0's l2: 15679.4
[6]	valid_0's auc: 1	valid_0's l2: 15679.4
[7]	valid_0's auc: 1	valid_0's l2: 15679.4
[8]	valid_0's auc: 1	valid_0's l2: 15679.4
[9]	valid_0's auc: 1	valid_0's l2: 15679.4
[10]	valid_0's auc: 1	valid_0's l2: 15679.4
[11]	valid_0's auc: 1	valid_0's l2: 15679.4
[12]	valid_0's auc: 1	valid_0's l2: 15679.4
[13]	valid_0's auc: 1	valid_0's l2: 15679.4
[14]	valid_0's auc: 1	valid_0's l2: 15679.4
[15]	valid_0's auc: 1	valid_0's l2: 15679.4
[16]	valid_0's auc: 1	valid_0's l2: 15679.4
[17]	valid_0's auc: 1	valid_0's l2: 15679.4
[18]	valid_0's auc: 1	valid_0's l2: 15679.4
[19]	valid_0's auc: 1	valid_0's l2: 15679.4
[20]	valid_0's auc: 1	valid_0's l2: 15679.4
[21]	valid_0's auc: 1	valid_0's l2: 15679.4
[22]	

In [18]:
print('Start predicting...')
preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)  # 输出的是概率结果
print(preds)

Start predicting...
[0.5 0.5 0.5 ... 0.5 0.5 0.5]
