In [1]:
import pandas as pd
import numpy as np
import math
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# machine learning
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
path = 'data/allPredictFill.csv'
data = pd.read_csv(path, encoding = 'utf8')
data.head(5)

Unnamed: 0.1,Unnamed: 0,ssy,szy,xqdmddb,xqgmddb,xqgysz,100005,100007,31,315,316,317,319,33,34,37,39
0,0,165.0,100.0,2.08,1.29,3.24,12.9,0.26,4.82,85.4,28.2,330.0,255.0,2.0,0.2,65.6,2.6
1,1,141.0,97.0,2.64,1.36,4.75,13.0,0.16,7.1,90.2,26.4,293.0,158.0,2.7,0.7,58.0,8.6
2,2,120.0,80.0,1.37,1.25,2.66,22.238487,0.26,5.71,88.6,30.8,348.0,168.0,1.7,0.4,58.6,7.4
3,3,100.0,70.0,1.27,2.21,1.73,17.972003,0.26,4.13,93.0,33.2,354.0,152.0,2.0,0.1,45.4,3.6
4,4,110.0,80.0,0.8,1.87,2.21,23.059563,0.2,4.87,92.6,29.6,319.0,225.0,1.7,0.3,50.2,8.4


In [3]:
'''
筛选出DataFrame中的离群值
'''
def detect_outliers(df, n, features):

    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    return multiple_outliers   

# detect outliers from Age, SibSp , Parch and Fare
detect_cols = ['ssy', 'szy', 'xqdmddb', 'xqgmddb', 'xqgysz', '100005', '100007', '31', '315', '316', '317', '319', '33', '34', '37', '39']

Outliers_to_drop = detect_outliers(data, 2, detect_cols)


In [4]:
data.loc[Outliers_to_drop]

Unnamed: 0.1,Unnamed: 0,ssy,szy,xqdmddb,xqgmddb,xqgysz,100005,100007,31,315,316,317,319,33,34,37,39
11,17,98.0,54.0,1.18,1.46,3.10,44.100000,0.348000,4.30,96.7,31.6,327.0,274.0,3.800000,0.900000,52.00,9.00
46,72,128.0,71.0,0.56,1.43,2.02,42.300000,0.320000,5.22,66.9,22.0,330.0,260.0,1.900000,0.800000,64.60,10.80
65,98,144.0,96.0,1.69,0.96,3.02,12.500000,0.180000,6.18,87.3,25.5,293.0,206.0,2.700000,1.000000,55.30,12.40
74,114,115.0,72.0,3.15,1.20,3.38,42.300000,0.291000,5.18,83.6,29.3,351.0,288.0,3.700000,1.000000,49.30,10.70
80,122,129.0,82.0,3.31,0.98,3.41,42.300000,0.359000,5.97,78.7,27.5,349.0,224.0,3.800000,0.900000,48.90,9.30
110,173,120.0,76.0,3.80,0.95,2.41,38.900000,0.286000,5.32,87.6,29.9,341.0,347.0,1.700000,1.200000,58.00,17.00
182,303,109.0,60.0,2.30,1.18,2.69,12.300000,0.250000,4.88,89.2,25.6,287.0,233.0,4.200000,0.800000,57.20,6.60
254,404,140.0,90.0,2.87,1.04,2.40,15.000000,2.679921,4.40,74.0,24.0,308.0,193.0,1.200000,0.200000,49.50,8.90
256,406,187.0,107.0,3.66,1.70,2.50,13.100000,0.175000,3.83,98.3,35.5,345.0,229.0,2.000000,0.200000,76.60,2.20
327,511,136.0,92.0,3.49,1.53,2.26,10.900000,0.200000,5.13,100.3,29.2,291.0,211.0,4.100000,1.200000,50.20,11.20


In [5]:
data.shape

(17712, 17)

In [6]:
data = data.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)
data.shape

(16819, 17)

In [7]:
data_y = data.iloc[:, 1:6]
data_x = data.iloc[:, 6:]

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.15,
    random_state=33
)

In [9]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [10]:
gbr1 = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    loss='ls').fit(X_train, Y_train.iloc[:, 0:1])

gbr2 = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    loss='ls').fit(X_train, Y_train.iloc[:, 1:2])

gbr3 = GradientBoostingRegressor(
    n_estimators=190,
    learning_rate=0.05,
    max_depth=3,
    loss='ls').fit(X_train, Y_train.iloc[:, 2:3])

gbr4 = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    loss='ls').fit(X_train, Y_train.iloc[:, 3:4])

gbr5 = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    loss='ls').fit(X_train, Y_train.iloc[:, 4:5])

Y_predict1 = gbr1.predict(X_test)
Y_predict2 = gbr2.predict(X_test)
Y_predict3 = gbr3.predict(X_test)
Y_predict4 = gbr4.predict(X_test)
Y_predict5 = gbr5.predict(X_test)

print(Y_predict1)
print(Y_predict2)

  y = column_or_1d(y, warn=True)


[128.65220739 126.05868991 122.75514261 ... 120.6624363  128.4928736
 120.39785988]
[80.34130574 78.2674235  72.69367121 ... 71.27053166 79.825415
 71.40572848]


In [11]:
Y_predict1 = pd.DataFrame(data=Y_predict1, index=Y_test.index.tolist(),
                          columns=['ssy'])
Y_predict2 = pd.DataFrame(data=Y_predict2, index=Y_test.index.tolist(),
                          columns=['szy'])
Y_predict3 = pd.DataFrame(data=Y_predict3, index=Y_test.index.tolist(),
                          columns=['xqdmddb'])
Y_predict4 = pd.DataFrame(data=Y_predict4, index=Y_test.index.tolist(),
                          columns=['xqgmddb'])
Y_predict5 = pd.DataFrame(data=Y_predict5, index=Y_test.index.tolist(),
                          columns=['xqgysz'])

Y_predict = pd.concat([Y_predict1, Y_predict2, Y_predict3, Y_predict4, Y_predict5], axis=1)
print(Y_predict)

              ssy        szy   xqdmddb   xqgmddb    xqgysz
1885   128.652207  80.341306  2.134284  1.281648  2.984384
10337  126.058690  78.267423  1.692401  1.378106  2.665040
9840   122.755143  72.693671  1.294416  1.408332  2.983009
10054  124.838447  76.604432  1.527405  1.260208  2.640578
13320  133.481488  82.164252  1.854148  1.415008  2.673514
13459  123.499064  73.410255  1.454724  1.558266  2.819056
10764  131.836822  82.611256  2.279589  1.130810  3.004490
13756  129.638982  77.826853  1.722259  1.438751  2.491465
6969   128.612858  79.287979  1.985496  1.194961  3.183809
16188  129.262156  79.075351  1.965922  1.370067  3.027092
15137  126.356100  76.801394  1.560828  1.309224  2.927571
3373   128.031666  73.889341  1.711887  1.493594  2.744677
9249   122.481200  73.415432  1.420004  1.521177  2.355092
14154  131.582771  82.196359  1.779503  1.481520  2.835175
3419   129.870570  79.757034  2.213916  1.226476  2.734803
15545  125.611823  75.079627  1.428759  1.474482  3.0021

In [12]:
def calc_logloss(true_df, pred_df):
    loss_sum = 0
    rows = true_df.shape[0]
    for c in true_df.columns:
        t_log = true_df[c].apply(lambda x: math.log(x + 1))
        p_log = pred_df[c].apply(lambda x: math.log(x + 1))
        dif = p_log - t_log
        dif2 = dif.apply(lambda x: math.pow(x, 2))
        dif2_sum = dif2.sum()
        loss_item = dif2_sum / rows
        loss_sum += loss_item

        print(c ,'的loss：%f', loss_item)
    print('总的loss：', loss_sum)
    print('平均loss:', loss_sum / 5)

In [13]:
calc_logloss(Y_test, Y_predict)

# (u'ssy', '\xe7\x9a\x84loss\xef\xbc\x9a%f', 0.01954506202450193)
# (u'szy', '\xe7\x9a\x84loss\xef\xbc\x9a%f', 0.021368134798429958)
# (u'xqdmddb', '\xe7\x9a\x84loss\xef\xbc\x9a%f', 0.11038485327277026)
# (u'xqgmddb', '\xe7\x9a\x84loss\xef\xbc\x9a%f', 0.015194160596393341)
# (u'xqgysz', '\xe7\x9a\x84loss\xef\xbc\x9a%f', 0.04903191895640811)
# ('\xe6\x80\xbb\xe7\x9a\x84loss\xef\xbc\x9a', 0.2155241296485036)
# ('\xe5\xb9\xb3\xe5\x9d\x87loss:', 0.04310482592970072)

(u'ssy', '\xe7\x9a\x84loss\xef\xbc\x9a%f', 0.01924838825519551)
(u'szy', '\xe7\x9a\x84loss\xef\xbc\x9a%f', 0.02145643822409589)
(u'xqdmddb', '\xe7\x9a\x84loss\xef\xbc\x9a%f', 0.10437247377464429)
(u'xqgmddb', '\xe7\x9a\x84loss\xef\xbc\x9a%f', 0.01575366333684263)
(u'xqgysz', '\xe7\x9a\x84loss\xef\xbc\x9a%f', 0.044820967788238304)
('\xe6\x80\xbb\xe7\x9a\x84loss\xef\xbc\x9a', 0.2056519313790166)
('\xe5\xb9\xb3\xe5\x9d\x87loss:', 0.04113038627580332)
