In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection  import train_test_split
from sklearn.preprocessing import StandardScaler

# machine learning
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [2]:
# Read the raw training set; the CSV is decoded as GBK.
path = 'data/train_set.csv'
data = pd.read_csv(path, encoding='gbk')
# Last expression of the cell: displays (rows, columns) of the loaded frame.
data.shape

  interactivity=interactivity, compiler=compiler, result=result)


(38199, 406)

In [3]:
# Remove the 'vid' column, then display the resulting shape.
data = data.drop('vid', axis=1)
data.shape

(38199, 405)

In [4]:
#####################################################################
# Identify the numeric (float64) features and keep only those columns
# for the rest of the notebook.
#####################################################################
columns_float = [
    name
    for name, dtype in zip(data.columns, data.dtypes)
    if dtype == 'float64'
]

data = data[columns_float]
data.shape

(38199, 105)

In [5]:
# Summarise the float64-only frame (entry count, column count, dtypes, memory).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38199 entries, 0 to 38198
Columns: 105 entries, ssy to 979027
dtypes: float64(105)
memory usage: 30.6 MB


In [6]:
def getLostColumns(dF, thresh=0.8):
    '''
    List the columns whose share of missing values exceeds a threshold.

    The frame itself is not modified; the caller decides what to do with the
    returned names (for example, drop them).

    :param dF: pandas DataFrame to inspect.
    :param thresh: missing-value ratio; a column is reported when its ratio
        of null entries is strictly greater than this value. Defaults to
        0.8, the value that used to be hard-coded.
    :return: list of column labels, in the frame's column order.
    '''
    # The mean of the boolean null mask is the per-column missing ratio.
    # For a frame with zero rows the ratio is NaN, which never compares
    # greater than the threshold, so no column is reported.
    missing_ratio = dF.isnull().mean()
    exclude_feats = [
        col for col, ratio in zip(dF.columns, missing_ratio) if ratio > thresh
    ]
    print("缺失数据的字段数量: %s" % len(exclude_feats))
    return exclude_feats

In [7]:
# Names of the columns whose missing ratio is above the helper's threshold.
lost_cols = getLostColumns(data)

缺失数据的字段数量: 89


In [8]:
# Drop the sparse columns collected in lost_cols, then summarise what is left.
data = data.drop(lost_cols, axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38199 entries, 0 to 38198
Data columns (total 16 columns):
ssy        38192 non-null float64
szy        38192 non-null float64
xqdmddb    38199 non-null float64
xqgmddb    38199 non-null float64
xqgysz     38199 non-null float64
100005     12899 non-null float64
100007     15043 non-null float64
31         17712 non-null float64
315        17712 non-null float64
316        17712 non-null float64
317        17459 non-null float64
319        17712 non-null float64
33         17409 non-null float64
34         13957 non-null float64
37         17712 non-null float64
39         14101 non-null float64
dtypes: float64(16)
memory usage: 4.7 MB


In [9]:
def fill_missing_data_byPredict(df, fillColumn, dataColumns):
    '''
    Fill the missing values of one column with random-forest predictions.

    A RandomForestRegressor is fitted on the rows where the target column is
    present, using ``dataColumns`` as features, and its predictions are
    written into the rows where the target is null.

    :param df: DataFrame holding the target and the feature columns. It is
        modified in place and also returned.
    :param fillColumn: list whose first entry names the column to fill;
        only ``fillColumn[0]`` is used.
    :param dataColumns: list of feature column names used as predictors.
    :return: ``df`` with the target column's nulls replaced by predictions.
    '''
    print("fillColumn:")
    print(fillColumn)

    target = fillColumn[0]
    missing_mask = df[target].isnull()
    # Nothing to fill: return early, the regressor cannot predict on zero rows.
    if not missing_mask.any():
        return df

    needFill_df = df[[target] + list(dataColumns)]
    # .values replaces DataFrame.as_matrix(), which no longer exists in
    # current pandas; it is already what this function uses further down.
    known = needFill_df[~missing_mask].values
    unknown = needFill_df[missing_mask].values

    y = known[:, 0]

    # Feature NaNs in the training rows are replaced by the column means.
    known_X_df = pd.DataFrame(known[:, 1:], columns=dataColumns)
    known_means = known_X_df.mean()
    X = known_X_df.fillna(known_means).values

    # Rows to predict are imputed with their own column means first (the
    # previous behaviour). A column that is entirely NaN among these rows has
    # a NaN mean, so fall back to the training-row means for what is left.
    unknown_X_df = pd.DataFrame(unknown[:, 1:], columns=dataColumns)
    unknown_X = (unknown_X_df
                 .fillna(unknown_X_df.mean())
                 .fillna(known_means)
                 .values)

    # max_features=1.0 asks for all features, which is what "auto" meant for
    # a regressor; the "auto" spelling is rejected by newer scikit-learn.
    rfr = RandomForestRegressor(oob_score = True, n_jobs = -1,random_state =50,
                                max_features = 1.0, min_samples_leaf = 5)
    rfr.fit(X, y)

    print('after fill unknown_X:')
    print(unknown_X)
    predictData = rfr.predict(unknown_X)
    # Boolean-mask assignment keeps the predictions aligned with the rows
    # that were missing, in their original order.
    df.loc[missing_mask, target] = predictData

    return df

# Keep only the rows where field '31' is not null (rows missing it are dropped)

In [10]:
# Keep only the rows that have a value in column '31', then summarise.
has_31 = data['31'].notnull()
data = data[has_31]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17712 entries, 0 to 38196
Data columns (total 16 columns):
ssy        17710 non-null float64
szy        17710 non-null float64
xqdmddb    17712 non-null float64
xqgmddb    17712 non-null float64
xqgysz     17712 non-null float64
100005     12511 non-null float64
100007     14563 non-null float64
31         17712 non-null float64
315        17712 non-null float64
316        17712 non-null float64
317        17459 non-null float64
319        17712 non-null float64
33         17405 non-null float64
34         13957 non-null float64
37         17712 non-null float64
39         14101 non-null float64
dtypes: float64(16)
memory usage: 2.3 MB


In [11]:
# Columns that still contain nulls and are filled by prediction, one at a time.
needfill_cols = ['ssy', 'szy', '100005', '100007', '317', '33', '34' ,'39']
# Columns used as predictor features for every target.
fill_col_list = ['xqdmddb', 'xqgmddb', 'xqgysz', '31', '315', '316', '319', '37']

for fill_col in needfill_cols:
    # The helper expects the target name wrapped in a list.
    fillColumn = [fill_col]
    # Format into a single string: print('label', value) shows a tuple repr
    # with escaped bytes when the notebook runs on Python 2.
    print('fill_col： %s' % fill_col)
    data = fill_missing_data_byPredict(data, fillColumn, fill_col_list)

('fill_col\xef\xbc\x9a', 'ssy')
fillColumn:
['ssy']


  warn("Some inputs do not have OOB scores. "


after fill unknown_X:
[[  1.15   1.21   3.91   5.23  94.4   31.1  200.    67.5 ]
 [  1.25   1.52   5.01   5.11  87.7   30.5  268.    53.  ]]
('fill_col\xef\xbc\x9a', 'szy')
fillColumn:
['szy']
after fill unknown_X:
[[  1.15   1.21   3.91   5.23  94.4   31.1  200.    67.5 ]
 [  1.25   1.52   5.01   5.11  87.7   30.5  268.    53.  ]]
('fill_col\xef\xbc\x9a', '100005')
fillColumn:
['100005']
after fill unknown_X:
[[  1.37   1.25   2.66 ...  30.8  168.    58.6 ]
 [  1.27   2.21   1.73 ...  33.2  152.    45.4 ]
 [  0.8    1.87   2.21 ...  29.6  225.    50.2 ]
 ...
 [  2.25   1.32   2.02 ...  32.3  194.    54.7 ]
 [  3.94   1.39   2.34 ...  29.3  250.    66.3 ]
 [  1.19   1.04   1.56 ...  31.8  204.    63.6 ]]
('fill_col\xef\xbc\x9a', '100007')
fillColumn:
['100007']
after fill unknown_X:
[[  1.99   1.12   2.39 ...  30.8  162.    74.6 ]
 [  1.61   1.66   2.67 ...  30.6  191.    57.7 ]
 [  1.21   1.97   2.49 ...  31.8  147.    58.4 ]
 ...
 [  2.3    1.07   3.05 ...  30.3  220.    72.7 ]
 [  0

In [12]:
# Re-check the non-null counts after the prediction-based filling.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17712 entries, 0 to 38196
Data columns (total 16 columns):
ssy        17712 non-null float64
szy        17712 non-null float64
xqdmddb    17712 non-null float64
xqgmddb    17712 non-null float64
xqgysz     17712 non-null float64
100005     17712 non-null float64
100007     17712 non-null float64
31         17712 non-null float64
315        17712 non-null float64
316        17712 non-null float64
317        17712 non-null float64
319        17712 non-null float64
33         17712 non-null float64
34         17712 non-null float64
37         17712 non-null float64
39         17712 non-null float64
dtypes: float64(16)
memory usage: 2.3 MB


In [13]:
# Preview the first 10 rows of the filled frame.
data.head(10)

Unnamed: 0,ssy,szy,xqdmddb,xqgmddb,xqgysz,100005,100007,31,315,316,317,319,33,34,37,39
0,165.0,100.0,2.08,1.29,3.24,12.9,0.26,4.82,85.4,28.2,330.0,255.0,2.0,0.2,65.6,2.6
1,141.0,97.0,2.64,1.36,4.75,13.0,0.16,7.1,90.2,26.4,293.0,158.0,2.7,0.7,58.0,8.6
2,120.0,80.0,1.37,1.25,2.66,22.238487,0.26,5.71,88.6,30.8,348.0,168.0,1.7,0.4,58.6,7.4
3,100.0,70.0,1.27,2.21,1.73,17.972003,0.26,4.13,93.0,33.2,354.0,152.0,2.0,0.1,45.4,3.6
4,110.0,80.0,0.8,1.87,2.21,23.059563,0.2,4.87,92.6,29.6,319.0,225.0,1.7,0.3,50.2,8.4
5,110.0,80.0,1.4,1.74,2.09,22.149885,0.13,4.78,96.9,33.9,350.0,319.0,2.0,0.7,48.4,13.9
7,111.0,71.0,3.36,1.08,2.88,10.6,0.22,5.32,91.7,30.6,334.0,210.0,1.6,0.7,66.6,10.0
13,132.0,73.0,0.84,1.71,2.87,46.3,0.241,3.82,91.6,31.9,348.0,270.0,1.5,0.4,66.9,7.0
14,155.0,84.0,1.85,1.12,4.23,37.2,0.371,4.5,83.3,28.4,341.0,145.0,2.2,0.4,57.0,8.0
15,127.0,80.0,0.69,1.67,2.54,42.7,0.243,4.63,92.4,31.1,336.0,320.0,1.6,0.5,66.9,7.8


In [14]:
# Write the filled frame to CSV. The index is written too (to_csv default);
# 'vid' was deleted earlier, so the index is the only row identifier left.
data.to_csv('data/allPredictFill.csv')