# 数据分区

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 500)

In [2]:
data = pd.read_csv('data_keep_50000.csv', low_memory=False)

In [3]:
data.shape

(57298, 257)

一共256个特征，其中有11个是数值，245个是混合类型，需要处理。

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57298 entries, 0 to 57297
Columns: 257 entries, vid to A601
dtypes: float64(11), object(246)
memory usage: 112.3+ MB


### 数值数据的分布

In [5]:
data.describe()

Unnamed: 0,100005,100007,315,316,317,319,33,34,37,39,809009
count,19422.0,22608.0,26600.0,26600.0,26229.0,26600.0,26143.0,21004.0,26600.0,21216.0,7333.0
mean,18.967235,0.671348,90.613117,30.289935,334.470525,214.974899,2.076035,0.427824,57.626687,7.709535,1.618898
std,12.079368,4.09352,6.045505,2.474706,15.035377,55.625658,0.733745,0.209185,13.444135,6.31723,0.795881
min,0.0,0.0,53.2,10.3,116.0,28.0,0.2,0.06,0.31,0.01,0.37
25%,12.1,0.159,87.8,29.3,325.0,178.0,1.6,0.3,53.4,4.8,1.4
50%,13.0,0.201,91.0,30.5,334.0,212.0,2.0,0.4,59.3,6.7,1.48
75%,15.0,0.259,94.2,31.7,344.0,249.0,2.4,0.5,65.0,8.7,1.56
max,56.8,51.8,137.9,47.6,485.0,833.0,15.6,4.1,92.2,59.0,21.59


### 文本数据的分布

**有部分混合数据被分为object，需要再处理**

### 训练数据

In [6]:
train_df = pd.read_csv('meinian_round1_train_20180408.csv')
train_df.shape

(38199, 6)

In [7]:
train_df.describe(include='O')

Unnamed: 0,vid,收缩压,舒张压,血清甘油三酯
count,38199,38199,38199,38199.0
unique,38199,156,110,975.0
top,d9632ea47f425acdd7eed79f9ea17491,120,80,0.8
freq,1,1260,1747,362.0


有三列数据有混合类型，需要处理。

In [8]:
train_df[train_df['收缩压'].str.isnumeric()==False]

Unnamed: 0,vid,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白
15756,1773a52f313d62c3f8d8b8eec1bd63b9,未查,未查,1.15,1.21,3.91
19645,1c1773e341fd21a8da7ac51223fc5b04,未查,未查,1.15,1.18,2.03
20652,8eee83b48c083e6e337c32afe5fec05f,未查,未查,0.47,0.78,1.08
20953,85b341038cfa4b03e6c625ecc2cefddb,弃查,弃查,1.13,1.43,2.92
21662,0597838c42f7c667f0b941efd42055a6,弃查,弃查,1.38,1.39,2.95
27521,34d2cd11dd80947d7a5ef1a42e4777bd,未查,未查,1.25,1.52,5.01
32672,eb16145ab9d131a8501f5df67586f3a6,未查,未查,1.63,1.39,3.79


In [9]:
train_df['收缩压'] = pd.to_numeric(train_df['收缩压'], errors='coerce')
train_df['舒张压'] = pd.to_numeric(train_df['舒张压'], errors='coerce')

In [10]:
train_df.describe(include='O')

Unnamed: 0,vid,血清甘油三酯
count,38199,38199.0
unique,38199,975.0
top,d9632ea47f425acdd7eed79f9ea17491,0.8
freq,1,362.0


In [11]:
train_df[train_df['血清甘油三酯'].str.contains(r'[\u4e00-\u9fa5]')]

Unnamed: 0,vid,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白
22712,99848f481d731504457442dc79e95eee,133.0,88.0,7.75轻度乳糜,1.1,2.06


In [12]:
train_df.loc[22712, '血清甘油三酯'] = 7.75
train_df['血清甘油三酯'] = pd.to_numeric(train_df['血清甘油三酯'], errors='coerce')

In [13]:
train_df.describe()

Unnamed: 0,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白
count,38192.0,38192.0,38158.0,38199.0,38199.0
mean,126.049618,79.642229,1.612536,1.406683,2.769719
std,19.275625,512.318203,1.335518,0.341184,0.852217
min,0.0,0.0,0.1,0.15,-1.22
25%,112.0,68.0,0.88,1.16,2.18
50%,124.0,76.0,1.27,1.35,2.69
75%,137.0,85.0,1.9,1.6,3.26
max,252.0,100164.0,28.8,4.78,11.54


In [14]:
train_df['舒张压'].sort_values(ascending=False)[:5]

22357    100164.0
4600        974.0
35452       148.0
33501       146.0
22516       144.0
Name: 舒张压, dtype: float64

In [15]:
train_df.loc[22357,:]

vid         7685d48685028a006c84070f68854ce1
收缩压                                      180
舒张压                                   100164
血清甘油三酯                                     4
血清高密度脂蛋白                                   2
血清低密度脂蛋白                                2.99
Name: 22357, dtype: object

数据不合理，把舒张压替换成NaN

In [16]:
train_df.loc[22357, '舒张压'] = np.nan

In [17]:
train_df['舒张压'].sort_values()[:5]

29394     0.0
5727     37.0
14966    38.0
13674    39.0
15837    40.0
Name: 舒张压, dtype: float64

In [18]:
train_df.loc[29394, '舒张压'] = np.nan

In [19]:
train_df['收缩压'].sort_values()[:5]

29394     0.0
5727     69.0
28325    73.0
5702     74.0
15053    75.0
Name: 收缩压, dtype: float64

In [20]:
train_df.loc[29394, '收缩压'] = np.nan

In [21]:
train_df['血清甘油三酯'].sort_values(ascending=False)[:5]

17693    28.80
3728     25.03
6134     25.01
13137    23.58
30922    22.55
Name: 血清甘油三酯, dtype: float64

In [22]:
train_df['血清低密度脂蛋白'].sort_values()[:5]

11688   -1.22
14065   -0.12
15215   -0.06
36423    0.08
9598     0.09
Name: 血清低密度脂蛋白, dtype: float64

In [23]:
train_df.describe()

Unnamed: 0,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白
count,38191.0,38190.0,38158.0,38199.0,38199.0
mean,126.052918,77.023619,1.612536,1.406683,2.769719
std,19.265082,13.385752,1.335518,0.341184,0.852217
min,69.0,37.0,0.1,0.15,-1.22
25%,112.0,68.0,0.88,1.16,2.18
50%,124.0,76.0,1.27,1.35,2.69
75%,137.0,85.0,1.9,1.6,3.26
max,252.0,974.0,28.8,4.78,11.54


In [24]:
test_df = pd.read_csv('meinian_round1_test_a_20180409.csv')
test_df.shape

(9538, 6)

### 数据拼接

需要把训练集和测试集数据分别拼接起来。

In [25]:
merged_train_df = pd.merge(train_df, data, on='vid', sort=False)

In [26]:
merged_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38199 entries, 0 to 38198
Columns: 262 entries, vid to A601
dtypes: float64(16), object(246)
memory usage: 76.6+ MB


In [27]:
merged_test_df = pd.merge(test_df, data, on='vid', sort=False)

In [28]:
merged_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9538 entries, 0 to 9537
Columns: 262 entries, vid to A601
dtypes: float64(16), object(246)
memory usage: 19.1+ MB


因为在数据导入的时候，每一列只要有非数值数据，就会被转化成 object 类型。  
所以需要统计每一列 object 类型的数据的数值数据比例，大于某值（如0.8）则转化成数值，不能转化的变成 NaN。

In [29]:
combine = [merged_train_df, merged_test_df]

numerical_feature = []
train_data_counts = merged_train_df.shape[0]

for col in merged_train_df.columns.values:
    num_counts = merged_train_df[col].astype(
        str).str.match(r'^(-?\d+)(\.\d+)?$').sum()
    na_counts = merged_train_df[col].isna().sum()

    if num_counts / (train_data_counts - na_counts) > 0.8:
        numerical_feature.append(col)

for df in combine:
    df[numerical_feature[5:]] = df[numerical_feature[5:]].apply(
        lambda x: pd.to_numeric(x, downcast='float', errors='coerce'))

### 异常数据点的处理

In [30]:
merged_train_df.describe()

Unnamed: 0,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白,0424,100005,100006,100007,100014,10002,10003,10004,1106,1107,1115,1117,1127,1321,1322,1325,1326,1345,139,143,1474,155,1814,1815,183,1840,1845,1850,190,191,192,193,2174,2333,2372,2403,2404,2405,2406,2420,269003,269004,269005,269006,269007,269008,269009,269010,269011,269012,269013,269014,269015,269016,269017,269018,269019,269020,269021,269022,269023,269024,269025,300017,300021,31,312,313,314,315,316,317,319,3193,32,320,33,34,37,38,39,669001,669002,669006,809001,809009,979001,979002,979003,979004,979005,979006,979007,979008,979009,979011,979012,979013,979014,979015,979016,979017,979018,979019,979020,979021,979022,979023
count,38191.0,38190.0,38158.0,38199.0,38199.0,25078.0,12899.0,17472.0,15043.0,4932.0,20102.0,18402.0,34709.0,5606.0,5479.0,22944.0,26053.0,8201.0,16549.0,16546.0,6121.0,6113.0,8733.0,5641.0,5415.0,5056.0,8770.0,37919.0,32801.0,20697.0,36650.0,17139.0,35070.0,37450.0,37039.0,29782.0,25778.0,20199.0,17476.0,17331.0,36481.0,36482.0,36316.0,13074.0,9756.0,8217.0,8361.0,8057.0,8218.0,7287.0,8218.0,8218.0,8218.0,7744.0,8217.0,8217.0,8214.0,8358.0,8214.0,8218.0,8362.0,8218.0,8362.0,8361.0,8361.0,8361.0,8361.0,8315.0,4904.0,5754.0,17712.0,17539.0,17458.0,20556.0,17712.0,17712.0,17459.0,17712.0,36557.0,17552.0,16938.0,17409.0,13957.0,17712.0,17711.0,14101.0,5289.0,5366.0,5144.0,5649.0,4858.0,6829.0,6701.0,6866.0,6867.0,6283.0,6612.0,6866.0,6867.0,6867.0,6867.0,6867.0,6764.0,6764.0,6867.0,6868.0,6867.0,6763.0,6763.0,6867.0,6868.0,6867.0,6867.0
mean,126.052918,77.023619,1.612536,1.406683,2.769719,73.228432,18.969465,14.448467,0.687269,15.40041,9.180971,28.616533,4.787577,1.447176,0.957998,75.163383,34.425449,118.602715,0.817138,0.826052,0.916509,0.920244,173.181213,1.592708,13.658928,93.209442,2.268939,28.52206,23.708757,75.251366,6.114405,1.664113,5.27836,70.513161,333.591736,13.246148,4.063509,46.600586,1.724448,3.317938,68.311508,165.125534,24.941238,58.026318,78.209938,15.227648,9.235576,0.196667,214.192261,46.272881,340.020447,30.710417,90.246155,42.041161,145.772537,4.754754,3.714718,0.397313,1.802385,60.325745,6.579688,30.182653,6.097449,2.694441,0.233221,0.165106,0.012576,13.340771,1.134809,10.243145,4.736106,6.258362,143.23024,39.768288,90.603508,30.289684,334.513489,215.117477,1.020532,3.76885,9.410127,2.076115,0.426795,57.69759,30.841913,7.730919,3.513814,1.484792,7.278165,6.435024,1.62008,15.402506,0.192418,9.229942,214.214539,12.151904,39.872089,325.290802,30.152962,91.505928,142.020096,4.79113,0.032268,0.154202,3.520025,0.354889,2.060699,0.547834,2.497921,56.981411,5.831799,34.172333,6.119901
std,19.265082,13.385752,1.335518,0.341184,0.852217,9.008519,12.073726,2.659356,4.187223,5.268567,4.078198,4.205136,9.769779,0.257332,0.245131,24.130157,41.370541,156.473068,0.376352,0.375329,0.276406,0.271478,32.151905,0.540641,7.633927,31.702618,2.660006,25.590902,14.127274,4.846767,0.69313,0.28975,1.335808,18.37723,94.163933,5.517871,2.298907,3.148903,2.616732,5.316967,299.103973,8.689532,112.786636,7.95576,11.329891,3.243238,1.476788,0.130493,55.678726,4.621803,15.235557,2.594384,6.18049,7.38495,17.250828,0.523364,1.277932,0.149948,0.610762,8.814663,1.976304,8.30533,1.616682,2.199247,0.334813,0.161038,0.0214,1.652637,1.393548,8.362729,0.523044,1.614792,16.769623,12.016124,6.050279,2.471872,15.049162,55.691769,0.005982,1.258959,1.79191,0.737653,0.209125,13.335673,10.748792,6.360497,3.364076,1.251164,6.602274,3.904337,0.803286,2.341264,0.061091,1.515906,60.023293,1.099067,9.050881,40.008274,2.493796,6.626197,23.335838,0.527902,0.023906,0.14588,1.175772,0.128543,0.603849,0.337766,2.096195,7.926914,1.566178,7.412165,1.570924
min,69.0,37.0,0.1,0.15,-1.22,0.64,4.0,0.0,0.0,0.42,0.14,7.2,-1799.76001,0.68,0.03,1.8,0.0,15.0,0.01,0.01,0.01,0.01,81.0,0.049,0.6,1.13,0.0,1.0,0.3,22.58,4.5,0.05,2.83,6.15,36.0,0.11,0.0,15.0,0.0,0.0,0.0,0.0,0.0,-90.0,21.700001,0.1,4.1,0.02,28.0,33.200001,263.0,15.8,54.799999,0.292,59.0,2.85,1.14,0.05,0.31,16.299999,0.9,3.3,2.25,0.0,0.0,0.0,0.0,9.1,0.0,0.0,2.22,1.3,45.0,0.23,53.200001,10.3,116.0,28.0,1.003,0.43,0.0,0.2,0.09,0.31,0.12,0.01,0.01,0.06,0.01,2.51,0.37,0.0,0.01,0.0,12.0,9.6,11.1,3.19,14.3,53.900002,9.7,1.28,0.0,0.0,0.56,0.02,0.36,0.0,0.0,28.0,0.6,5.7,1.71
25%,112.0,68.0,0.88,1.16,2.18,68.0,12.1,12.3,0.159,12.79,6.4,25.76,3.94,1.26,0.78,59.0,15.14,71.489998,0.5,0.6,0.8,0.8,152.020004,1.28,9.7,78.39575,1.11,16.0,18.0,72.059998,6.0,1.47,4.63,57.0,264.0,9.5,2.6925,44.5,0.9215,1.76,57.0,159.0,21.799999,52.200001,70.0,13.2,8.5,0.16,176.0,42.900002,330.0,29.700001,87.699997,39.400002,134.0,4.38,2.83,0.3,1.38,54.400002,5.2,24.6,4.9725,1.3,0.1,0.08,0.0,12.2,0.503,4.57,4.36,5.1,132.0,38.799999,87.699997,29.299999,325.0,178.0,1.015,2.9,8.4,1.6,0.3,53.5,26.5,4.8,2.02,0.73,3.2,4.7,1.4,15.5,0.16,8.3,174.0,11.4,39.400002,322.0,29.299999,88.800003,132.0,4.42,0.02,0.07,2.71,0.27,1.64,0.3,1.2,51.799999,4.8,29.200001,5.05
50%,124.0,76.0,1.27,1.35,2.69,72.0,13.0,15.2,0.201,15.605,8.5,28.5,4.7,1.425,0.94,71.735001,23.0,98.0,0.8,0.8,1.0,1.0,168.179993,1.54,12.0,93.699997,1.79,22.0,21.280001,75.099998,6.0,1.64,5.03,69.0,324.350006,12.31,3.7,46.599998,1.48,2.74,65.599998,165.0,24.200001,58.099998,77.0,15.0,9.3,0.19,209.0,46.200001,340.0,31.0,90.699997,42.830002,147.0,4.73,3.5325,0.38,1.73,60.299999,6.4,29.94,5.9,2.1,0.1,0.12,0.01,13.0,0.84,8.585,4.71,6.07,144.0,42.5,91.0,30.5,334.0,212.0,1.02,3.6,9.5,2.0,0.4,59.299999,32.099998,6.7,3.09,1.19,5.5005,5.18,1.48,16.0,0.187,9.1,211.0,12.0,41.700001,329.0,30.4,92.300003,145.0,4.77,0.03,0.11,3.31,0.33,1.99,0.5,1.94,57.099998,5.6,34.0,5.89
75%,137.0,85.0,1.9,1.6,3.26,78.0,15.0,16.200001,0.259,18.032501,11.13,31.299999,5.58,1.6,1.1,87.0,38.400002,126.650002,1.0,1.0,1.0,1.0,191.0,1.82,15.0,109.434999,2.79,33.019999,26.0,78.389999,6.5,1.83,5.5,81.25,394.0,15.78,5.01,48.700001,2.24,4.21,75.0,171.0,26.6,63.5,85.0,16.6,10.1,0.22,247.0,49.599998,350.0,32.200001,94.0,46.119999,158.0,5.1,4.36,0.48,2.16,66.199997,7.7,35.700001,7.0,3.4,0.2,0.2,0.01,14.3,1.3305,13.5875,5.08,7.15,156.0,46.0,94.199997,31.700001,344.0,249.0,1.025,4.4,10.5,2.4,0.5,65.0,37.5,8.7,4.48,1.92,9.2825,5.77,1.57,16.6,0.22,10.2,250.0,12.6,44.200001,338.0,31.6,95.5,156.0,5.14,0.04,0.19,4.1,0.42,2.39,0.7,3.1,62.200001,6.7,38.900002,6.96
max,252.0,974.0,28.8,4.78,11.54,128.0,56.799999,32.200001,51.799999,145.201996,52.099998,79.699997,19.790001,3.22,2.36,733.0,1642.900024,6303.0,2.0,2.0,2.0,2.0,602.0,12.349,98.900002,448.98999,100.0,1823.180054,1502.23999,169.899994,9.0,5.63,23.01,795.299988,943.0,160.630005,141.199997,65.0,292.0,375.579987,57142.0,199.399994,21507.0,89.5,162.0,38.599998,15.1,6.5,832.0,68.699997,405.0,42.200001,117.800003,60.650002,209.0,7.84,20.25,2.2,11.21,99.300003,20.200001,78.900002,26.1,44.799999,11.0,6.12,0.34,26.5,45.617001,125.0,7.64,24.200001,215.0,64.0,120.300003,42.900002,485.0,610.0,1.03,18.299999,20.1,15.6,4.1,92.199997,85.800003,55.799999,132.246002,46.009998,106.120003,31.34,21.59,25.34,2.08,15.54,704.0,21.700001,78.699997,423.0,40.900002,126.900002,201.0,7.56,0.3,3.35,16.444,1.55,7.97,2.8,40.5,89.0,15.91,64.599998,19.559999


In [31]:
merged_train_df['10004'].sort_values(ascending=True)[:5]

21234   -1799.76001
11101       1.19000
10352       1.38000
19802       1.44000
35939       1.52000
Name: 10004, dtype: float32

In [32]:
merged_train_df.loc[21234, '10004'] = np.nan

In [33]:
merged_train_df['2403'].sort_values(ascending=False)[:5]

21196    57142.000000
24805      154.000000
31084      151.100006
27379      150.100006
25556      147.600006
Name: 2403, dtype: float32

In [34]:
merged_train_df.loc[21196, '2403'] = np.nan

In [35]:
merged_train_df['2405'].sort_values(ascending=False)[:5]

21196    21507.000000
32136       54.700001
8087        49.299999
9416        46.500000
24805       45.000000
Name: 2405, dtype: float32

In [36]:
merged_train_df.loc[21196, '2405'] = np.nan

In [37]:
merged_test_df.describe()

Unnamed: 0,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白,0424,100005,100006,100007,100014,10002,10003,10004,1106,1107,1115,1117,1127,1321,1322,1325,1326,1345,139,143,1474,155,1814,1815,183,1840,1845,1850,190,191,192,193,2174,2333,2372,2403,2404,2405,2406,2420,269003,269004,269005,269006,269007,269008,269009,269010,269011,269012,269013,269014,269015,269016,269017,269018,269019,269020,269021,269022,269023,269024,269025,300017,300021,31,312,313,314,315,316,317,319,3193,32,320,33,34,37,38,39,669001,669002,669006,809001,809009,979001,979002,979003,979004,979005,979006,979007,979008,979009,979011,979012,979013,979014,979015,979016,979017,979018,979019,979020,979021,979022,979023
count,0.0,0.0,0.0,0.0,0.0,6226.0,3249.0,4372.0,3763.0,1215.0,5040.0,4582.0,8653.0,1364.0,1334.0,5736.0,6515.0,1979.0,4171.0,4174.0,1472.0,1470.0,2117.0,1447.0,1291.0,1307.0,2171.0,9457.0,8234.0,5139.0,9151.0,4267.0,8737.0,9344.0,9245.0,7437.0,6426.0,5012.0,4381.0,4324.0,9096.0,9096.0,9051.0,3323.0,2481.0,2015.0,2047.0,1980.0,2016.0,1775.0,2016.0,2016.0,2016.0,1910.0,2016.0,2015.0,2014.0,2045.0,2014.0,2016.0,2047.0,2016.0,2047.0,2047.0,2047.0,2047.0,2047.0,2037.0,1233.0,1434.0,4434.0,4395.0,4371.0,5101.0,4434.0,4434.0,4371.0,4434.0,9123.0,4394.0,4267.0,4363.0,3528.0,4434.0,4434.0,3559.0,1269.0,1289.0,1227.0,1449.0,1257.0,1737.0,1693.0,1744.0,1745.0,1579.0,1669.0,1745.0,1745.0,1745.0,1745.0,1745.0,1711.0,1711.0,1745.0,1745.0,1745.0,1710.0,1710.0,1745.0,1745.0,1745.0,1745.0
mean,,,,,,73.167038,19.044073,14.465755,0.639326,15.469995,9.17766,28.575964,4.850443,1.45643,0.973028,75.305145,34.06464,111.253899,0.818339,0.833682,0.914293,0.91698,173.189865,1.602104,13.921929,93.510193,2.298941,28.194424,23.566305,75.250336,6.103377,1.669909,5.271622,70.620667,334.48056,13.242046,4.036046,46.645569,1.716293,3.249072,66.660454,165.030075,24.349464,57.921127,78.094032,15.254146,9.242012,0.193128,212.887848,46.146084,340.847229,30.804811,90.268356,42.265617,146.61409,4.771128,3.702415,0.394585,1.81257,60.18259,6.520434,30.334812,6.093307,2.711465,0.231568,0.166135,0.012847,13.347914,1.149616,10.04662,4.739432,6.277516,142.969864,39.841846,90.537903,30.242403,334.134521,214.913849,1.020538,3.783976,9.417546,2.078981,0.427678,57.471344,30.652445,7.576245,3.586543,1.506105,7.762427,6.302767,1.59864,15.372101,0.190882,9.261462,213.01651,12.131538,39.58408,325.339447,30.243736,91.740448,142.100388,4.77749,0.032489,0.15158,3.509603,0.353081,2.054545,0.548614,2.464152,56.967426,5.834905,34.241501,6.097507
std,,,,,,8.987348,12.156971,2.661384,3.875273,5.915576,4.14955,4.207769,1.301552,0.263987,0.262275,23.889549,37.607773,67.720604,0.382742,0.376801,0.263051,0.265523,33.743542,0.515138,8.31296,31.781656,2.493092,22.530043,11.285617,4.879272,0.680129,0.29262,1.316012,20.362608,94.957878,5.475301,2.027271,3.21232,1.426009,3.317561,12.759216,8.877811,3.539409,8.171227,11.128667,3.36533,1.485286,0.051775,54.854378,4.678848,15.683137,2.495205,5.961137,7.080872,16.941759,0.521393,1.212102,0.149875,0.586123,8.728309,1.976968,8.167378,1.540953,2.20578,0.298584,0.166724,0.022571,1.68641,1.402315,8.324814,0.515019,1.608065,16.413521,11.87491,6.003174,2.464576,14.95288,55.968987,0.006012,1.250886,1.791144,0.733974,0.203998,13.934575,10.821147,6.206759,2.795492,1.134165,8.325024,3.757018,0.740983,2.341707,0.047923,1.512505,58.509899,1.084208,9.36167,40.298973,2.413367,6.431439,23.411612,0.520832,0.03517,0.146861,1.173711,0.121018,0.599263,0.343242,2.042699,8.060719,1.502377,7.510636,1.556797
min,,,,,,46.0,0.0,7.7,0.042,0.36,0.27,15.57,1.59,0.6,0.0,22.629999,5.0,13.08,0.01,0.01,0.02,0.01,42.0,0.63,0.5,4.47,0.0,0.0,1.0,58.700001,4.5,0.52,2.31,26.0,32.41,2.07,0.1,35.709999,0.01,0.02,0.0,0.0,0.0,-90.0,28.299999,5.0,4.2,0.05,0.9,33.599998,164.0,18.1,59.099998,0.337,71.0,2.82,0.32,0.05,0.33,7.6,1.0,5.0,2.89,0.0,0.0,0.0,0.0,9.5,0.0,0.0,2.92,2.3,62.0,0.26,57.0,15.8,256.0,35.0,1.0,0.5,3.8,0.38,0.1,0.35,0.14,0.02,0.03,0.0,0.1,2.55,0.47,6.8,0.043,4.7,47.0,9.9,11.3,30.1,17.200001,58.099998,8.7,2.88,0.0,0.0,0.88,0.09,0.51,0.0,0.0,33.400002,1.1,12.2,2.22
25%,,,,,,68.0,12.1,12.3225,0.1595,12.605,6.4,25.799999,3.94,1.2775,0.8,58.247499,15.89,71.665001,0.5,0.6,0.8,0.8,152.020004,1.31,9.8,77.389999,1.1215,15.9,18.0,72.0,6.0,1.47,4.62,57.0,264.0,9.5,2.62,44.5,0.92,1.73,57.0,158.575005,21.799999,52.200001,70.0,13.2,8.5,0.16,176.0,42.65,331.0,29.799999,87.574999,39.470001,134.0,4.39,2.86,0.3,1.4,54.200001,5.2,24.700001,5.01,1.3,0.1,0.08,0.0,12.1,0.51,4.37,4.37,5.2,131.0,38.799999,87.699997,29.200001,325.0,179.0,1.015,2.9,8.4,1.6,0.3,53.5,26.299999,4.8,2.06,0.73,3.3,4.64,1.39,15.5,0.159,8.3,174.0,11.4,39.400002,322.0,29.4,89.099998,132.0,4.41,0.02,0.07,2.68,0.27,1.63,0.3,1.2,51.799999,4.8,29.200001,5.05
50%,,,,,,72.0,13.0,15.2,0.202,15.49,8.405,28.5,4.7,1.43,0.96,72.0,23.0,99.0,0.8,0.8,1.0,1.0,169.0,1.56,12.1,94.269997,1.82,22.049999,21.4,75.099998,6.0,1.64,5.03,68.599998,325.380005,12.2,3.685,46.599998,1.49,2.74,65.699997,165.0,24.200001,58.099998,77.0,15.0,9.2,0.19,208.0,46.0,341.0,31.0,90.800003,42.919998,147.0,4.75,3.52,0.38,1.75,60.41,6.3,30.1,5.89,2.2,0.1,0.12,0.01,13.0,0.85,8.667,4.72,6.1,144.0,42.5,91.0,30.5,334.0,212.0,1.02,3.6,9.5,2.0,0.4,59.400002,32.0,6.6,3.14,1.21,5.74,5.1,1.48,16.1,0.186,9.1,209.0,12.0,41.799999,329.0,30.5,92.5,145.0,4.75,0.03,0.11,3.32,0.33,1.99,0.5,1.9,56.900002,5.8,34.200001,5.86
75%,,,,,,78.0,15.0,16.200001,0.26,18.0595,11.2,31.200001,5.6,1.6,1.12,88.0,38.0,129.0,1.0,1.0,1.0,1.0,189.550003,1.84,15.0,111.014999,2.77,33.0,26.0,78.400002,6.5,1.84,5.49,81.0,394.0,15.9,5.0475,48.799999,2.26,4.2,75.0,171.0,26.6,63.5,85.0,16.55,10.1,0.22,244.0,49.599998,350.0,32.200001,94.0,46.400002,159.0,5.13,4.3675,0.48,2.14,66.0,7.7,35.799999,6.98,3.3,0.2,0.2,0.01,14.3,1.35,13.1475,5.08,7.2,155.0,46.099998,94.099998,31.700001,343.0,247.0,1.025,4.4,10.5,2.4,0.5,65.0,37.375001,8.7,4.58,1.98,9.96,5.7,1.56,16.6,0.22,10.125,249.0,12.6,44.099998,338.0,31.6,95.5,156.0,5.11,0.04,0.19,4.1,0.42,2.372,0.7,3.1,62.48,6.7,39.200001,6.98
max,,,,,,126.0,56.700001,30.1,48.200001,97.879997,53.700001,59.900002,17.280001,2.91,3.39,392.0,941.119995,1238.0,2.0,2.0,2.0,1.5,642.0,9.02,111.800003,290.369995,50.040001,653.0,307.600006,136.399994,9.0,3.68,21.33,910.179993,830.0,71.5,51.0,80.970001,60.02,141.800003,134.0,195.5,43.299999,81.0,132.0,40.599998,16.1,0.59,560.0,60.099998,461.0,43.099998,113.0,61.240002,201.0,6.97,12.9,1.8,4.68,85.900002,30.799999,62.5,16.9,31.1,2.1,3.59,0.4,24.1,25.76,86.099998,7.65,15.99,200.0,66.099998,111.199997,43.099998,452.0,833.0,1.03,11.95,18.9,11.7,1.5,89.199997,79.800003,59.0,53.529999,9.65,151.199997,26.700001,10.03,30.0,0.51,16.700001,618.0,21.4,58.099998,393.0,37.400002,118.199997,202.0,7.01,1.1,2.577,10.54,0.97,6.04,2.6,27.559999,84.699997,16.799999,60.200001,13.69


In [38]:
merged_train_df.to_csv('data_train.csv')
merged_test_df.to_csv('data_test.csv')