In [211]:
import pandas as pd
import numpy as np
import random as rnd
import re

import seaborn as sns
import matplotlib.pyplot as plt
import math
%matplotlib inline

In [2]:
train_df = pd.read_csv('../data_train.csv', low_memory=False, index_col=0)
test_df = pd.read_csv('../data_test.csv', low_memory=False, index_col=0)

In [7]:
print(train_df.shape, test_df.shape)
print(train_df.describe().shape, test_df.describe().shape)

# unique values of each feature
# for i in range(0,10):
#     print(i, train_df.iloc[:, i].unique().size)
# # print(train_df.iloc[:, i].dtype)

((38199, 262), (9538, 262))
((8, 18), (8, 43))


### numerical and null propotion of each feature

In [168]:
def isFloat(aString):
    try:
        float(aString)
        return True
    except:
        return False

isnumerical_df = train_df.applymap(lambda x: isFloat(x))
numerical_prop = (isnumerical_df.sum() - train_df.isnull().sum())/(train_df.shape[0] - train_df.isnull().sum())
null_prop = train_df.isnull().sum() / train_df.shape[0]

0     136.00
1     188.00
2        176
3        137
4        144
5        162
6        NaN
7     163.00
8        NaN
9        NaN
10       NaN
Name: 313, dtype: object


### select numerical features by numerical and null propotion

In [197]:
numeric_features = []
for i in range(train_df.shape[1]):
    if numerical_prop[i] > 0.8 and null_prop[i] < 0.8:
        numeric_features.append(i)
print('number of numeric features: ' + str(len(numeric_features)))

number of numeric features: 75


**look at a specific feature**

In [262]:
i = 13
print('featue id: %s, numerical proportion: %.5f, nan proportion: %.5f' % 
      (train_df.columns[numeric_features[i]], numerical_prop[numeric_features[i]], null_prop[numeric_features[i]]))
print('unique values:', train_df.iloc[:, numeric_features[i]].unique())

featue id: 1117, numerical proportion: 0.99877, nan proportion: 0.31713
('unique values:', array([  23.  ,  162.  ,  115.  , ...,  201.4 ,   43.28,   46.09]))


In [263]:
def toFloat(aString):
    try:
        result = float(aString)
        return result
    except:
        gro = re.search(r'(-?\d+\.?\d+)', aString)
        if gro:
            return float(gro.group(0))
        else:
            return float('NaN')


In [275]:
# update train/test set
train_df_new = train_df
for feature_idx in numeric_features[5:]:
    train_df_new.iloc[:, feature_idx] = train_df.iloc[:, feature_idx].map(toFloat)
    
test_df_new = test_df
for feature_idx in numeric_features[5:]:
    feature_id = train_df.columns[feature_idx]
    test_df_new.loc[:, feature_id] = test_df.loc[:, feature_id].map(toFloat)

In [264]:
train_df_new.describe()

Unnamed: 0,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白,0424,100005,100006,100007,10002,...,3193,32,320,33,34,37,38,39,669001,809009
count,38191.0,38190.0,38158.0,38199.0,38199.0,30245.0,12899.0,17473.0,15043.0,20109.0,...,36572.0,17553.0,16938.0,17409.0,13957.0,17712.0,17712.0,14101.0,5289.0,4858.0
mean,126.052918,77.023619,1.612536,1.406683,2.769719,72.601377,18.969464,14.448569,0.687269,9.183461,...,1.020535,3.768755,9.410126,2.076115,0.426795,57.697587,30.840653,7.730919,3.513815,1.62008
std,19.265082,13.385752,1.335518,0.341184,0.852217,9.175251,12.073726,2.659313,4.187223,4.083423,...,0.005984,1.258986,1.79191,0.737653,0.209125,13.335674,10.749796,6.360497,3.364076,0.803286
min,69.0,37.0,0.1,0.15,-1.22,0.64,4.0,0.0,0.0,0.14,...,1.003,0.43,0.0,0.2,0.09,0.31,0.12,0.01,0.01,0.37
25%,112.0,68.0,0.88,1.16,2.18,66.0,12.1,12.3,0.159,6.4,...,1.015,2.9,8.4,1.6,0.3,53.5,26.5,4.8,2.02,1.4
50%,124.0,76.0,1.27,1.35,2.69,72.0,13.0,15.2,0.201,8.5,...,1.02,3.6,9.5,2.0,0.4,59.3,32.1,6.7,3.09,1.48
75%,137.0,85.0,1.9,1.6,3.26,78.0,15.0,16.2,0.259,11.13,...,1.025,4.4,10.5,2.4,0.5,65.0,37.5,8.7,4.48,1.57
max,252.0,974.0,28.8,4.78,11.54,148.0,56.8,32.2,51.8,52.1,...,1.03,18.3,20.1,15.6,4.1,92.2,85.8,55.8,132.246,21.59


## 简单模型

In [269]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

feature = train_df_new.describe().columns.values.tolist()[5:]
label = train_df_new.describe().columns.values.tolist()[0:5]
X_train = train_df_new.loc[:,feature].fillna(0)
Y_train = train_df_new.loc[:,label].fillna(0)
X_test = test_df_new.loc[:,feature].fillna(0)
X_train.shape, Y_train.shape, X_test.shape

((38199, 72), (38199, 5), (9538, 72))

### 决策树

In [270]:
regr = DecisionTreeRegressor()
regr.fit(X_train, Y_train)
acc_decision_tree = round(regr.score(X_train, Y_train) * 100, 2)
acc_decision_tree

100.0

In [271]:
Y_pred_regr = regr.predict(X_test)
Y_pred_regr_df = pd.DataFrame(Y_pred_regr, columns=label)
Y_pred_regr_df.head()

Unnamed: 0,收缩压,舒张压,血清甘油三酯,血清高密度脂蛋白,血清低密度脂蛋白
0,130.0,88.0,1.46,1.61,2.3
1,139.0,83.0,1.09,1.38,1.99
2,157.0,93.0,2.27,1.39,3.71
3,106.0,60.0,0.65,1.47,1.98
4,127.0,88.0,2.1,1.8,3.72


### 随机森林

In [274]:
from sklearn.cross_validation import train_test_split
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

random_forest = RandomForestRegressor(n_estimators=100)
random_forest.fit(X_train1, Y_train1)
Y_pred = random_forest.predict(X_test1)
random_forest.score(X_train1, Y_train1)
acc_random_forest = round(random_forest.score(X_train1, Y_train1) * 100, 2)
acc_random_forest

89.17