In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import skew
import matplotlib.pyplot as plt

In [86]:
# Read the training and test tables from CSV files in the working directory.
train, test = (
    pd.read_csv(csv_name) for csv_name in ('height_train.csv', 'height_test.csv')
)
# Preview the first rows of the training table.
train.head()

Unnamed: 0,id,father_height,mother_height,boy_dummy,child_height
0,0,1.76,1.6,0,1.66
1,1,1.71,1.63,1,1.76
2,2,1.7,1.66,0,1.67
3,3,1.68,1.61,1,1.75
4,4,1.72,1.7,1,1.76


In [87]:
# Stack train and test so that feature engineering is applied to both at once.
# The recorded FutureWarning shows the two frames' columns are not aligned, so
# the column order of the result depends on `sort`. Passing sort=True keeps the
# alphabetical order shown in the recorded output on every pandas version and
# silences that warning.
# `id` and the target `child_height` are not model inputs, so they are dropped;
# the drop is chained (no inplace) so `full` is built in one idempotent step.
full = (
    pd.concat([train, test], ignore_index=True, sort=True)
    .drop(['id', 'child_height'], axis=1)
)
full.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,boy_dummy,father_height,mother_height
0,0,1.76,1.6
1,1,1.71,1.63
2,0,1.7,1.66
3,1,1.68,1.61
4,1,1.72,1.7


In [88]:
class add_feature(BaseEstimator, TransformerMixin):
    """Transformer that appends hand-crafted height features to a DataFrame.

    Parameters
    ----------
    additional : int, default 1
        With ``1`` only the two sex-interaction columns (``father_dummy`` and
        ``mother_dummy``) are added. With any other value the sum, product and
        squared-height columns are added as well.
    """

    def __init__(self, additional=1):
        self.additional = additional

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Add the feature columns to ``X`` and return it.

        ``X`` must provide ``father_height``, ``mother_height`` and
        ``boy_dummy`` columns. The new columns are written into ``X`` itself
        (no copy is made), and that same object is returned.
        """
        father = X.father_height
        mother = X.mother_height
        boy = X.boy_dummy

        # Interaction terms: parent height multiplied by the boy indicator.
        X['father_dummy'] = father * boy
        X['mother_dummy'] = mother * boy

        if not (self.additional == 1):
            # Extended feature set: sum, product and squares of the heights.
            X['father_mother'] = father + mother
            X['father_mother_mul'] = father * mother
            X['father_2'] = father * father
            X['mother_2'] = mother * mother
            # Cubic terms are currently disabled:
            # X['father_3'] = X.father_height * X.father_height * X.father_height
            # X['mother_3'] = X.mother_height * X.mother_height * X.mother_height
        return X

In [89]:
# add_feature.transform writes the new columns into the frame it receives and
# returns that same frame; rebinding `full` makes the data lineage explicit.
full = add_feature(additional=2).fit_transform(full)
full.head()

Unnamed: 0,boy_dummy,father_height,mother_height,father_dummy,mother_dummy,father_mother,father_mother_mul,father_2,mother_2
0,0,1.76,1.6,0.0,0.0,3.36,2.816,3.0976,2.56
1,1,1.71,1.63,1.71,1.63,3.34,2.7873,2.9241,2.6569
2,0,1.7,1.66,0.0,0.0,3.36,2.822,2.89,2.7556
3,1,1.68,1.61,1.68,1.61,3.29,2.7048,2.8224,2.5921
4,1,1.72,1.7,1.72,1.7,3.42,2.924,2.9584,2.89


In [90]:
# Sample skewness of every feature column.
full.apply(skew)

boy_dummy           -0.028670
father_height        0.172782
mother_height       -0.102162
father_dummy        -0.024197
mother_dummy        -0.024209
father_mother        0.016647
father_mother_mul    0.075891
father_2             0.255062
mother_2            -0.021984
dtype: float64

In [91]:
# `train` was concatenated first, so the first len(train) rows of `full` are
# the training rows and the remaining rows are the test rows.
n_train = len(train)
X, testX = full.iloc[:n_train], full.iloc[n_train:]
y = train['child_height']

# Print the shape of each piece as a sanity check.
for part in (X, y, testX):
    print(part.shape)

(5000, 9)
(5000,)
(1000, 9)


#### LinearRegression

In [92]:
# Preview the training feature matrix that the models below are fitted on.
X.head()

Unnamed: 0,boy_dummy,father_height,mother_height,father_dummy,mother_dummy,father_mother,father_mother_mul,father_2,mother_2
0,0,1.76,1.6,0.0,0.0,3.36,2.816,3.0976,2.56
1,1,1.71,1.63,1.71,1.63,3.34,2.7873,2.9241,2.6569
2,0,1.7,1.66,0.0,0.0,3.36,2.822,2.89,2.7556
3,1,1.68,1.61,1.68,1.61,3.29,2.7048,2.8224,2.5921
4,1,1.72,1.7,1.72,1.7,3.42,2.924,2.9584,2.89


In [93]:
# Fit a plain linear regression on all engineered features.
model = LinearRegression().fit(X, y)

# 2. Call the fitted model's predict() directly on the test features.
predY = model.predict(testX)

# Validation-set score: 1.170883722882294
# np.square(predY*100 - np.array(valY.values.tolist())*100).mean()
result_lin_reg = pd.DataFrame({'id': test.id, 'prediction': predY})
result_lin_reg.to_csv('吴加清.csv', index=False)
# 1.0331128641782532  (presumably the score of this submission; TODO confirm)

In [94]:
# Inspect the fitted coefficients (one per column of X, in column order).
# NOTE(review): father_mother is defined as father_height + mother_height, so
# those three columns are exactly collinear. Offsetting coefficients of huge
# magnitude on them are a symptom of that; consider dropping father_mother.
model.coef_

array([ 8.58745688e-02, -1.04356325e+11, -1.04356325e+11,  1.03851318e-01,
       -1.01760864e-01,  1.04356325e+11,  7.04060197e-02,  4.75031137e-03,
       -1.16656125e-02])

#### RandomForestRegressor

In [17]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
# A random forest is stochastic (bootstrap samples, random feature subsets),
# so random_state is pinned to make the predictions and the submission file
# reproducible across kernel restarts.
model = RandomForestRegressor(random_state=42)
model.fit(X=X, y=y)
predY = model.predict(testX)
result_rf = pd.DataFrame({'id':test.id, 'prediction':predY})
result_rf.to_csv('吴加清_rf.csv', index=False)
# 1.2169155350965841  (recorded before the seed was pinned; presumably this
# model's score. TODO confirm)



#### Ridge

In [22]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.kernel_ridge import KernelRidge

In [23]:
class grid():
    """Small helper that grid-searches the hyper-parameters of one estimator."""

    def __init__(self, model):
        # Estimator whose hyper-parameters are searched by grid_get().
        self.model = model

    def grid_get(self, X, y, param_grid):
        """Run a 5-fold grid search and print the best parameters and score.

        The search is scored with ``neg_mean_squared_error``. Nothing is
        returned; the best parameter combination and its mean
        cross-validated score are printed on one line.
        """
        search = GridSearchCV(
            estimator=self.model,
            param_grid=param_grid,
            cv=5,
            scoring='neg_mean_squared_error',
        )
        search.fit(X, y)
        best_params, best_score = search.best_params_, search.best_score_
        print(best_params, best_score)

In [29]:
# Candidate regularisation strengths for Ridge, from very weak to very strong.
param_grid = {
    'alpha': [0.0001, 0.005, 0.01, 0.04, 0.08, 0.1, 1, 10, 30, 50, 70, 90, 110],
}
ridge_search = grid(Ridge())
ridge_search.grid_get(X, y, param_grid)

{'alpha': 0.04} -0.0001107130546730021


In [31]:
# Candidate regularisation strengths for Lasso (same values as for Ridge).
param_grid = {
    'alpha': [0.0001, 0.005, 0.01, 0.04, 0.08, 0.1, 1, 10, 30, 50, 70, 90, 110],
}
lasso_search = grid(Lasso())
lasso_search.grid_get(X, y, param_grid)

{'alpha': 0.0001} -0.00011762988614521537


In [33]:
# ElasticNet's l1_ratio is the L1/L2 mixing weight and must lie in [0, 1].
# The alpha grid (which runs up to 110) had been reused for it, so most of the
# combinations were invalid. Only valid mixing weights are searched now: the
# original values that were already <= 1, plus a few points spread across the
# rest of the range.
param_grid = {'alpha':[0.0001, 0.005, 0.01, 0.04, 0.08, 0.1, 1, 10, 30, 50, 70, 90, 110],
              'l1_ratio':[0.0001, 0.005, 0.01, 0.04, 0.08, 0.1, 0.3, 0.5, 0.7, 0.9, 1]}
grid(ElasticNet()).grid_get(X, y, param_grid)

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


{'alpha': 0.0001, 'l1_ratio': 0.0001} -0.00011182018807326558


  positive)


In [38]:
# Compare a linear kernel with a degree-3 polynomial kernel over a few ridge
# penalties (alpha) and polynomial offsets (coef0).
kernel_ridge_search = grid(KernelRidge())
param_grid = {
    'alpha': [0.01, 0.04, 0.08, 0.1],
    'degree': [3],
    'coef0': [0.5, 1, 1.5],
    'kernel': ['linear', 'polynomial'],
}
kernel_ridge_search.grid_get(X, y, param_grid)

{'alpha': 0.01, 'coef0': 1.5, 'degree': 3, 'kernel': 'polynomial'} -0.00011089406056422802


In [51]:
# Refit Ridge on the whole training set with the alpha picked by the grid
# search above, then write the test-set predictions as a submission file.
model = Ridge(alpha=0.04).fit(X, y)
predY = model.predict(testX)
result_ridge = pd.DataFrame({'id': test.id, 'prediction': predY})
result_ridge.to_csv('吴加清_ridge.csv', index=False)

#### KernelRidge

In [50]:
# Refit KernelRidge with the best grid-search combination (polynomial kernel,
# degree 3) and write the test-set predictions as a submission file.
model = KernelRidge(alpha=0.01, coef0=1.5, degree=3, kernel='polynomial').fit(X, y)
predY = model.predict(testX)
result_kr = pd.DataFrame({'id': test.id, 'prediction': predY})
result_kr.to_csv('吴加清_kr.csv', index=False)

#### LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# trainX / trainY / valX / valY are used from this cell onwards, but no visible
# cell creates them, so a fresh kernel would stop here with a NameError.
# Build the hold-out split here so the notebook runs top to bottom.
# NOTE(review): the 80/20 ratio and the seed are assumptions, since the
# original split is not recorded in the notebook. Adjust if needed.
trainX, valX, trainY, valY = train_test_split(X, y, test_size=0.2, random_state=42)

# Cross-validate the boosting settings; with early stopping, the length of the
# returned metric history is the number of boosting rounds that were kept.
params = {'boosting_type':'gbdt', 'learning_rate':0.1, 'min_child_weight':1, 'max_depth':5,
          'gamma':0.1, 'subsample':0.8, 'colsample_bytree':0.8, 'reg_alpha':0., 'reg_lambda':0.,
          'objective':'regression'}
dmat = lgb.Dataset(trainX, label=trainY)
cv = lgb.cv(params, dmat, num_boost_round=1000, nfold=5, stratified=False, early_stopping_rounds=50, verbose_eval=1, metrics='rmse')
len(cv['rmse-mean'])

In [None]:
# Baseline LightGBM regressor whose min_child_weight is tuned below.
# NOTE(review): `gamma` looks like an XGBoost-style parameter name, and
# `feature_fraction` appears to duplicate `colsample_bytree`. Confirm both
# against the LightGBM parameter documentation.
lgbr = lgb.LGBMRegressor(learning_rate=0.1, min_child_weight=1, max_depth=5, gamma=0.1, 
                        subsample=0.8, colsample_bytree=0.8, reg_alpha=0., reg_lambda=0.,
                        objective='regression', num_leaves=32, n_estimators=63, min_child_samples=20,
                        feature_fraction=0.8)
# Candidate min_child_weight values to search over.
param_grid = {'min_child_weight':[1,2,3,4,5,6,7,8,9,10]}
# No `cv` or `scoring` is passed, so GridSearchCV's defaults apply here. This
# differs from the grid() helper, which uses 5 folds and negative MSE.
gridCV = GridSearchCV(lgbr, param_grid=param_grid)
gridCV.fit(trainX, trainY)
print('best parameter:', gridCV.best_params_)
print('best score:', gridCV.best_score_)

In [None]:
# Fit the LightGBM regressor with the chosen settings and score it on the
# validation split.
lgbr = lgb.LGBMRegressor(
    learning_rate=0.2,
    min_child_weight=1,
    max_depth=3,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=0.0,
    objective='regression',
    num_leaves=32,
    n_estimators=63,
    min_child_samples=20,
    feature_fraction=0.8,
)
lgbr.fit(trainX, trainY)
predY = lgbr.predict(valX)
# Validation-set score: 1.170883722882294
# Mean squared error, with predictions and targets both scaled by 100.
np.square(predY * 100 - np.array(valY.tolist()) * 100).mean()

#### NN

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Activation

In [None]:
# Regression network with a single hidden layer.
# The input width is read from the training matrix instead of being hard-coded
# to 5, so the network still builds when the engineered feature set changes.
model = Sequential()
model.add(Dense(32, input_shape=(trainX.shape[1],)))
model.add(Activation('relu'))
# model.add(Dense(16))
# model.add(Activation('relu'))
model.add(Dense(1))
model.compile(loss='mse', optimizer='sgd', metrics=['mse'])
# Keep the History object so the loss curves can be plotted afterwards.
H = model.fit(trainX, trainY, epochs=100, batch_size=32, validation_data=(valX, valY), verbose=1)

In [None]:
# For the Dense(1) output layer predict() returns a column vector of shape
# (n, 1), while the targets are 1-D with shape (n,). Subtracting them directly
# broadcasts to an (n, n) matrix, so the mean would run over every
# prediction/target pair instead of matching pairs. Flatten the predictions
# first so the squared errors are element-wise.
predY = model.predict(valX, batch_size=32).ravel()
np.square(predY*100 - np.array(valY.values.tolist())*100).mean()

In [None]:
# Plot the validation loss per epoch. The x-axis is derived from the recorded
# history rather than a hard-coded range(1, 101), so it stays in step with the
# `epochs` value used for training.
val_loss = H.history['val_loss']
epochs = range(1, len(val_loss) + 1)

plt.figure()
plt.plot(epochs, val_loss, label='val_loss')
# plt.plot(epochs, H.history['loss'], label='loss')
# plt.plot(epochs, H.history['val_mean_squared_error'], label='val_error')
# plt.plot(epochs, H.history['mean_squared_error'], label='error')
plt.xlabel('epoch')
plt.ylabel('loss (mse)')
plt.legend()
plt.show()

In [None]:
# Build the submission from predictions on the test features. `predY` holds
# validation-set predictions, whose length does not match test.id, and the
# (n, 1) network output has to be flattened to a 1-D column.
# NOTE(review): assumes `model` is still the network trained above. Confirm
# when running cells out of order.
test_pred = model.predict(testX, batch_size=32).ravel()
result = pd.DataFrame({'id':test.id, 'prediction':test_pred})
# Write to a separate file so the linear-regression submission already saved
# as '吴加清.csv' is not overwritten.
result.to_csv('吴加清_nn.csv', index=False)
# Show only the first rows instead of dumping the whole frame.
result.head()

In [None]:
def evaluate(prediction_path,real_path):
    """Score a prediction CSV against a ground-truth CSV.

    Parameters
    ----------
    prediction_path : path of a CSV with at least ``id`` and ``prediction``.
    real_path : path of a CSV with at least ``id`` and ``child_height``.

    Returns
    -------
    The mean of ``(prediction*100 - child_height*100) ** 2`` over the
    ground-truth rows, matched to predictions by ``id``.

    NOTE(review): the join is a left join from the ground truth, so an id
    without a prediction gets NaN and is skipped by ``.mean()`` rather than
    penalised. Confirm that this is intended.
    """
    submitted = pd.read_csv(prediction_path)[['id', 'prediction']]
    truth = pd.read_csv(real_path)[['id', 'child_height']]
    scored = pd.merge(truth, submitted, on='id', how='left')
    squared_error = np.square(scored['prediction'] * 100 - scored['child_height'] * 100)
    return squared_error.mean()

### 隗同学的思路
我就加了两列，把boy_dummy和father_height还有mother_height分别乘了一下。
大致想法就是既然考虑性别了，那就也可以认为不同性别对父母依赖也不一样，所以加入这两列就等于考虑了这个因素。

In [None]:
# NOTE(review): leftover interactive lookup. `Lasso?` is IPython help syntax,
# and Lasso is already imported in the Ridge section, so this cell can
# probably be removed from the finished notebook.
from sklearn.linear_model import Lasso
Lasso?