# Data Preparation

In [3035]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3036]:
# Load the raw training table from its GBK-encoded CSV and keep an untouched
# copy of it so later in-place edits to `data` can be compared against it.
train_filename = 'data1/d_train_20180102.csv'
data = pd.read_csv(train_filename, encoding='gbk', sep=',')
datacopy = data.copy(deep=True)

In [3037]:
# Map every original column header to a positional alias "P<n>", where n is the
# zero-based position of the column in the training frame. `dic` is reused
# later to rename the test frame with the same aliases.
col = len(data.columns)
dic = {header: "P" + str(position) for position, header in enumerate(data.columns)}

# First four columns get readable names, the last one is the target, and
# everything in between takes its positional alias.
data.columns = (
    ['id', 'sex', 'age', 'date']
    + [dic[header] for header in data.columns[4:-1]]
    + ['sugar']
)

In [3038]:
# Encode the sex labels numerically and parse the date column.
# "男" -> 1, "女" -> 0, and the "??" marker becomes a missing value.
sex_codes = {"男": 1, "女": 0, "??": np.nan}
for sex_label, sex_code in sex_codes.items():
    data.loc[data['sex'] == sex_label, 'sex'] = sex_code

data['date'] = pd.to_datetime(data['date'])

In [3039]:
#Fill NA values
# Snapshot of the frame with its missing values still in place, taken before
# any imputation happens.
data_incomp = data.copy()

# Replace missing entries in columns P4..P40 with the mean of their column.
feature_block = data.loc[:, "P4":"P40"]
data.loc[:, "P4":"P40"] = feature_block.fillna(feature_block.mean())

# Missing sex is encoded as -1. The result is assigned back to the frame
# because `data.sex.fillna(..., inplace=True)` acts on an intermediate object:
# it emits a warning on recent pandas and leaves `data` unchanged when
# Copy-on-Write is enabled.
data['sex'] = data['sex'].fillna(-1)

In [3040]:
# Index labels of the rows where P4, P5 and P7 are all above 100 and P12 is
# above 1. The labels are only collected here; the drop below stays disabled.
extreme_rows = (
    (data['P4'] > 100)
    & (data['P5'] > 100)
    & (data['P7'] > 100)
    & (data['P12'] > 1)
)
i = data.loc[extreme_rows].index
#data = data.drop(i)

# Summary statistics truncated to integers, one row per column.
data.describe().astype(np.int64).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,5642,2866,1655,1,1433,2870,4302,5732
sex,5642,0,0,-1,0,1,1,1
age,5642,45,12,3,35,45,54,93
P4,5642,26,12,10,21,26,27,434
P5,5642,27,20,0,16,26,28,498
P6,5642,87,22,22,74,87,95,374
P7,5642,38,36,6,19,33,38,736
P8,5642,76,3,57,74,76,78,100
P9,5642,45,2,29,44,45,47,54
P10,5642,30,3,7,29,30,32,66


# Data Modeling

In [3041]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet, ElasticNetCV, Lars, LassoCV, LinearRegression, MultiTaskElasticNet

In [3042]:
# Modelling frame: everything except the identifier and the date.
train_data_org = data.drop(['id', 'date'], axis=1)

# 70/30 hold-out split. Features are all columns up to and including P40; the
# target is kept as a one-column frame. A fixed random_state makes the split,
# and therefore the scores reported further down, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_org.loc[:, :"P40"],
    train_data_org.loc[:, ['sugar']],
    test_size=0.3,
    random_state=42,
)

In [3043]:
def _subset_by_sex(features, target, sex_code):
    """Return the (features, target) rows whose 'sex' value equals `sex_code`.

    The feature part keeps the columns up to and including "P40"; the target
    part keeps only the 'sugar' column, as a one-column frame.
    """
    chosen = features['sex'] == sex_code
    return features.loc[chosen].loc[:, :"P40"], target.loc[chosen].loc[:, ['sugar']]


# Separate train/test partitions for sex == 1 (suffix _m) and sex == 0 (suffix _f).
X_train_m, y_train_m = _subset_by_sex(X_train, y_train, 1)
X_test_m, y_test_m = _subset_by_sex(X_test, y_test, 1)
X_train_f, y_train_f = _subset_by_sex(X_train, y_train, 0)
X_test_f, y_test_f = _subset_by_sex(X_test, y_test, 0)

In [3044]:
# Two identically configured ElasticNet regressors, one per sex group.
# Earlier experiments in this cell (an ElasticNetCV with l1_ratio=0.2 / cv=10
# and a plain LinearRegression) were parked in string literals; the fixed-alpha
# ElasticNet below is the configuration that was kept.
#
# The hyper-parameters live in one dict so both models cannot drift apart.
# max_iter is written as an int: newer scikit-learn releases validate
# constructor parameters and reject a float such as 1e4.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2. It is kept here to preserve the original behaviour; on a newer release
# it has to be replaced by an explicit scaling step - confirm the installed
# version before re-running.
eln_params = dict(
    alpha=1e-5, l1_ratio=0.2, fit_intercept=True,
    normalize=True, precompute=False, max_iter=10000,
    copy_X=True, tol=1e-5, warm_start=False,
    positive=False, random_state=None, selection='cyclic',
)

eln_m = ElasticNet(**eln_params)
eln_f = ElasticNet(**eln_params)

In [3045]:
# Fit each per-sex model on its own training partition and predict its
# held-out partition.
eln_m.fit(X_train_m, y_train_m['sugar'])
y_test_m_pred = eln_m.predict(X_test_m)

eln_f.fit(X_train_f, y_train_f['sugar'])
y_test_f_pred = eln_f.predict(X_test_f)

# Report half the mean squared error for each group (first sex == 1, then sex == 0).
print(mean_squared_error(y_test_m, y_test_m_pred) / 2)
print(mean_squared_error(y_test_f, y_test_f_pred) / 2)

1.06660813261
0.479078629074


In [3046]:
# test model
# Load the test table and run it through the same preparation steps as the
# training frame.
test_filename = 'data1/d_test_A_20180102.csv'
data_test = pd.read_csv(test_filename, sep=',', encoding='gbk')

# Reuse the header -> "P<n>" aliases built from the training columns. Every
# column after the first four is renamed, i.e. no trailing target column is
# expected in this file.
data_test.columns = ['id', 'sex', 'age', 'date'] + [dic[header] for header in data_test.columns[4:]]

# Same sex encoding as the training frame. The "??" marker is turned into a
# missing value here as well, otherwise it would survive as a string and the
# fillna(-1) below could never catch it.
data_test.loc[data_test['sex'] == "男", 'sex'] = 1
data_test.loc[data_test['sex'] == "女", 'sex'] = 0
data_test.loc[data_test['sex'] == "??", 'sex'] = np.nan

data_test = data_test.drop(['id', 'date'], axis=1)

# Mean-impute the measurement columns.
# NOTE(review): these means come from the test set itself, while the models
# were trained on data imputed with training-set means - consider reusing the
# training means here.
test_feature_block = data_test.loc[:, "P4":"P40"]
data_test.loc[:, "P4":"P40"] = test_feature_block.fillna(test_feature_block.mean())

# Assign the result back instead of calling fillna(inplace=True) on the
# attribute: the in-place form acts on an intermediate object and leaves the
# frame unchanged when pandas Copy-on-Write is enabled.
data_test['sex'] = data_test['sex'].fillna(-1)

In [3047]:
# Fill the 'sugar' column of the test frame with the prediction of the model
# that matches each row's sex code (1 -> eln_m, 0 -> eln_f). Rows with any
# other sex value are not assigned a prediction here.
for group_code, group_model in ((1, eln_m), (0, eln_f)):
    group_rows = data_test['sex'] == group_code
    group_features = data_test.loc[group_rows].loc[:, :"P40"]
    data_test.loc[group_rows, 'sugar'] = group_model.predict(group_features)

In [3048]:
# Write the predictions to predict.csv.
# `data_test_sugar` is not assigned by any cell visible in this notebook, so a
# fresh kernel would fail here with a NameError. It is taken from the 'sugar'
# column that the prediction step stores on `data_test`.
# NOTE(review): presumably this is what the missing definition held - confirm.
data_test_sugar = data_test['sugar']
pd.DataFrame(data_test_sugar).to_csv("predict.csv", sep=',')