In [1]:
# Data Preprocessing - version 1

In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model, metrics, ensemble
from sklearn import model_selection
from xgboost import XGBClassifier
import os

os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
def read_file(filename):
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    filename : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        A copy of the frame parsed from ``filename``.
    """
    return pd.read_csv(filename).copy()

In [3]:
# drop features
def select_same_resp_feature(X_train):
    """Return the names of columns that contain a single distinct value.

    Distinct values are counted with ``np.unique`` on each column.  The
    number of selected columns is printed before the list is returned.

    Parameters
    ----------
    X_train : pandas.DataFrame

    Returns
    -------
    list
        Column names, in the frame's column order.
    """
    constant_cols = [
        col for col in X_train.columns
        if np.unique(X_train[col]).size == 1
    ]
    print(len(constant_cols))
    return constant_cols

def select_allocation_flag(X_train):
    """Collect the column names treated as allocation flags.

    A column is selected when the first two characters of its name are
    ``'PX'`` or ``'HX'``.  Three fixed names (``'PRCITFLG'``, ``'PRWERNAL'``,
    ``'PRHERNAL'``) are always appended, whether or not they occur in
    ``X_train``.  The length of the resulting list is printed.

    Parameters
    ----------
    X_train : pandas.DataFrame

    Returns
    -------
    list
        Prefix-matched column names followed by the three fixed names.
    """
    prefixes = ('PX', 'HX')
    flagged = [name for name in X_train.columns if name[:2] in prefixes]
    flagged.extend(['PRCITFLG', 'PRWERNAL', 'PRHERNAL'])
    print(len(flagged))
    return flagged

# def select_low_resp_feature(X_train):
#     for line in feature3:
#     print(np.argmax(np.bincount(line)))

# drop useless features together
def drop_features(X_train):
    """Drop constant and allocation-flag columns from ``X_train`` in place.

    The columns to remove are the union of ``select_same_resp_feature`` and
    ``select_allocation_flag``.  The resulting shape is printed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Modified in place.

    Returns
    -------
    None
    """
    same_resp_feature = select_same_resp_feature(X_train)
    allocation_flag_feature = select_allocation_flag(X_train)
    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (a set would give an arbitrary one).
    to_drop = list(dict.fromkeys(same_resp_feature + allocation_flag_feature))
    # select_allocation_flag always appends three hard-coded names; without
    # errors='ignore' the drop raises KeyError when any of them is absent.
    X_train.drop(columns=to_drop, inplace=True, errors='ignore')
    print(X_train.shape)
    

In [4]:
def train_data_preprocessing(filename):
    """Load the training CSV and return raw and reduced feature arrays.

    Parameters
    ----------
    filename : path of a CSV file that contains a ``'target'`` column.

    Returns
    -------
    tuple
        ``(X_raw, y, X_reduced, dropped)`` where ``X_raw`` holds every
        non-target column, ``y`` the ``'target'`` column, ``X_reduced`` is
        ``X_raw`` without the columns reported by
        ``select_same_resp_feature``, and ``dropped`` is that list of names.
        The first three items are numpy arrays.
    """
    frame = read_file(filename)
    labels = frame['target']
    features = frame.drop(columns='target')

    # Columns with a single distinct value are removed for the reduced view.
    to_drop_features = select_same_resp_feature(features)
    reduced = features.drop(columns=to_drop_features)

    return features.values, labels.values, reduced.values, to_drop_features

def test_data_preprocessing(filename, to_drop_features):
    """Load the test CSV and return raw and reduced feature arrays.

    Parameters
    ----------
    filename : path of the CSV file to load.
    to_drop_features : list
        Column names to remove for the reduced view.

    Returns
    -------
    tuple
        ``(X_raw, X_reduced)`` as numpy arrays; ``X_reduced`` omits the
        columns named in ``to_drop_features``.
    """
    frame = read_file(filename)
    reduced = frame.drop(columns=to_drop_features)
    return frame.values, reduced.values

In [5]:
# Load the raw train/test CSVs as DataFrames.
train_path, test_path = "train_2008.csv", "test_2008.csv"
train_data, test_data = read_file(train_path), read_file(test_path)

# Separate the label column from the training features.
X_train = train_data.drop(columns='target')
y_train = train_data['target']
# X_test is the same object as test_data (no copy is made here).
X_test = test_data

In [6]:
# Remove, from both frames, the features that take a single value in the
# training data.
same_resp_feature = select_same_resp_feature(X_train)
for frame in (X_train, X_test):
    # In-place so the existing X_train / X_test objects are modified.
    frame.drop(columns=same_resp_feature, inplace=True)
print(same_resp_feature)

15
['HRMONTH', 'HRYEAR4', 'HUTYPEA', 'HUTYPC', 'HRINTSTA', 'PEAFNOW', 'PRPERTYP', 'PULKDK4', 'PULKDK5', 'PULKDK6', 'PULKPS4', 'PULKPS5', 'PULKPS6', 'HXPHONEO', 'PXAGE']


In [7]:
# Map every negative response to -1.
# Series.mask is vectorised; it replaces the per-element Python lambda that
# was previously run through .apply for every cell of both frames.
for feature in X_train.columns:
    X_train[feature] = X_train[feature].mask(X_train[feature] < 0, -1)
    X_test[feature] = X_test[feature].mask(X_test[feature] < 0, -1)

# Per-feature fraction of training rows holding a non-negative value.
response_rates = X_train[X_train >= 0].count() / len(X_train)
# Features answered (>= 0) in fewer than 1% of the training rows.
mostly_blank_feats = response_rates[response_rates < 0.01].index.tolist()
print(len(mostly_blank_feats))

X_train.drop(mostly_blank_feats, axis=1, inplace=True)
X_test.drop(mostly_blank_feats, axis=1, inplace=True)

58


In [8]:
# # categorical features
# categorical_features = ['HUFINAL','GEREG', 'HUBUS', 'PTDTRACE', 'PENATVTY', 'PUABSOT', 'PEIO1COW', 
#                      'HUFINAL', 'GESTCEN', 'GESTFIPS', #'PEIO1ICD', 'PEIO2ICD', 
#                      'PRCITSHP', 'PUDIS', 'PRABSREA', 'PRWKSTAT', 'HUPRSCNT', 
#                      'PERRP', 'GTCBSAST', 'PRMJOCGR', 'HRHTYPE', ]



# # Now dummy these features
# train_dummy_df = pd.DataFrame()
# test_dummy_df = pd.DataFrame()

# for feature in categorical_features:
#     train_dummy_vars = pd.get_dummies(X_train[feature], prefix=feature)
#     train_dummy_df = pd.concat([train_dummy_df, train_dummy_vars], axis=1)
    
#     test_dummy_vars = pd.get_dummies(X_test[feature], prefix=feature)
#     test_dummy_df = pd.concat([test_dummy_df, test_dummy_vars], axis=1)
# # Drop the original categorical variables
# X_train.drop(categorical_features, axis=1, inplace=True)
# X_test.drop(categorical_features, axis=1, inplace=True)

# # Add dummy vars to the data
# X_train = pd.concat([X_train, train_dummy_df], axis=1)
# X_test = pd.concat([X_test, test_dummy_df], axis=1)

# The train and test frames may have different feature sets -> align them.
feats_to_add_to_train = [f for f in X_test.columns if f not in X_train.columns]
feats_to_add_to_test = [f for f in X_train.columns if f not in X_test.columns]

# Missing features are filled with a constant 0 column.
for feat in feats_to_add_to_train:
    X_train[feat] = 0
for feat in feats_to_add_to_test:
    X_test[feat] = 0

# New columns are appended at the end of each frame, so the two frames can end
# up with the same columns in a different order.  Reorder test to match train
# so that column i means the same feature in both `.values` arrays.
X_test = X_test[X_train.columns]

In [9]:
# Convert the prepared frames to numpy arrays and show their shapes.
X, Y, X_t = X_train.values, y_train.values, X_test.values
print(*(arr.shape for arr in (X, Y, X_t)), sep="\n")

(64667, 309)
(64667,)
(16000, 309)


In [15]:
# Hold out 30% of the rows for evaluation.  Note that this rebinds X_train,
# X_test and y_train to the arrays returned by train_test_split.
test_size = 0.3
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=10
)

clf_xgb = XGBClassifier(
    max_depth=9,
    gamma=0.2,
    subsample=0.9,
    min_child_weight=3,
    n_estimators=50,
    objective='binary:logistic',
)
clf_xgb.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score for ROC AUC.
y_train_pred = clf_xgb.predict_proba(X_train)[:, 1]
y_test_pred = clf_xgb.predict_proba(X_test)[:, 1]

train_auc = metrics.roc_auc_score(y_train, y_train_pred)
test_auc = metrics.roc_auc_score(y_test, y_test_pred)
print("train auc: ", train_auc)
print("test auc: ", test_auc)

train auc:  0.9064725967996514
test auc:  0.7916698711831637


In [16]:
# Score the full test matrix X_t with the fitted XGBoost model (column 1 of
# predict_proba) and write the scores to a CSV file.
y_test_pred_prob = clf_xgb.predict_proba(X_t)[:, 1]
pd.DataFrame(y_test_pred_prob).to_csv("xgb_test2008.csv")
print("done!")

done!


In [None]:
# Grid search over tree depth, gamma and min_child_weight for XGBoost,
# scored by ROC AUC with 5-fold cross-validation.
param_test = {'max_depth':range(5,13,2), 'gamma':[0, 0.1, 0.2, 0.3], 'min_child_weight':range(1,6,2)}
estimator = XGBClassifier(subsample=0.8, objective='binary:logistic', n_estimators=50)
gsearch = model_selection.GridSearchCV(estimator, param_grid = param_test, scoring='roc_auc', cv=5)
gsearch.fit(X, Y)
# Report from the fitted `gsearch` object (the previous `gsearch1` was never defined).
print("best_params: ", gsearch.best_params_)
print("best_score: ", gsearch.best_score_)

In [18]:
# NOTE(review): `y_tesdt_pred` is never assigned in the visible notebook, so this
# line would raise NameError on a fresh run — probably a typo for `y_test_pred`
# or `y_test_pred_prob`; confirm which predictions belong in this file.
pd.DataFrame(y_tesdt_pred).to_csv("gbdt_test2008.csv")

In [15]:
# NOTE(review): this loads the *training* CSV into X_test (so the frame still
# includes the 'target' column read elsewhere from this file) — presumably
# "test_2008.csv" was intended; confirm.
X_test = read_file("train_2008.csv")
X_test.drop(same_resp_feature, axis=1, inplace=True)
# Replace every negative value with -1, column by column.
for feat in X_test.columns:
    X_test[feat] = X_test[feat].apply(lambda x: -1 if x < 0 else x)
# NOTE(review): `categorical_feats` is not defined anywhere in the visible
# notebook (only a commented-out `categorical_features` list exists), so this
# loop would raise NameError on a fresh run — confirm the intended list.
for feat in categorical_feats:
    X_test = pd.get_dummies(X_test, columns=[feat])
print(X.shape)
print(X_test.values.shape)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [6]:
# numerical features

In [7]:
# one-hot encoding

In [8]:
# raw data for random forest

In [9]:
# Build the raw and constant-feature-dropped arrays for train and test.
train_path, test_path = "train_2008.csv", "test_2008.csv"
(
    X_train_raw_arr,
    y_train_arr,
    X_train_drop_arr,
    to_drop_features,
) = train_data_preprocessing(train_path)
X_test_raw_arr, X_test_drop_arr = test_data_preprocessing(test_path, to_drop_features)

# One shape per line, in the order: raw train, labels, reduced train,
# raw test, reduced test.
print(
    X_train_raw_arr.shape,
    y_train_arr.shape,
    X_train_drop_arr.shape,
    X_test_raw_arr.shape,
    X_test_drop_arr.shape,
    sep="\n",
)

15
(64667, 382)
(64667,)
(64667, 367)
(16000, 382)
(16000, 367)


In [13]:
def run_test(X_train_arr, y_train_arr, num_folds, C, X_test_arr, clf):
    """Fit ``clf`` on the training data and score the test data.

    Parameters
    ----------
    X_train_arr, y_train_arr : training features and labels.
    num_folds, C : accepted but not used by this function.
    X_test_arr : features to score after fitting.
    clf : estimator exposing ``fit`` and ``predict_proba``.

    Returns
    -------
    tuple
        ``(clf, train_auc, test_scores)`` — the fitted estimator, the ROC AUC
        on the training data (also printed), and column 1 of
        ``predict_proba`` for ``X_test_arr``.
    """
    def positive_scores(features):
        # Column 1 of predict_proba is used as the score.
        return clf.predict_proba(features)[:, 1]

    clf.fit(X_train_arr, y_train_arr)
    train_auc = metrics.roc_auc_score(y_train_arr, positive_scores(X_train_arr))
    print("train auc: ", train_auc)
    return clf, train_auc, positive_scores(X_test_arr)

In [11]:
# calculate auc score for cross_validation
def cross_validation(num_folds, X_train_arr, clf, y=None):
    """Return the per-fold ROC AUC of ``clf`` under K-fold cross-validation.

    Parameters
    ----------
    num_folds : int
        Number of folds passed to ``KFold``.
    X_train_arr : array of features, indexed by row.
    clf : estimator exposing ``fit`` and ``predict_proba``.
    y : array of labels aligned with ``X_train_arr``, optional.
        When omitted, the notebook-level ``y_train_arr`` is used, which is
        what this function always did before.  Pass it explicitly so the
        labels cannot silently come from a stale global.

    Returns
    -------
    list of float
        One ROC AUC value per fold, in fold order.
    """
    labels = y_train_arr if y is None else y
    kf = model_selection.KFold(n_splits=num_folds)
    auc = []
    for train_index, test_index in kf.split(X_train_arr):
        print("1---")
        X_train_cv, X_test_cv = X_train_arr[train_index], X_train_arr[test_index]
        y_train_cv, y_test_cv = labels[train_index], labels[test_index]
        clf.fit(X_train_cv, y_train_cv)
        # Column 1 of predict_proba is used as the score.
        y_pred_prob = clf.predict_proba(X_test_cv)[:, 1]
        auc.append(metrics.roc_auc_score(y_test_cv, y_pred_prob))
        print("2---")
    return auc

In [14]:
# logistic regression
def logistic_reg(X_train_arr, num_folds, C):
    """Build an unfitted L1-penalised logistic regression classifier.

    Parameters
    ----------
    X_train_arr, num_folds : accepted but not used by this function.
    C : value forwarded to ``LogisticRegression(C=...)``.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        Configured with ``penalty='l1'`` and ``solver='liblinear'``.
    """
    # Cross-validating here (via cross_validation) is currently disabled;
    # only the configured estimator is returned.
    return linear_model.LogisticRegression(
        penalty='l1',
        C=C,
        solver='liblinear',
    )

In [11]:
# raw data for logistic regression

# Tuning the hyperparameter C has little influence on the AUC:
# current C =  0.3 , auc =  [0.7663607035376859, 0.7721732996821118, 0.7687268216387099]
# current C =  0.475 , auc =  [0.7662418721269123, 0.7712735791351473, 0.768680422577126]
# current C =  0.6499999999999999 , auc =  [0.7661768150447318, 0.7713118201969408, 0.7683741910137993]
# current C =  0.825 , auc =  [0.7661509304142629, 0.7711936392253098, 0.7682658368117301]
# current C =  1.0 , auc =  [0.766072324965325, 0.7707366180248577, 0.7682738111247072]

# X_train_raw_arr, y_train_raw_arr, X_train_drop_arr = data_preprocessing(train_path, to_drop_features)
# X_test_raw_arr, y_test_raw_arr, X_test_drop_arr = data_preprocessing(test_path, to_drop_features)
# num_folds = 3
# C_para = np.linspace(0.3, 1, 5)
# for C in C_para:
#     auc = logistic_reg(X_train_raw_arr, num_folds, C)
#     print("current C = ", C, ", auc = ", auc)

# TODO: how can LR models be ensembled? Would ensembling here improve the performance?

In [16]:
# Run logistic regression on a 70/30 hold-out split of (X, Y).
num_folds, C, test_size = 5, 0.3, 0.3
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=10
)

clf_lr = logistic_reg(X_train, num_folds, C)
clf, train_auc, y_test_pred_prob = run_test(
    X_train, y_train, num_folds, C, X_test, clf_lr
)
test_auc = metrics.roc_auc_score(y_test, y_test_pred_prob)

# run_test has already printed the train AUC once; it is printed again here,
# followed by the hold-out AUC and the fitted coefficients.
print(train_auc)
print(test_auc)
print(clf.coef_)


#pd.DataFrame(y_test_pred_prob).to_csv("lr_test2008.csv")
#print("done!")

train auc:  0.7742189701790714
0.7742189701790714
0.7719477543415707
[[ 1.06529611e-06  1.04506420e-02  1.73326711e-03  8.31401467e-02
   3.53121147e-01  1.16660750e-01  3.59783834e-01 -4.63037852e-02
  -4.40578302e-03 -1.93829637e-02  1.16503131e-08  1.04606198e-02
  -2.29482596e-02  1.17185565e-02 -1.03245149e-01 -2.16063165e-04
  -3.72117647e-02  6.98180804e-06  1.61678073e-01 -2.37551192e-03
  -5.21249193e-02 -2.56715484e-01  1.21445642e-02  3.93426980e-03
   2.09660268e-07 -2.97096563e-04  2.42630526e-03 -1.13974347e-03
  -5.73297622e-02 -2.45993565e-02 -1.87942729e-04 -2.24134320e-02
   7.99248596e-02 -2.72194955e-02  3.96643124e-01 -1.92581188e-01
  -5.26471984e-02 -2.03686157e-01  7.52618135e-02 -9.17889663e-02
   2.66786055e-02 -1.19624563e-01 -8.95974318e-03 -1.24795072e-01
   4.00329707e-02  1.08988456e-01  1.97889454e-01 -4.53304637e-01
   1.56011740e-01 -3.26836248e-03  3.19138353e-04  3.97854855e-04
   3.00764489e-01  2.92101968e-02  3.38797897e-02  7.58800595e-02
   1.37

In [17]:
# len() reports the size of the first axis of the fitted model's coefficient
# array (the recorded output is 1).
print(len(clf.coef_))

1


In [32]:
# gbdt
def gbdt(X_train_arr, num_folds):
    """Build an unfitted gradient-boosting classifier.

    Parameters
    ----------
    X_train_arr, num_folds : accepted but not used by this function.

    Returns
    -------
    sklearn.ensemble.GradientBoostingClassifier
        Configured with 50 estimators, ``min_samples_leaf=20`` and
        ``random_state=10``.
    """
    # Alternative configurations that were tried:
    #   ensemble.GradientBoostingClassifier(random_state=10)
    #   ensemble.GradientBoostingClassifier(learning_rate=0.5, min_samples_split=300, min_samples_leaf=20,max_depth=5,max_features='sqrt', subsample=0.8,random_state=10)
    # Cross-validating here (via cross_validation) is currently disabled.
    return ensemble.GradientBoostingClassifier(
        n_estimators=50,
        min_samples_leaf=20,
        random_state=10,
    )

In [21]:
# run gbdt
X_train_raw_arr, y_train_arr, X_train_drop_arr, to_drop_features = train_data_preprocessing(train_path)
X_test_raw_arr, X_test_drop_arr = test_data_preprocessing(test_path, to_drop_features)
num_folds = 3
clf_gbdt = gbdt(X_train_raw_arr, num_folds)
# run_test returns (clf, train_auc, y_test_pred_prob); unpacking only two
# names raises "ValueError: too many values to unpack".
clf_gbdt, train_auc, y_test_pred_prob = run_test(X_train_raw_arr, y_train_arr, num_folds, C, X_test_raw_arr, clf_gbdt)


# pd.DataFrame(y_test_pred_prob).to_csv("gbdt_raw_test2008.csv")
# print("done!")

15
train auc:  0.8254147729384751
done!


15
train auc:  0.7929522993012791
done!


In [34]:
# Grid search over split/leaf sizes and depth for gradient boosting on the raw
# features, scored by ROC AUC with 5-fold cross-validation.
X_train_raw_arr, y_train_arr, X_train_drop_arr, to_drop_features = train_data_preprocessing(train_path)
X_test_raw_arr, X_test_drop_arr = test_data_preprocessing(test_path, to_drop_features)
param_test = {'min_samples_split':range(200,900,100), 'min_samples_leaf': range(30, 70, 10), 'max_depth': range(5, 10, 1)}
estimator = ensemble.GradientBoostingClassifier(
    learning_rate=0.8,
    #max_features='sqrt',
    random_state=10,
    n_estimators=50
)
gsearch = model_selection.GridSearchCV(estimator, param_grid = param_test, scoring='roc_auc', cv=5)
gsearch.fit(X_train_raw_arr, y_train_arr)
# Report from the fitted `gsearch` object (the previous `gsearch1` was never defined).
print("best_params: ", gsearch.best_params_)
print("best_score: ", gsearch.best_score_)

15


KeyboardInterrupt: 

In [33]:
# Fit gradient boosting on the raw features and write the test scores to CSV.
X_train_raw_arr, y_train_arr, X_train_drop_arr, to_drop_features = train_data_preprocessing(train_path)
X_test_raw_arr, X_test_drop_arr = test_data_preprocessing(test_path, to_drop_features)
num_folds = 3
clf_gbdt = gbdt(X_train_raw_arr, num_folds)
# run_test returns (clf, train_auc, y_test_pred_prob); unpacking only two
# names raises "ValueError: too many values to unpack".
clf_gbdt, train_auc, y_test_pred_prob = run_test(X_train_raw_arr, y_train_arr, num_folds, C, X_test_raw_arr, clf_gbdt)
pd.DataFrame(y_test_pred_prob).to_csv("gbdt_raw_test2008.csv")
print("done!")

15
train auc:  0.7818926933969366
done!
