In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing
from collections import Counter

# Run-level settings: boosting-round budget, RNG seed and number of CV folds.
config = dict(
    round=10000,
    random_seed=1218,
    fold=5,
)

# Parameter dictionary handed to xgboost for both cross-validation and training.
xgb_param = dict(
    # learner / task
    booster='gbtree',
    objective='multi:softmax',
    num_class=4,
    # NOTE(review): xgboost documents early stopping as an argument of
    # xgb.train / xgb.cv, not as a booster parameter -- confirm that it has
    # any effect when supplied through this dict.
    early_stopping_rounds=100,

    # tree shape and regularisation
    max_depth=6,
    eta=0.1,
    gamma=0.1,
    min_child_weight=3,

    # row subsampling ratio per boosting round
    subsample=0.7,

    # reproducibility / parallelism
    seed=config['random_seed'],
    nthread=3,
)


#read subsidy
# Labelled rows come from the training file (ID, MONEY); the test file only
# lists IDs, so its MONEY column is filled with NaN to mark "unknown label".
subsidy_train_path = '../data/train/subsidy_train.txt'
subsidy_test_path = '../data/test/studentID_test.txt'

train_subsidy = pd.read_csv(subsidy_train_path, header = None)
train_subsidy.columns = ['ID', 'MONEY']

test_subsidy = pd.read_csv(subsidy_test_path, header = None)
test_subsidy.columns = ['ID']
test_subsidy['MONEY'] = np.nan

# Stack train on top of test so that features can be built once for both,
# and keep a copy of the combined ID/label table on disk.
subsidy_frames = [train_subsidy, test_subsidy]
train_test = pd.concat(subsidy_frames)
train_test.to_csv("../data/input/train_test.csv", index = False)

#read card
# Card transaction logs for train and test share one 7-column layout.
card_columns = ['ID', 'CARD_CAT', 'CARD_WHERE', 'CARD_HOW', 'CARD_TIME', 'CARD_SPEND', 'CARD_REMAINDER']
train_card = pd.read_csv('../data/train/card_train.txt', header = None)
train_card.columns = card_columns
test_card = pd.read_csv('../data/test/card_test.txt', header = None)
test_card.columns = card_columns
card_train_test = pd.concat([train_card,test_card])


#process card data
# One row per student ID. The first column keeps the name 'CARD_CAT' but holds
# the number of non-null card records of that student (groupby count).
card_by_id = card_train_test.groupby(['ID'])
card = pd.DataFrame(card_by_id['CARD_CAT'].count())

# Per-student summary statistics of the spend and remainder amounts.
# The *_STD columns use the standard deviation; they were previously filled
# with the per-student maximum, which contradicted the column name.
# Students with a single record get NaN here (sample std is undefined).
for amount_column in ('CARD_SPEND', 'CARD_REMAINDER'):
    amounts = card_by_id[amount_column]
    card[amount_column + '_SUM'] = amounts.sum()
    card[amount_column + '_MEAN'] = amounts.mean()
    card[amount_column + '_STD'] = amounts.std()
    card[amount_column + '_MEDIAN'] = amounts.median()

# Persist the per-student table, then read it back so that ID becomes an
# ordinary column that can be used as the merge key.
card.to_csv('../data/input/cardInfo.csv', index = True)
card = pd.read_csv('../data/input/cardInfo.csv')
# Left join: students without any card record keep NaN card features.
train_test = pd.merge(train_test, card, how= 'left', on = 'ID')

#read score
# Score files carry (ID, COLLEGE, RANK) for train and test alike.
score_columns = ['ID', 'COLLEGE', 'RANK']
train_score = pd.read_csv('../data/train/score_train.txt', header = None)
train_score.columns = score_columns
test_score = pd.read_csv('../data/test/score_test.txt', header = None)
test_score.columns = score_columns
train_test_score = pd.concat([train_score, test_score])

# Largest RANK seen in each college; it is used below as the college size
# (COLLEGE_STU_NUM). The table is written to disk and read back so that
# COLLEGE becomes a regular column.
score = train_test_score.groupby(['COLLEGE'])[['RANK']].max()
score.to_csv('../data/input/collegeInfo.csv', index = True)
score = pd.read_csv('../data/input/collegeInfo.csv')
score.columns = ['COLLEGE', 'COLLEGE_STU_NUM']

# SCORE = rank divided by the college's largest rank, i.e. a relative rank.
train_test_score = train_test_score.merge(score, how='left', on='COLLEGE')
train_test_score['SCORE'] = train_test_score['RANK'].div(train_test_score['COLLEGE_STU_NUM'])
train_test = train_test.merge(train_test_score, how = 'left', on = 'ID')


#processing data for training
# Rows with a known MONEY value form the training set, the rest the test set.
# Remaining NaNs (missing features, and the test label itself) become -1.
has_label = train_test['MONEY'].notnull()
train = train_test[has_label].fillna(-1)
test = train_test[~has_label].fillna(-1)

train_id = train['ID']
test_id = test['ID']
train_label = train['MONEY']

# The identifier and the target must not be fed to the model as features.
drop_columns = ['ID', 'MONEY']
train_features = train.drop(drop_columns, axis = 'columns')
test_features = test.drop(drop_columns, axis = 'columns')

#encoding label
# Map the distinct MONEY amounts to consecutive class indices; the fitted
# encoder is kept so predictions can be mapped back to amounts.
le = preprocessing.LabelEncoder()
le.fit(train_label)
train_encode_label = le.transform(train_label)

dtrain = xgb.DMatrix(train_features, label = train_encode_label)
dtest = xgb.DMatrix(test_features)

#for balance
# Number of training students per subsidy amount, plus the overall total.
money_column = train['MONEY']
ssy0 = train.loc[money_column == 0, 'ID'].count()
ssy1000 = train.loc[money_column == 1000, 'ID'].count()
ssy1500 = train.loc[money_column == 1500, 'ID'].count()
ssy2000 = train.loc[money_column == 2000, 'ID'].count()
ssyNum = train['ID'].count()

#cv
# Early stopping is an argument of xgb.cv / xgb.train rather than a booster
# parameter, so it is taken out of a copy of the parameter dict and passed
# explicitly; left inside the dict it does not stop the run.
booster_param = dict(xgb_param)
early_stopping_rounds = booster_param.pop('early_stopping_rounds', None)

print ('run cv: ' + 'round: ' + str(config['round']) + ' folds: ' + str(config['fold']))
res = xgb.cv(booster_param, dtrain,
             num_boost_round = config['round'],
             nfold = config['fold'],
             early_stopping_rounds = early_stopping_rounds,
             verbose_eval = 20)

# With early stopping the returned CV history ends at the best iteration, so
# its length is the number of boosting rounds to use for the final model.
# Without early stopping it simply equals config['round'].
best_round = max(len(res), 1)

#train
# The final model is fit on the full training set for the CV-selected number
# of rounds; the watchlist only reports training error, so it is not used
# for stopping.
watchlist = [ (dtrain,'train')]
xgbmodel = xgb.train(booster_param, dtrain,
                     num_boost_round = best_round,
                     evals = watchlist,
                     verbose_eval = 20)

# multi:softmax yields the predicted class index as a float; cast to int and
# map the indices back to the original MONEY amounts.
pred = xgbmodel.predict(dtest)
intpred = pred.astype(int)
real_pred = le.inverse_transform(intpred)

# Submission table: one row per test student with the predicted amount as int.
result = pd.DataFrame({"studentid": test_id, "subsidy": real_pred},
                      columns = ["studentid","subsidy"])
result["subsidy"] = result["subsidy"].map(int)

# Predicted count per subsidy amount, printed next to a fixed reference figure.
for amount, prefix, reference in ((1000, '1000--', ':741'),
                                  (1500, '1500--', ':465'),
                                  (2000, '2000--', ':354')):
    predicted_count = int((result["subsidy"] == amount).sum())
    print (prefix + str(predicted_count) + reference)

result.to_csv("../data/output/xgb_baseline.csv",index=False)



run cv: round: 10000 folds: 5
[0]	train-merror:0.141824+0.0024545	test-merror:0.14543+0.00930825
[100]	train-merror:0.124483+0.00332831	test-merror:0.144419+0.00875713
[200]	train-merror:0.0659624+0.00185231	test-merror:0.147083+0.00888999
[300]	train-merror:0.0183508+0.000613635	test-merror:0.149196+0.0104521
[400]	train-merror:0.0027102+0.000295977	test-merror:0.149104+0.00970771
[500]	train-merror:0.000138+8.60581e-05	test-merror:0.149564+0.00966879
[600]	train-merror:0+0	test-merror:0.149655+0.0100905
[700]	train-merror:0+0	test-merror:0.151217+0.00970269
[800]	train-merror:0+0	test-merror:0.152136+0.010153
[900]	train-merror:0+0	test-merror:0.152779+0.0101249
[1000]	train-merror:0+0	test-merror:0.152687+0.00980279
