In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np

In [5]:
from sklearn.model_selection import KFold,train_test_split
import logging
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn import metrics
class experiment_protocal():
    def __init__(self,params_dict,model,exp_name,comment):
        self.parmas_dict=params_dict
        self.model=model
        self.exp_name=exp_name
        
        self.logger = logging.getLogger(__name__)
        
        self.logger.setLevel(level = logging.INFO)
        handler = logging.FileHandler('./log/'+self.exp_name+'.txt')
        handler.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
        self.logger.info(comment)
        self.logger.info("Start print log")
    def cv_train_test_split(self,X,y,cv_num,seed=5):
        kf=KFold(n_splits=cv_num,shuffle=True,random_state=seed)
        k=0
        return kf.split(X,y)
    def select_best_model(self,X_train,y_train,X_val,y_val,seed=5,printFlag=False):
        result=[]
        for params in self.parmas_dict:
            if self.model=='RF':
                clf = RandomForestClassifier(n_estimators=params.get('n_estimators',100),random_state=0,class_weight='balanced',max_depth=params.get('max_depth',10),\
                    min_samples_leaf=params.get('min_sample_leaf',1),n_jobs=-1)
                
                
                clf.fit(X_train,y_train)
                train_auc=metrics.roc_auc_score(y_train,clf.predict_proba(X_train)[:,1])
                y_prob=clf.predict_proba(X_val)
                y_prob=y_prob[:,1]
                auc=metrics.roc_auc_score(y_val,y_prob)
                if printFlag:
                    print(params)
                    print('val auc:{},train auc:{}'.format(auc,train_auc))
                result.append(auc)
            elif self.model=='XGBoost':
                
                #print("TRAIN:", train_index, "TEST:", test_index)
                Dtrain=xgboost.DMatrix(X_train,y_train)
                Dval=xgboost.DMatrix(X_val,y_val)
                model=xgboost.train(params,Dtrain,num_boost_round=2000,early_stopping_rounds=100,evals=[ (Dtrain, 'train'),(Dval, 'eval')],verbose_eval=200)
                y_prob=model.predict(Dval)
                auc=metrics.roc_auc_score(y_val,y_prob)
                result.append(auc)
        return self.parmas_dict[np.argmax(result)],max(result)
    def run_model(self,X_train,y_train,X_test,y_test,params):
        if self.model=='RF':
            clf = RandomForestClassifier(n_estimators=params.get('n_estimators',100),random_state=0,class_weight='balanced',max_depth=params.get('max_depth',10),\
                    min_samples_leaf=params.get('min_sample_leaf',1),verbose=200)
            clf.fit(X_train,y_train)
            y_prob=clf.predict_proba(X_test)
            y_prob=y_prob[:,1]
            testauc=(metrics.roc_auc_score(y_test,y_prob))
            y_prob=clf.predict_proba(X_train)
            y_prob=y_prob[:,1]
            trainauc=(metrics.roc_auc_score(y_train,y_prob))
            return trainauc,testauc
        elif self.model=='XGBoost':
            Dtrain=xgboost.DMatrix(X_train,y_train)
            Dval=xgboost.DMatrix(X_test,y_test)
            model=xgboost.train(params,Dtrain,num_boost_round=2000,early_stopping_rounds=100,evals=[ (Dtrain, 'train'),(Dval, 'eval')],verbose_eval=200)
            y_prob=model.predict(Dval)
            testauc=metrics.roc_auc_score(y_test,y_prob)
            y_prob=model.predict(Dtrain)
            trainauc=metrics.roc_auc_score(y_train,y_prob)
            return trainauc,testauc
    def run(self,X,y,cv_num):
        split=self.cv_train_test_split(X,y,cv_num)
        test_re=[]
        val_re=[]
        k=0
        for train_id,test_id in split:
            print('train test split {}'.format(k))
            self.logger.info('train test split {}'.format(k))
            train_X,test_X,train_y,test_y=X[train_id],X[test_id],y[train_id],y[test_id]
            best_params,best_auc=self.select_best_model(train_X,train_y,cv_num=4)
            print('best params:{},bets auc on val set:{}'.format(best_params,best_auc))
            self.logger.info('best params:{},bets auc on val set:{}'.format(best_params,best_auc))
            trainauc,testauc=self.run_model(train_X,train_y,test_X,test_y,best_params)
            print('test auc:{}'.format(testauc))
            self.logger.info('test auc:{}'.format(testauc))
            test_re.append(testauc)
            val_re.append(best_auc)
            k+=1
        self.logger.info('validation set result:{}'.format(val_re))
        self.logger.info('test set result:{}'.format(test_re))
        logging.shutdown(self.logger)
        return test_re,val_re
    def simple_run(self,X,y,test_size=0.2,val_zise=0.2,random_state=5):
        """
        return best params, bset validation auc, best test auc
        """
        self.logger.info('simple run with fixed train test split, test size:{}'.format(test_size))
        train_X,test_X,train_y,test_y=train_test_split(X,y,stratify=y,test_size=test_size,random_state=random_state)
        train_X,val_X,train_y,val_y=train_test_split(train_X,train_y,stratify=train_y,random_state=random_state,test_size=val_zise/(1-test_size))
        best_params,best_auc=self.select_best_model(train_X,train_y,val_X,val_y,printFlag=True)
        print('best params:{},bets auc on val set:{}'.format(best_params,best_auc))
        self.logger.info('best params:{},bets auc on val set:{}'.format(best_params,best_auc))
        trainauc,testauc=self.run_model(train_X,train_y,test_X,test_y,best_params)
        print('test auc:{}'.format(testauc))
        self.logger.info('test auc:{}'.format(testauc))
        #logging.shutdown(self.logger)
        return best_params,best_auc,testauc

In [3]:

#data=pd.read_csv('./data_preprocessing/RT_SC_B_features.csv')
financial_featrues=data
X=financial_featrues.drop(['year','Stkcd','IndustryCode','label','EquityNatureID'],axis=1)
for column in list(X.columns[X.isna().sum() > 0]):
    mean_val = X[column].mean()
    X[column].fillna(mean_val, inplace=True)

X=X.values
from sklearn.preprocessing import minmax_scale
X=minmax_scale(X)
y=financial_featrues['label'].values
###baseline


In [None]:
import itertools
params_dict=[]
ne=list(range(5000,5010,250))
max_depth=[None]
min_leaf=[1]
iters=itertools.product(ne,max_depth,min_leaf)
for comb in iters:
    params_dict.append({'n_estimators':comb[0],'max_depth':comb[1],'min_sample_leaf':comb[2]})
print('total len of params comb:{}'.format(len(params_dict)))
exp=experiment_protocal(params_dict=params_dict,model='RF',exp_name='baselian_RF_KG',comment='rpt RF baseline')
re2=exp.simple_run(X,y)  

RF:
best params:{'n_estimators': 2500, 'max_depth': None, 'min_sample_leaf': 1},bets auc on val set:0.7293685650514796
test auc:0.727

NN:
best params[0.001, [1.0, 4.0], 0.01, 0.2],best epoch:1343,best val auc:0.6970900725703173,test auc: 0.694693088494896

XGB:
best params:{'max_depth': 10, 'lambda': 10},bets auc on val set:0.7142414860681114,test auc:0.6998954255890706

KG+RF：
best params:{'n_estimators': 3000, 'max_depth': None, 'min_sample_leaf': 1},bets auc on val set:0.7148270813833488
test auc:0.7131447156343542

KG+XGB
{'max_depth': 10, 'lambda': 10, 'eval_metric': 'auc', 'eta': 0.03, 'gamma': 0.1}
val auc:0.721 ;test auc:0.717

In [None]:
import itertools
params_dict=[]
max_depth=[10,20,6]
gamma=[1,0.1]
lambda1=[10]
eta=[0.01,0.03,0.3,0.1]
iters=itertools.product(max_depth,lambda1,gamma,eta)
for comb in iters:
    params_dict.append({'max_depth':comb[0],'lambda':comb[1],'eval_metric': 'auc','eta':comb[3],'gamma':comb[2]})
result=[]
train_X,test_X,train_y,test_y=train_test_split(X,y,stratify=y,test_size=0.2,random_state=5)
train_X,val_X,train_y,val_y=train_test_split(train_X,train_y,stratify=train_y,random_state=5,test_size=0.25)
Dtrain=xgboost.DMatrix(train_X,train_y)
Dval=xgboost.DMatrix(val_X,val_y)
Dtest=xgboost.DMatrix(test_X,test_y)
for params in params_dict:
    print(params)
    model=xgboost.train(params,Dtrain,num_boost_round=2000,early_stopping_rounds=100,evals=[ (Dtrain, 'train'),(Dval, 'eval')],verbose_eval=200)
    y_prob=model.predict(Dval)
    auc=metrics.roc_auc_score(val_y,y_prob)
    testauc=metrics.roc_auc_score(test_y,model.predict(Dtest))
    result.append([auc,testauc])
    print('val auc',auc)

In [1]:
import pandas as pd
import numpy as np
import csv
node_embedding=np.load('./data_preprocessing/dglke_result/ComplEx_KG4_1/KG4_ComplEx_entity.npy')
node_dict=pd.read_csv('./data_preprocessing/dglke_dataset/KG4/entities.tsv',header=None,sep='\t',quoting=csv.QUOTE_NONE,index_col=1).to_dict()[0]
data=pd.read_csv('./data_preprocessing/RT_SC_B_features.csv')
from tqdm import tqdm
embed_dim=node_embedding.shape[1]
new_col=list(range(0,embed_dim))
for i in tqdm(data.index):
    node=str(data.loc[i,'year'])+'.comp.'+ str(data.loc[i,'Stkcd'])
    data.loc[i,new_col]=node_embedding[node_dict[node],:]

100%|██████████| 9366/9366 [05:07<00:00, 30.44it/s]


In [40]:
from torch import nn 
from torch import tensor
import torch
import torch.nn.functional as F
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
class nn_model(nn.Module):
    def __init__(self, input_dim=343,embed_dim=1024):
        super(nn_model, self).__init__()
        self.mlp1=nn.Linear(input_dim,embed_dim)
        self.mlp2=nn.Linear(embed_dim,embed_dim)
        self.output=nn.Linear(embed_dim,2)
        
        self.layer_norm=nn.LayerNorm(normalized_shape= embed_dim)
    def forward(self, feature,drop_out=0.0):
        self.dropout = nn.Dropout(p=drop_out)
        h1 = torch.sigmoid(self.dropout(self.mlp1(feature)))
        h2 = torch.sigmoid(self.dropout(self.mlp2(h1)))
        logits=self.output(self.layer_norm(h2))
        return F.softmax(logits,dim=1)
    
class myNN:

    def __init__(self,input_size,hidden_sizes,output_size=2,weight=[1.0,1.0],dropout=0.2,seed=555):
        # Hyperparameters for our network
        flag = torch.cuda.is_available()
        #print(flag)
        ngpu= 1
        # Decide which device we want to run on
        self.device = torch.device("cuda:0" if (torch.cuda.is_available() and ngpu > 0) else "cpu")
        # Build a feed-forward network
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        self.input_size=input_size
        self.drop=dropout
        self.loss_func=nn.CrossEntropyLoss(tensor(weight,dtype=torch.float32).to(self.device))
    def train(self,X,y,epoch=1000,lr=0.1,weight_decay=0.0,verbose=False,early_stopping=True):
        self.model=nn_model(input_dim=self.input_size,embed_dim=1000).to(self.device)
        optimizer = torch.optim.Adam(self.model.parameters(),lr = lr,weight_decay=weight_decay)
        if early_stopping:
            X_train,X_val,y_train,y_val=train_test_split(X,y,stratify=y,test_size=0.3)
            X_train,X_val,y_train,y_val=torch.tensor(X_train,dtype=torch.float32).to(self.device),torch.tensor(X_val,dtype=torch.float32).to(self.device),\
                torch.tensor(y_train,dtype=torch.long).to(self.device),torch.tensor(y_val,dtype=torch.long).to(self.device)
        else:
            X_train,y_train=X,y
            X_train,y_train=torch.tensor(X_train,dtype=torch.float32).to(self.device),torch.tensor(y_train,dtype=torch.long).to(self.device)
        if verbose==False:
            printFlag=False 
        else:
            printFlag=True
            
        notIncEpoch=0
        maxAuc=0
        train_auc_list=[]
        val_auc_list=[]
        for e in range(epoch):
            self.model.train()
            precition=self.model.forward(X_train,drop_out=self.drop)
            precition=precition.to(torch.float32)
            loss=self.loss_func(precition,y_train)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if early_stopping:
                val_auc,_=self._test(X_val,y_val)
                val_auc_list.append(val_auc)
                if val_auc>=maxAuc:
                    maxAuc=val_auc
                    notIncEpoch=0
                else:
                    notIncEpoch+=1
                if notIncEpoch>400:
                    break
            train_auc,_=self._test(X_train,y_train)
            train_auc_list.append(train_auc)
            if printFlag and e%verbose==0:
                if early_stopping:
                    print('========epoch{}========'.format(e))
                    print('val auc:{}'.format(val_auc.item()))
                print('training loss:{},train auc:{}'.format(loss.item(),train_auc.item()))
        if early_stopping:
            return train_auc_list,val_auc_list,np.argmax(val_auc_list)+1,max(val_auc_list)
        return train_auc_list
    def predict(self,X):
        X=torch.tensor(X,dtype=torch.float32)
        logits=self.model.forward(X)
        return logits
    def test(self,X,y):
        X=torch.tensor(X,dtype=torch.float32).to(self.device)
        y=torch.tensor(y,dtype=torch.long).to(self.device)
        self.model.eval()
        with torch.no_grad():
            logits = self.model.forward(X)
            predict_y = logits[:,1]
            auc = roc_auc_score(y.cpu(),predict_y.cpu())
        return auc, predict_y.cpu().numpy()

    def _test(self,X,y):
        self.model.eval()
        with torch.no_grad():
            logits = self.model.forward(X)
            predict_y = logits[:,1]
            auc = roc_auc_score(y.cpu().numpy(),predict_y.cpu().numpy())
        return auc, predict_y.cpu().numpy()
import pandas as pd
#data=pd.read_csv('./new/RT_SC_B_features.csv')
financial_featrues=data
X=financial_featrues.drop(['year','Stkcd','IndustryCode','label','EquityNatureID'],axis=1)
for column in list(X.columns[X.isna().sum() > 0]):
    mean_val = X[column].mean()
    X[column].fillna(mean_val, inplace=True)

X=X.values
from sklearn.preprocessing import minmax_scale
X=minmax_scale(X)
y=financial_featrues['label'].values

In [42]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,stratify=y,random_state=5)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.25,stratify=y_train,random_state=5)
parmas=[]
leanring_rate=[0.01,0.001]
l2=[0.001,0.01,0]
weight=[[1.0,2.0],[1.0,6.0]]
drop=[0.2,0.4,0.6]
for l in l2:
    for w in weight:
        for lr in leanring_rate:
            for d in drop:
                parmas.append([lr,w,l,d])
re=[]
for p in tqdm(parmas):
    model=myNN(input_size=493,hidden_sizes=[1000,1000],weight=p[1],dropout=p[3])
    train_auc_list,val_auc_list,best_e,max_val_auc=model.train(X_train,y_train,epoch=2000,lr=p[0],weight_decay=p[2],verbose=False,early_stopping=True)
    re.append([best_e,max_val_auc])
best_id=np.argmax([i[1] for i in re])
print('best params{},best epoch:{},best val auc:{}'.format(parmas[best_id],re[best_id][0],re[best_id][1]))
print('testing.....')


100%|██████████| 36/36 [19:08<00:00, 31.91s/it]

best params[0.001, [1.0, 6.0], 0, 0.6],best epoch:665,best val auc:0.6840019209789878
testing.....





In [49]:
model=myNN(input_size=493,hidden_sizes=[1500,1500],weight=[1.0,6.0],dropout=0.6)
train_auc_list,val_auc_list,best_e,max_val_auc=model.train(X_train,y_train,epoch=2000,lr=0.0001,weight_decay=0,verbose=100,early_stopping=True)
print('best epoch:{},best val auc:{}'.format(best_e,max_val_auc))
# testauc,_=model.test(X_test,y_test)
# print('test auc:',testauc)

val auc:0.52460957880757
training loss:0.710503101348877,train auc:0.4925925925925926
val auc:0.6312580040791158
training loss:0.6923050284385681,train auc:0.6324169586621693
val auc:0.6350050988948441
training loss:0.6715267300605774,train auc:0.6420211546648329
val auc:0.6473699188919982
training loss:0.6496045589447021,train auc:0.6780910589339708
val auc:0.6599392875776693
training loss:0.6365774869918823,train auc:0.7156403161192433
val auc:0.6694937864630272
training loss:0.6215733289718628,train auc:0.7404724869284256
val auc:0.6767448892472608
training loss:0.6101200580596924,train auc:0.7531824780866926
val auc:0.6802340748470332
training loss:0.6070148348808289,train auc:0.7597668402266105
val auc:0.6825078262106911
training loss:0.5970363020896912,train auc:0.763046468218882
val auc:0.6833111985960253
training loss:0.6035490036010742,train auc:0.7659478665225791
val auc:0.6837039913674525
training loss:0.5882045030593872,train auc:0.7682164259750467
val auc:0.683735118341791

In [33]:
re[best_id]

[612, 0.69671061993075]