In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
violation=pd.read_excel('./STK_Violation_Main.xlsx')
financial_featrues=pd.read_excel('./financial_features.xlsx')

In [None]:
financial_featrues=financial_featrues.loc[financial_featrues['year']>=2003]
violation=violation.loc[violation['IsViolated']=='Y']

In [None]:
for i in tqdm(financial_featrues.index):
    code=financial_featrues.loc[i,'Stkcd']
    year=financial_featrues.loc[i,'year']
    tmp=violation.loc[violation['Stkcd']==code]
    tmp['year']=tmp['year'].apply(lambda x:str(x))
    if len(tmp)==0:
        financial_featrues.loc[i,'label']=0
    else:
        financial_featrues.loc[i,'label']=0
        for j in tmp.index:
            if str(year) in tmp.loc[j,'year'].split(','):
                financial_featrues.loc[i,'label']=1


In [None]:
financial_featrues['IndustryCode'].isna().sum()

In [None]:
financial_featrues=financial_featrues.loc[financial_featrues['year']<=2020]

In [None]:
financial_featrues['IndustryCode'].isna().sum()

In [None]:
financial_featrues.to_csv('financial_features.csv',index=False)

In [None]:

a=financial_featrues.isna().sum(axis=1)
delete_row=a.index[a.apply(lambda x:x>0.2*349)]


In [None]:
financial_featrues.drop(delete_row,inplace=True)

In [None]:

a=financial_featrues.isna().sum(axis=0)
delete_col=a.index[a.apply(lambda x:x>0.2*26710)]
financial_featrues.drop(delete_col,inplace=True,axis=1)
financial_featrues.to_csv('financial_features.csv',index=False)

In [12]:
from sklearn.model_selection import KFold,train_test_split
import logging
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn import metrics
class experiment_protocal():
    def __init__(self,params_dict,model,exp_name,comment):
        self.parmas_dict=params_dict
        self.model=model
        self.exp_name=exp_name
        
        self.logger = logging.getLogger(__name__)
        
        self.logger.setLevel(level = logging.INFO)
        handler = logging.FileHandler('./log/'+self.exp_name+'.txt')
        handler.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
        self.logger.info(comment)
        self.logger.info("Start print log")
    def cv_train_test_split(self,X,y,cv_num,seed=5):
        kf=KFold(n_splits=cv_num,shuffle=True,random_state=seed)
        k=0
        return kf.split(X,y)
    def select_best_model(self,X,y,cv_num=5,seed=5,printFlag=False):
        result=[]
        k=0
        for params in self.parmas_dict:
            if self.model=='RF':
                clf = RandomForestClassifier(n_estimators=params.get('n_estimators',100),random_state=0,class_weight='balanced',max_depth=params.get('max_depth',10),\
                    min_samples_leaf=params.get('min_sample_leaf',1),n_jobs=-1)
                
                X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)
                clf.fit(X_train,y_train)
                train_auc=metrics.roc_auc_score(y_train,clf.predict_proba(X_train)[:,1])
                y_prob=clf.predict_proba(X_test)
                y_prob=y_prob[:,1]
                auc=metrics.roc_auc_score(y_test,y_prob)
                if printFlag:
                    print(params)
                    print('val auc:{},train auc:{}'.format(auc,train_auc))
                result.append(auc)
            elif self.model=='XGBoost':
                kf=KFold(n_splits=cv_num,random_state=seed,shuffle=True)
                auc=[]
                for train_index, test_index in kf.split(X,y):
                    #print("TRAIN:", train_index, "TEST:", test_index)
                    X_train, X_test = X[train_index], X[test_index]
                    y_train, y_test = y[train_index], y[test_index]
                    Dtrain=xgboost.DMatrix(X_train,y_train)
                    Dval=xgboost.DMatrix(X_test,y_test)
                    model=xgboost.train(params,Dtrain,num_boost_round=2000,early_stopping_rounds=100,evals=[ (Dtrain, 'train'),(Dval, 'eval')],verbose_eval=False)
                    y_prob=model.predict(Dval)
                    auc.append(metrics.roc_auc_score(y_test,y_prob))
                result.append(np.mean(auc))
        return self.parmas_dict[np.argmax(result)],max(result)
    def run_model(self,X_train,y_train,X_test,y_test,params):
        if self.model=='RF':
            clf = RandomForestClassifier(n_estimators=params.get('n_estimators',100),random_state=0,class_weight='balanced',max_depth=params.get('max_depth',10),\
                    min_samples_leaf=params.get('min_sample_leaf',1))
            clf.fit(X_train,y_train)
            y_prob=clf.predict_proba(X_test)
            y_prob=y_prob[:,1]
            testauc=(metrics.roc_auc_score(y_test,y_prob))
            y_prob=clf.predict_proba(X_train)
            y_prob=y_prob[:,1]
            trainauc=(metrics.roc_auc_score(y_train,y_prob))
            return trainauc,testauc
        elif self.model=='XGBoost':
            Dtrain=xgboost.DMatrix(X_train,y_train)
            Dval=xgboost.DMatrix(X_test,y_test)
            model=xgboost.train(params,Dtrain,num_boost_round=2000,early_stopping_rounds=100,evals=[ (Dtrain, 'train'),(Dval, 'eval')],verbose_eval=False)
            y_prob=model.predict(Dval)
            testauc=metrics.roc_auc_score(y_test,y_prob)
            y_prob=model.predict(Dtrain)
            trainauc=metrics.roc_auc_score(y_train,y_prob)
            return trainauc,testauc
    def run(self,X,y,cv_num):
        split=self.cv_train_test_split(X,y,cv_num)
        test_re=[]
        val_re=[]
        k=0
        for train_id,test_id in split:
            print('train test split {}'.format(k))
            self.logger.info('train test split {}'.format(k))
            train_X,test_X,train_y,test_y=X[train_id],X[test_id],y[train_id],y[test_id]
            best_params,best_auc=self.select_best_model(train_X,train_y,cv_num=4)
            print('best params:{},bets auc on val set:{}'.format(best_params,best_auc))
            self.logger.info('best params:{},bets auc on val set:{}'.format(best_params,best_auc))
            trainauc,testauc=self.run_model(train_X,train_y,test_X,test_y,best_params)
            print('test auc:{}'.format(testauc))
            self.logger.info('test auc:{}'.format(testauc))
            test_re.append(testauc)
            val_re.append(best_auc)
            k+=1
        self.logger.info('validation set result:{}'.format(val_re))
        self.logger.info('test set result:{}'.format(test_re))
        logging.shutdown(self.logger)
        return test_re,val_re
    def simple_run(self,X,y,test_size=0.2,random_state=5):
        """
        return best params, bset validation auc, best test auc
        """
        self.logger.info('simple run with fixed train test split, test size:{}'.format(test_size))
        train_X,test_X,train_y,test_y=train_test_split(X,y,stratify=y,test_size=test_size,random_state=random_state)
        best_params,best_auc=self.select_best_model(train_X,train_y,printFlag=True)
        print('best params:{},bets auc on val set:{}'.format(best_params,best_auc))
        self.logger.info('best params:{},bets auc on val set:{}'.format(best_params,best_auc))
        trainauc,testauc=self.run_model(train_X,train_y,test_X,test_y,best_params)
        print('test auc:{}'.format(testauc))
        self.logger.info('test auc:{}'.format(testauc))
        logging.shutdown(self.logger)
        return best_params,best_auc,testauc

In [None]:
financial_featrues=pd.read_csv('./financial_features.csv')
X=financial_featrues.drop(['year','Stkcd','IndustryCode','label','EquityNatureID'],axis=1)
for column in list(X.columns[X.isna().sum() > 0]):
    mean_val = X[column].mean()
    X[column].fillna(mean_val, inplace=True)

X=X.values
y=financial_featrues['label'].values

In [None]:
import itertools
params_dict=[]
ne=list(range(2500,5010,250))
max_depth=[20,None]
min_leaf=[1,5,10]
iters=itertools.product(ne,max_depth,min_leaf)
for comb in iters:
    params_dict.append({'n_estimators':comb[0],'max_depth':comb[1],'min_sample_leaf':comb[2]})
print('total len of params comb:{}'.format(len(params_dict)))
exp=experiment_protocal(params_dict=params_dict,model='RF',exp_name='baselian_RF',comment='rpt RF baseline')
re2=exp.run(X,y,5)

baseline RF:

test auc:
([0.7813094957031855,
  0.7592270507665895,
  0.7555582320020272,
  0.7699949780189731,
  0.7611411958097487]
  mean:0.765

  val auc
 [0.7658625063984155,
  0.7737437110429671,
  0.7686943799765285,
  0.7656989257542322,
  0.7639862635362276])

In [None]:
np.mean(re2[0])

In [3]:
rpt=pd.read_csv('listed_RPT.csv')
financial_featrues=pd.read_csv('financial_features.csv')


In [5]:
missing=0
select_index=[]
for i in tqdm(rpt.index):
    year=rpt.loc[i,'tradeYear']
    cp1=rpt.loc[i,'cp1']
    cp2=rpt.loc[i,'cp2']
    ind1=financial_featrues.query('Stkcd == @cp1 & year == @year')
    ind2=financial_featrues.query('Stkcd == @cp2 & year == @year')
    if len(ind1)==0 or len(ind2)==0:
        missing+=1
        continue
    select_index+=[ind1.index.values[0],ind2.index.values[0]]

100%|██████████| 28398/28398 [04:48<00:00, 98.60it/s] 


In [7]:
rpt_financial_varibles=financial_featrues.loc[list(set(select_index))]
rpt_financial_varibles.to_csv('rpt_fv.csv',index=False)

In [4]:
rpt_financial_varibles=pd.read_csv('rpt_fv.csv')

In [15]:
### baseline for rpt dataset
data=pd.read_csv('./RT_SC_B_features.csv')
financial_featrues=data
X=financial_featrues.drop(['year','Stkcd','IndustryCode','label','EquityNatureID'],axis=1)
for column in list(X.columns[X.isna().sum() > 0]):
    mean_val = X[column].mean()
    X[column].fillna(mean_val, inplace=True)
X=X.values
y=financial_featrues['label'].values

In [None]:
import itertools
params_dict=[]
ne=list(range(2500,5010,250))
max_depth=[20,None]
min_leaf=[1,5,10]
iters=itertools.product(ne,max_depth,min_leaf)
for comb in iters:
    params_dict.append({'n_estimators':comb[0],'max_depth':comb[1],'min_sample_leaf':comb[2]})
print('total len of params comb:{}'.format(len(params_dict)))
exp=experiment_protocal(params_dict=params_dict,model='RF',exp_name='rpt_baseline',comment='baseline with rpt related dataset')
re=exp.simple_run(X,y)

In [5]:
triples={}
###增加 关联交易（RPT） 关系
for i in rpt.index:
    
    year=str(rpt.loc[i,'tradeYear'])
    t=tuple([year+'.comp.'+str(rpt.loc[i,'cp1']),'relatedTransaction',year+'.comp.'+str(rpt.loc[i,'cp2'])])
    triples[t]=1
from tqdm import tqdm
tmpT=np.array([list(i) for i in list(triples.keys())])
X=[list(i) for i in list(triples.keys())]
###增加 the same company 关系
for i in tqdm(range(len(tmpT))):
    c1=tmpT[i][0]
    c1_n_set=[]
    for j in range(len(tmpT)):
        
        c2=tmpT[j][0]
        c3=tmpT[j][2]
        if c2!=c1 and c1[4:]==c2[4:]:
            t=int(c2[0:4])
            if t not in c1_n_set and t<int(c1[0:4]):
                
                c1_n_set.append(t)
        if c3!=c1 and c1[4:]==c3[4:]:
            t=int(c3[0:4])
            if t not in c1_n_set and t<int(c1[0:4]):
                c1_n_set.append(t)
    if len(c1_n_set)!=0:
        t=[c1,'year.previous',str(max(c1_n_set))+c1[4:]]
        if t not in X:
            X.append(t)


100%|██████████| 12048/12048 [03:40<00:00, 54.54it/s]


In [30]:
pd.DataFrame(X).to_csv('KG3.csv',index=False)

In [44]:
###增加分类类别关系
industry_triples=[]
for i in tqdm(rpt_financial_varibles.index):
    year=rpt_financial_varibles.loc[i,'year']
    code=rpt_financial_varibles.loc[i,'Stkcd']
    ind='ind.'+str(rpt_financial_varibles.loc[i,'IndustryCode'])
    industry_triples.append([str(year)+'.comp.'+str(code),'industry',ind])
X+=industry_triples
# categories=['Big4','Outside','OneControlMany','MngmFinancialBack','MngmOverseaBack',
# 'IsCocurP','ConcurrentPosition','ISHoldOtherFinaShares','ISHoldOtherFinaShares','ISHoldBankShares','ContrshrNature','PropertyRightsNature']
# category_triples=[]
# for i in tqdm(rpt_financial_varibles.index):
#     year=rpt_financial_varibles.loc[i,'year']
#     code=rpt_financial_varibles.loc[i,'Stkcd']
#     for c in categories:
#         c_v=rpt_financial_varibles.loc[i,c]
#         category_triples.append([str(year)+'.comp.'+str(code),'cat.'+c,c+'.'+str(c_v)])
# X+=category_triples

100%|██████████| 9366/9366 [00:00<00:00, 40300.68it/s]


In [29]:
pd.DataFrame(X).to_csv('KG0.csv',index=False)

In [10]:
from ampligraph.latent_features import ComplEx
rpt_financial_varibles.drop(['IndustryCode','EquityNatureID'],axis=1,inplace=True)
for column in list(rpt_financial_varibles.columns[rpt_financial_varibles.isna().sum() > 0]):
    mean_val = rpt_financial_varibles[column].mean()
    rpt_financial_varibles[column].fillna(mean_val, inplace=True)
print('start KGE')
model =ComplEx(seed=555, epochs=1000, k=15, optimizer_params={'lr':0.01},batches_count=5)
model.fit(np.array(X))
print('finished KGE')
def get_embedding(model,cwwb,with_row_feature=True):
    if with_row_feature:
        newCWWB=cwwb.copy()
    else:
        newCWWB=pd.DataFrame()
    for i in cwwb.index:
        try:
            embedding=model.get_embeddings(str(cwwb.loc[i,'year'])+'company'+str(cwwb.loc[i,'Stkcd']))
            j=0
            for e in embedding:
                newCWWB.loc[i,j]=e
                j+=1
        except:
            print('no embedding for:',cwwb.loc[i,'Stkcd'])
    
    for j in range(0,len(embedding)):
        newCWWB[j]=newCWWB[j].fillna(np.mean(newCWWB[j]))
    newCWWB['label']=cwwb['label']
    if with_row_feature:
        FraudData=newCWWB.drop(['label','year','Stkcd'],axis=1).values
    else:
        FraudData=newCWWB.drop(['label'],axis=1).values
    FraudLabl=newCWWB.loc[:,'label'].values
    return FraudData,FraudLabl
FraudData,FraudLabel=get_embedding(model,rpt_financial_varibles,with_row_feature=True)
print('finish get embedding')

start KGE
finished KGE
finish get embedding


In [11]:
import itertools
params_dict=[]
ne=list(range(2500,5010,250))
max_depth=[20,None]
min_leaf=[1,5,10]
iters=itertools.product(ne,max_depth,min_leaf)
for comb in iters:
    params_dict.append({'n_estimators':comb[0],'max_depth':comb[1],'min_sample_leaf':comb[2]})
print('total len of params comb:{}'.format(len(params_dict)))
exp=experiment_protocal(params_dict=params_dict,model='RF',exp_name='RF_simpleKGE',comment='RF with simple KGE, KGEepoch=1000,KDdim=15')
re2=exp.run(FraudData,FraudLabel,5)

total len of params comb:66
train test split 0


100%|██████████| 66/66 [22:02<00:00, 20.03s/it]


best params:{'n_estimators': 2500, 'max_depth': None, 'min_sample_leaf': 1},bets auc on val set:0.7705323708352608
test auc:0.7607116430620643
train test split 1


100%|██████████| 66/66 [21:37<00:00, 19.66s/it]


best params:{'n_estimators': 3750, 'max_depth': None, 'min_sample_leaf': 5},bets auc on val set:0.765989758500598
test auc:0.7398635590599876
train test split 2


100%|██████████| 66/66 [22:02<00:00, 20.03s/it]


best params:{'n_estimators': 3000, 'max_depth': None, 'min_sample_leaf': 5},bets auc on val set:0.7716563997262149
test auc:0.7705160916732264
train test split 3


100%|██████████| 66/66 [21:07<00:00, 19.21s/it]


best params:{'n_estimators': 2500, 'max_depth': 20, 'min_sample_leaf': 1},bets auc on val set:0.7652856416957028
test auc:0.7664211882787473
train test split 4


100%|██████████| 66/66 [21:46<00:00, 19.80s/it]


best params:{'n_estimators': 4500, 'max_depth': None, 'min_sample_leaf': 5},bets auc on val set:0.7789678135405105
test auc:0.7407893809940727


In [12]:
re2

([0.7607116430620643,
  0.7398635590599876,
  0.7705160916732264,
  0.7664211882787473,
  0.7407893809940727],
 [0.7705323708352608,
  0.765989758500598,
  0.7716563997262149,
  0.7652856416957028,
  0.7789678135405105])

In [13]:
np.mean(re2[0])

0.7556603726136196

## RPT sub dataset

### baseline

([0.7598548673944933,
  0.7363293328695115,
  0.7613958180650181,
  0.7656157832167196,
  0.736743853716954],
mean: 0.752

 [0.7616242874728514,
  0.7732647814910025,
  0.7595528825709252,
  0.7660234672023498,
  0.7666020922123209])

### KGE

simple KGE concatenated with row feature

([0.7607116430620643,
  0.7398635590599876,
  0.7705160916732264,
  0.7664211882787473,
  0.7407893809940727],
mean:0.75566
  
 [0.7705323708352608,
  0.765989758500598,
  0.7716563997262149,
  0.7652856416957028,
  0.7789678135405105])

In [16]:
np.mean(re2[0])

0.7519879310525394

In [None]:
cov=[]
for i in range(FraudData.shape[1]):
    cov.append(np.corrcoef(FraudData[:,i],FraudLabel)[0,1])
print(np.sort(cov))

In [None]:

a=np.corrcoef(rpt_financial_varibles.drop(['year','Stkcd'],axis=1),rowvar=False)

In [14]:
financial_featrues=rpt_financial_varibles
X=financial_featrues.drop(['year','Stkcd','label'],axis=1)
for column in list(X.columns[X.isna().sum() > 0]):
    mean_val = X[column].mean()
    X[column].fillna(mean_val, inplace=True)

X=X.values
y=financial_featrues['label'].values
import itertools
params_dict=[]
ne=list(range(2500,5010,250))
max_depth=[20,None]
min_leaf=[1,5,10]
iters=itertools.product(ne,max_depth,min_leaf)
for comb in iters:
    params_dict.append({'n_estimators':comb[0],'max_depth':comb[1],'min_sample_leaf':comb[2]})
print('total len of params comb:{}'.format(len(params_dict)))
exp=experiment_protocal(params_dict=params_dict,model='RF',exp_name='rpt_baseline',comment='RF baseline')
re2=exp.run(X,y,5)

total len of params comb:66
train test split 0


100%|██████████| 66/66 [20:38<00:00, 18.77s/it]


best params:{'n_estimators': 4500, 'max_depth': None, 'min_sample_leaf': 1},bets auc on val set:0.7616242874728514
test auc:0.7598548673944933
train test split 1


100%|██████████| 66/66 [20:19<00:00, 18.48s/it]


best params:{'n_estimators': 3250, 'max_depth': None, 'min_sample_leaf': 5},bets auc on val set:0.7732647814910025
test auc:0.7363293328695115
train test split 2


100%|██████████| 66/66 [20:19<00:00, 18.48s/it]


best params:{'n_estimators': 4750, 'max_depth': 20, 'min_sample_leaf': 10},bets auc on val set:0.7595528825709252
test auc:0.7613958180650181
train test split 3


100%|██████████| 66/66 [20:16<00:00, 18.44s/it]


best params:{'n_estimators': 3500, 'max_depth': 20, 'min_sample_leaf': 1},bets auc on val set:0.7660234672023498
test auc:0.7656157832167196
train test split 4


100%|██████████| 66/66 [20:27<00:00, 18.60s/it]


best params:{'n_estimators': 5000, 'max_depth': 20, 'min_sample_leaf': 5},bets auc on val set:0.7666020922123209
test auc:0.736743853716954


In [None]:
financial_featrues=rpt_financial_varibles
X=financial_featrues.drop(['year','Stkcd','IndustryCode','label','EquityNatureID'],axis=1)
for column in list(X.columns[X.isna().sum() > 0]):
    mean_val = X[column].mean()
    X[column].fillna(mean_val, inplace=True)

X=X.values
y=financial_featrues['label'].values
import itertools
params_dict=[]
ne=list(range(2500,5010,250))
max_depth=[20,None]
min_leaf=[1,5,10]
iters=itertools.product(ne,max_depth,min_leaf)
for comb in iters:
    params_dict.append({'n_estimators':comb[0],'max_depth':comb[1],'min_sample_leaf':comb[2]})
print('total len of params comb:{}'.format(len(params_dict)))
exp=experiment_protocal(params_dict=params_dict,model='RF',exp_name='tmp',comment='RF baseline')
re2=exp.run(X,y,5)

In [None]:
import networkx as nx
graph=pd.read_csv('./KG0.csv').values
edges=[[x[0],x[2]]for x in graph]
G=nx.Graph()
G.add_edges_from(edges)

adj=nx.adjacency_matrix(G)
adj_2=adj.dot(adj)
adj_1_2=adj_2.toarray()+adj.toarray()
nodes=list(G.nodes)
comp_adj=adj_1_2[0:12186,0:12186]
comp_nodes=nodes[0:12186]
np.place(comp_adj,comp_adj>0,1)
for i in range(len(comp_adj)):
    comp_adj[i,i]=0


select_index=[]
drop_index=[]
k=0
for i in tqdm(comp_nodes):
    year=int(i[0:4])
    cp=int(i[11:])
    ind=rpt_financial_varibles.query('Stkcd == @cp & year == @year')
    if len(ind)==0:
        drop_index.append(k)
        k+=1
        continue
    k+=1
    select_index.append(ind.index.values[0])
tmp=np.delete(comp_adj,drop_index,axis=0)
tmp=np.delete(tmp,drop_index,axis=1)
comp_features=rpt_financial_varibles.loc[select_index]
comp_features.to_csv('comp_Graph_features.csv',index=False)
pd.DataFrame(tmp).to_csv('comp_Graph_adj',index=False)

In [5]:
import networkx as nx
adj=pd.read_csv('comp_Graph_adj').values


In [3]:
from scipy.sparse import csr_matrix
adj=csr_matrix(adj)

In [6]:
del adj

In [8]:
del adj_1_2

In [10]:
comp_adj=adj.toarray()[0:12186,0:12186]

In [11]:
comp_adj=csr_matrix(comp_adj)

In [13]:
del adj,adj_2

In [14]:
g=nx.from_scipy_sparse_matrix(comp_adj)

In [5]:
import networkx as nx
graph=pd.read_csv('./KG1.csv')
graph=graph.loc[[graph.iloc[i,1]=='same.Company' or graph.iloc[i,1]=='relatedTransaction' for i in range(len(graph))]]
graph=graph.values
edges=[[x[0],x[2]]for x in graph]
G=nx.Graph()
G.add_edges_from(edges)

In [13]:
comp_node=rpt_financial_varibles.apply(lambda x:str(x['year'])+'.comp.'+str(x['Stkcd'] ),axis=1)

In [9]:
borad=pd.read_csv('./borad.csv')[['Stkcd','year','PersonID']]

In [21]:
borad['comp']=borad.apply(lambda x: str(x['year'])+'.comp.'+str(x['Stkcd']),axis=1)
borad['person']=borad.apply(lambda x: 'null'+'.person.'+str(x['PersonID']),axis=1)


In [22]:
borad_edges=borad[['comp','person']].values

In [23]:
edges=borad_edges

In [31]:
select_index=[]
k=0
for i in tqdm(industry_triples):
    if i[0] in comp_node.values :
        select_index.append(k)
    k+=1

100%|██████████| 9366/9366 [00:01<00:00, 5865.62it/s]


In [45]:
edges=np.array(industry_triples)

array(['2003.comp.600225', 'industry', '2003ind.A01'], dtype='<U16')

In [46]:
import networkx as nx
G=nx.Graph()
G.add_edges_from(edges[:,[0,2]])


In [47]:
adj=nx.adjacency_matrix(G)
adj_2=adj.dot(adj)
nodes=list(G.nodes)

In [48]:
mE=[]
for i in nx.from_scipy_sparse_matrix(adj_2).edges():
    if nodes[i[0]].split('.')[1]=='comp' and nodes[i[1]].split('.')[1]=='comp':
        mE.append([nodes[i[0]],'same.industry',nodes[i[1]]])

In [49]:
pd.DataFrame(mE).to_csv('sameIndustry.csv',index=False)

In [48]:
X+=mE
pd.DataFrame(X).to_csv('KG2.csv',index=False)

In [17]:
comp_adj=adj_1_2[0:9366,0:9366]

In [18]:
from scipy import sparse
sparse.save_npz('RT_SC_B_v2.npz',comp_adj)

In [19]:
rpt_financial_varibles.index=comp_node
rpt_financial_varibles=rpt_financial_varibles.loc[nodes[0:9366],:]

In [20]:
rpt_financial_varibles.to_csv('RT_SC_B_features_v2.csv',index=False)

In [21]:
cc=[]
for i in nx.connected_components(G):
    cc.append(i)
len_cc=[len(i) for i in cc]

In [22]:
G=nx.from_scipy_sparse_matrix(comp_adj)

In [23]:
cc=[]
for i in nx.connected_components(G):
    cc.append(i)
cc_len=[len(i) for i in cc]