In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from tqdm import tqdm
import pickle

# Read and Understand Dataset 

***
**Read the `train.txt` file to dataframe**
***

In [None]:
train_data=pd.read_csv("train.txt", delimiter=",", header=None,names=['Neighbours'],index_col=False)

In [None]:
train_data['ID']=train_data['Neighbours'].apply(lambda x: x.split('\t')[0])  # Get the ID

In [None]:
train_data['Neighbours']=train_data['Neighbours'].apply(lambda x: x.split('\t')[1:]) # set up the neighbours

In [None]:
train_data=train_data[["ID","Neighbours"]] # set "ID" as the index

In [None]:
num_neighbours=[] # the number of successors for each source.
for elem in train_data['Neighbours']:
    num_neighbours.append(len(elem))
train_data['Num_neighbours']=num_neighbours

In [None]:
train_data.shape

# Transfer Data to Directed Graph and Analysis the Di-Graph

***
**Transfer the dataframe to the graph G=<V,E>**
***

In [None]:
num_source=train_data.shape[0]
sink=train_data.iloc[0,]
len_sink=len(sink)

In [None]:
train_data

In [None]:
# build the directed_graph
diG = nx.DiGraph()
for i in range(num_source):
    source=train_data.iloc[i,0] # The sources
    sinks=train_data.iloc[i,1] # Neighbours
    len_sink=len(sinks)
    for j in range(len_sink):
        sink=sinks[j]
        diG.add_edge(source,sink)
        if sink not in diG.nodes:
            diG.add_node(sink)      

In [None]:
list(diG.edges)[:5]

In [None]:
print("The Di-graph contains %d nodes and %d edges" %(len(diG.nodes),len(diG.edges)))

***
Save the graph to `graph.txt`
***

In [None]:
with open('graph.txt','wb') as file:
    pickle.dump(diG,file)

In [None]:
with open('graph.txt','rb') as file:
    graph=pickle.load(file)

In [None]:
len(graph.nodes)

***
Bulid the adjacent matrix for the `source nodes` to find the positive and neigative samples
***

In [None]:
nodelist=train_data.iloc[:,0]
nodelist.shape

In [None]:
adj_matrix=nx.to_numpy_matrix(graph,nodelist) # adjacent matrix, only consider the source nodes

In [None]:
adj_matrix.shape

In [None]:
with open ('adj_matrix.txt', 'wb') as file:
    pickle.dump(adj_matrixe, file)

***
**Gerenrate the positive samples:`edges_pairs` and negative samples `No_edges_pairs`**
***

In [None]:
# Negative Sample
No_edges_pairs = []

# traverse adjacency matrix
offset = 0
for i in range(adj_Matrix.shape[0]):
    for j in range(offset,adj_Matrix.shape[1]):
        if i != j:
            if adj_Matrix[i,j] == 0 and nx.shortest_path_length(graph,nodelist[i],nodelist[j])<=3:
                No_edges_pairs.append([nodelist[i],nodelist[j]])
    offset = offset + 1

In [None]:
# Positive Samples


In [None]:
# Combined Samples


In [None]:
with open('samples.txt','wb') as file:
    pickle.dump(sample,file)

In [None]:
with open('samples_all.txt','rb') as file:
    sample_data=pickle.load(file)

In [None]:
sample_data.shape

# Feature Exaction

***
**Vertices features**

Contains: Source_following, Sink_follows, shortest_path
***

## Find the Percentage of source's following --> `Source_following`

In [None]:
source=sample_data['Source']
#out_degree=np.zeros(sample_data.shape[0])
out=[]
for elem in source:
    out.append(graph.out_degree(elem))
maxs=max(out)

In [None]:
out_degree=list(i/maxs for i in out)

In [None]:
out_degree.count(1) #'761793' has the largest 'following' number

## Generate the Percentage of Sink's Follower--> `Sink_follows`

In [None]:
in_=[]
sink=sample_data['Sink']
for elem in sink:
    in_.append(graph.in_degree(elem))
max_num=max(in_)

In [None]:
in_degree=list(i/max_num for i in in_)

In [None]:
in_degree.count(1) # '3361377' has the most followers

## The shortest path between source and sink -->`shortest_path`

In [None]:
shortest_path=[]
n=sample_data.shape[0]
source_L=list(source) 
sink_L=list(sink)
for i in tqdm(range(n)):
    lenth=nx.shortest_path_length(graph,source_L[i],sink_L[i])
    shortest_path.append(lenth)

In [None]:
sample_data.head()

In [None]:
with open ('data6.txt', 'wb') as file:
    pickle.dump(sample_data, file)

In [None]:
adj_Matrix=nx.to_numpy_matrix(diG,nodelist)

***
**Feature Exaction: similarity**

Contains: resource allocation index, jaccard coefficient, adamic adar index, preferential attachment, cn_soundarajan_hopcroft, ra_index_soundarajan_hopcroft, within_inter_cluster
***


In [None]:
unG=graph.to_undirected()
nodelist=list(train_data.iloc[:,0]) # sourse node

In [None]:
nodelist=list(train_data.iloc[:,0]) # sourse node
rai = nx.resource_allocation_index(unG, nodelist) # resource_allocation_index
jc = nx.jaccard_coefficient(unG, nodelist) # jaccard coefficient
aai = nx.adamic_adar_index(unG, nodelist) # adamic adar index
pa = nx.preferential_attachment(unG, nodelist) # preferential attachment
csj = nx.cn_soundarajan_hopcroft(unG, nodelist) # soundarajan hopcroft
rjsh = nx.ra_index_soundarajan_hopcroft(unG, nodelist) # ra index soundarajan hopcroft
wic = nx.within_inter_cluster(unG, nodelist) #within_inter_cluster

# The Final Dataframe --> `sample_data`

***
**Build the new dataset with the seleted features**

features need to normalized: from sklearn.preprocessing import StandardScaler; scaler = StandardScaler(); scaler.fit_transform(data)
***

In [None]:
sample_data['Source_following']=out_degree
sample_data['Sink_follows']=in_degree
sample_data['shortest_path']=shortest_path

In [None]:
sample_data=sample_data[['Label','Source','Sink','Source_following','Sink_follows','shortest_path']] # put 'Label' to the first column

# Model Build

In [182]:
import pickle
from sklearn.model_selection import train_test_split

In [210]:
with open('Train_HHH.txt','rb') as file:
    dataset=pickle.load(file)
train, test = train_test_split(dataset, test_size=0.2, random_state=1)

In [211]:
train=train.drop(['Source','Sink'],axis=1)
test=test.drop(['Source','Sink'],axis=1)

In [212]:
train.head()

Unnamed: 0,RAI,JC,AAI,PA,Source_following,Sink_follows,Source_follows,Sink_following,Shortest_path,PageRank_Src,PageRank_Sink,ECentrality_Sour,ECentrality_Sink,Degree_Centrality_Sour,Degree_Centrality_Sink,Label,CN
5004,0.0,0.0,0.0,68829,68770,1,1068,0,1,8.055775e-07,2.0476e-07,0.012305,3.282663e-05,0.014349,2.054597e-07,1,0
17676,0.0,0.0,0.0,3956,3948,1,33,0,1,2.190402e-07,2.048017e-07,0.000345,9.122712e-07,0.000818,2.054597e-07,1,0
29485,0.000297,0.000453,1.45434,635202,35279,18,640,0,1,2.431027e-07,2.048212e-07,0.012662,0.0006620032,0.00738,3.698274e-06,1,16
36476,0.014785,0.02445,1.462639,37884,21,119,118,238,2,3.232158e-07,3.159942e-07,0.000704,0.0008544969,2.9e-05,7.334911e-05,0,10
45508,1.583936,0.011227,42.345879,15633376,21104,323,1820,621,2,2.470559e-06,4.178583e-07,0.01461,0.002531299,0.00471,0.0001939539,0,244


In [213]:
XT=train.drop('Label',axis=1)
xt=test.drop('Label',axis=1)

with open('XT.txt', 'wb') as file:
    pickle.dump(XT,file)
with open('xt1.txt', 'wb') as file:
    pickle.dump(xt,file)

## Logistic Regression

In [214]:
#XT=XT.drop(['RAI','JC','AAI','CN'],axis=1)
#xt=xt.drop(['RAI','JC','AAI','CN'],axis=1)

In [215]:
XT

Unnamed: 0,RAI,JC,AAI,PA,Source_following,Sink_follows,Source_follows,Sink_following,Shortest_path,PageRank_Src,PageRank_Sink,ECentrality_Sour,ECentrality_Sink,Degree_Centrality_Sour,Degree_Centrality_Sink,CN
5004,0.000000,0.000000,0.000000,68829,68770,1,1068,0,1,8.055775e-07,2.047600e-07,0.012305,3.282663e-05,0.014349,2.054597e-07,0
17676,0.000000,0.000000,0.000000,3956,3948,1,33,0,1,2.190402e-07,2.048017e-07,0.000345,9.122712e-07,0.000818,2.054597e-07,0
29485,0.000297,0.000453,1.454340,635202,35279,18,640,0,1,2.431027e-07,2.048212e-07,0.012662,6.620032e-04,0.007380,3.698274e-06,16
36476,0.014785,0.024450,1.462639,37884,21,119,118,238,2,3.232158e-07,3.159942e-07,0.000704,8.544969e-04,0.000029,7.334911e-05,10
45508,1.583936,0.011227,42.345879,15633376,21104,323,1820,621,2,2.470559e-06,4.178583e-07,0.014610,2.531299e-03,0.004710,1.939539e-04,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50057,0.045586,0.005155,0.709407,83996,197,290,236,45,2,2.199372e-06,5.569055e-07,0.000265,9.904954e-04,0.000089,6.882899e-05,3
32511,0.039300,0.008325,7.437408,2287695,314,649,246,6442,2,6.908137e-07,3.860740e-07,0.001686,8.935783e-03,0.000115,1.456915e-03,57
5192,0.000000,0.000000,0.000000,4944,617,8,10,0,1,2.063114e-07,2.054147e-07,0.000295,9.165149e-05,0.000129,1.643677e-06,0
12172,0.000335,0.000304,0.227845,26292,6565,4,177,0,1,2.632149e-07,2.048453e-07,0.002223,6.109002e-05,0.001385,8.218387e-07,2


In [216]:
x_train = XT
y_train = train.Label

x_test = xt
y_test = test.Label

In [217]:
x_train

Unnamed: 0,RAI,JC,AAI,PA,Source_following,Sink_follows,Source_follows,Sink_following,Shortest_path,PageRank_Src,PageRank_Sink,ECentrality_Sour,ECentrality_Sink,Degree_Centrality_Sour,Degree_Centrality_Sink,CN
5004,0.000000,0.000000,0.000000,68829,68770,1,1068,0,1,8.055775e-07,2.047600e-07,0.012305,3.282663e-05,0.014349,2.054597e-07,0
17676,0.000000,0.000000,0.000000,3956,3948,1,33,0,1,2.190402e-07,2.048017e-07,0.000345,9.122712e-07,0.000818,2.054597e-07,0
29485,0.000297,0.000453,1.454340,635202,35279,18,640,0,1,2.431027e-07,2.048212e-07,0.012662,6.620032e-04,0.007380,3.698274e-06,16
36476,0.014785,0.024450,1.462639,37884,21,119,118,238,2,3.232158e-07,3.159942e-07,0.000704,8.544969e-04,0.000029,7.334911e-05,10
45508,1.583936,0.011227,42.345879,15633376,21104,323,1820,621,2,2.470559e-06,4.178583e-07,0.014610,2.531299e-03,0.004710,1.939539e-04,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50057,0.045586,0.005155,0.709407,83996,197,290,236,45,2,2.199372e-06,5.569055e-07,0.000265,9.904954e-04,0.000089,6.882899e-05,3
32511,0.039300,0.008325,7.437408,2287695,314,649,246,6442,2,6.908137e-07,3.860740e-07,0.001686,8.935783e-03,0.000115,1.456915e-03,57
5192,0.000000,0.000000,0.000000,4944,617,8,10,0,1,2.063114e-07,2.054147e-07,0.000295,9.165149e-05,0.000129,1.643677e-06,0
12172,0.000335,0.000304,0.227845,26292,6565,4,177,0,1,2.632149e-07,2.048453e-07,0.002223,6.109002e-05,0.001385,8.218387e-07,2


In [219]:
from sklearn.linear_model import LogisticRegression
LR_model = LogisticRegression(penalty ='none')
LR_model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='none',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [220]:
label_predict=LR_model.predict(x_test)
prob_preidct=LR_model.predict_proba(x_test)

In [221]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,label_predict)

0.6924432528605806

In [254]:
with open('16_test.txt','rb') as file:
    test=pickle.load(file)

In [223]:
test=test.drop(['Source','Sink'],axis=1)

In [224]:
with open('ID.txt','rb') as file:
    ID=pickle.load(file)

In [225]:
#test=test.drop(['RAI','JC','AAI','CN',],axis=1)
prob_preidct=LR_model.predict_proba(test)

In [226]:
prob_preidct #predict probabilities

array([[0.50000064, 0.49999936],
       [0.50000064, 0.49999936],
       [0.50000413, 0.49999587],
       ...,
       [0.50000006, 0.49999994],
       [0.50000016, 0.49999984],
       [0.50000048, 0.49999952]])

In [None]:
def submission(prob_preidct, LR):
    with open(LR, 'w') as file:
        file.write('Id,Predicted\n')
        for i, p in zip(ID, prob_preidct):
            file.write("{},{}\n".format(i, p[0]))

In [None]:
submission(prob_preidct, 'LRsubmit.csv')

### SGDClassifier, loss='log' 

In [303]:
import pickle

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier,SGDRegressor

In [304]:
with open('Train_HHH.txt','rb') as file:
    dataset=pickle.load(file)
train, test = train_test_split(dataset, test_size=0.2, random_state=1)
scaler = StandardScaler()

In [305]:
XT=train.drop('Label',axis=1)
xt=test.drop('Label',axis=1)

In [306]:
x_train = scaler.fit_transform(XT.iloc[:,2:])
y_train = train.Label

x_test = scaler.transform(xt.iloc[:,2:])
y_test = test.Label

In [317]:
SDGcl=SGDClassifier(loss='log',max_iter=1000, tol=1e-3,fit_intercept=False) #'log':0.51 
SDGcl.fit(x_train,y_train)
#SDGre=SGDRegressor(loss='squared_loss',max_iter=1000, tol=1e-3) # do not have prob_predict
#SDGre.fit(x_train,y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=False,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [318]:
label_predictC=SDGcl.predict(x_test)
prob_preidctC=SDGcl.predict_proba(x_test)
#label_predictR=SDGre.predict(x_test)
#prob_preidctR=SDGre.predict_proba(x_test)

In [319]:
prob_preidctC

array([[8.88785246e-05, 9.99911121e-01],
       [7.14694509e-04, 9.99285305e-01],
       [9.99992909e-01, 7.09131293e-06],
       ...,
       [7.61114216e-03, 9.92388858e-01],
       [3.37311567e-04, 9.99662688e-01],
       [3.17739186e-01, 6.82260814e-01]])

In [320]:
cl=roc_auc_score(y_test,label_predictC)
#re=roc_auc_score(y_test,label_predictR)\nThe ROC of SDGRegression is %f'
print('The ROC of SDGClassifier is %f' %(cl))

The ROC of SDGClassifier is 0.974536


In [345]:
with open('16_test.txt','rb') as file:
    test=pickle.load(file)

In [322]:
test = scaler.transform(test.iloc[:,2:])
prob_preidctCT=SDGcl.predict_proba(test)

In [323]:
label_predictC

array([1, 1, 0, ..., 1, 1, 1], dtype=int64)

In [324]:
def submission(probs, filename):
    with open(filename, 'w') as file:
        file.write('Id,Predicted\n')
        for i, p in zip(ID, probs):
            file.write("{},{}\n".format(i, p[1]))

In [325]:
submission(prob_preidctC, 'SDGclsubmit.csv')

In [316]:
cc = train_data=pd.read_csv('SDGclsubmit.csv')
cc

Unnamed: 0,Id,Predicted
0,1,0.999921
1,2,0.999569
2,3,0.000009
3,4,0.990130
4,5,0.000887
...,...,...
1995,1996,0.000037
1996,1997,0.000048
1997,1998,0.006918
1998,1999,0.000038


## Random Forest, Decision Tree, KNN
Random foreset: AUC=0.744

In [126]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn import metrics

In [None]:
clfs = [KNeighborsClassifier(),DecisionTreeClassifier(),RandomForestClassifier()]
for clf in clfs:
    clf.fit(x_train, y_train)
    print (clf)
    print(clf.score(x_test, y_test))
    predictions = clf.predict(x_test)
    print ("accuracy")
    print (accuracy_score(y_test,predictions))
    print (classification_report(y_test,predictions))
    print("AUC")
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions, pos_label=1)
    print(metrics.auc(fpr, tpr))
    probs = clf.predict_proba(x_test)
    print("Prob")
    print(probs)

In [110]:
with open('16_test.txt', 'rb') as file:
    samples_test = pickle.load(file)

In [111]:
samples_test = samples_test.drop('Source', axis=1)

In [129]:
#samples_test = samples_test.drop('Sink', axis=1)
samples_test=samples_test.loc[:,['RAI','JC','AAI','PA','CN']]

In [130]:
samples_test

Unnamed: 0,RAI,JC,AAI,PA,CN
0,0.000000,0.000000,0.000000,667,0
1,0.000000,0.000000,0.000000,666,0
2,0.004624,0.011152,0.462729,4335,3
3,0.000028,0.003670,0.178805,18396,2
4,0.000000,0.000000,0.000000,966,0
...,...,...,...,...,...
1995,0.000000,0.000000,0.000000,116,0
1996,0.000049,0.006061,0.100722,5125,1
1997,0.000000,0.000000,0.000000,58,0
1998,0.000000,0.000000,0.000000,171,0


In [None]:
prob_list = []
clfs = [KNeighborsClassifier(),DecisionTreeClassifier(),RandomForestClassifier()]
for clf in clfs:
    clf.fit(x_train, y_train)
    print (clf)
    predictions = clf.predict(samples_test)
    print("Predictions")
    print(predictions)
    probs = clf.predict_proba(samples_test)
    print("Prob")
    print(probs)
    prob_list.append([probs])

In [None]:
KNeighborsClassifier = prob_list[0]

In [None]:
KNeighborsClassifier =KNeighborsClassifier[0]

In [None]:
DecisionTreeClassifier = prob_list[1]

In [None]:
DecisionTreeClassifier=DecisionTreeClassifier[0]

In [132]:
RandomForestClassifier = prob_list[2]

In [133]:
RandomForestClassifier=RandomForestClassifier[0]

In [134]:
def submission(probs, filename):
    with open(filename, 'w') as file:
        file.write('Id,Predicted\n')
        for i, p in zip(ID, probs):
            file.write("{},{}\n".format(i, p[1]))

In [None]:
submission(DecisionTreeClassifier, 'DTsubmit.csv')

In [None]:
DT = train_data=pd.read_csv('DTsubmit.csv')
DT

In [None]:
submission(KNeighborsClassifier, 'KNsubmit.csv')

In [None]:
KNN = train_data=pd.read_csv('KNsubmit.csv')
KNN

In [135]:
submission(RandomForestClassifier, 'RFsubmit.csv')

In [136]:
RF = train_data=pd.read_csv('RFsubmit.csv')
RF

Unnamed: 0,Id,Predicted
0,1,0.940000
1,2,0.940000
2,3,0.280000
3,4,0.720000
4,5,0.587489
...,...,...
1995,1996,1.000000
1996,1997,0.490000
1997,1998,1.000000
1998,1999,0.813167


## SVM

In [383]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

In [384]:
with open('Train_HHH.txt','rb') as file:
    dataset=pickle.load(file)
train, test = train_test_split(dataset, test_size=0.2, random_state=1)
scaler = StandardScaler()

In [385]:
x_train = scaler.fit_transform(XT.iloc[:,2:])
y_train = train.Label

x_test = scaler.transform(xt.iloc[:,2:])
y_test = test.Label

In [386]:
C_range = np.logspace(-2, 5, 5)
gamma_range = np.logspace(-6, 1, 8)

In [387]:
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=1)


In [388]:
grid = GridSearchCV(SVC(kernel='rbf'), param_grid={'gamma': gamma_range, 'C': C_range}, cv=cv)


In [None]:
grid.fit(x_train, y_train)
print("The best parameters are {0.best_params_} with an accuracy of {0.best_score_:.3g}".format(grid))

In [None]:
scores = grid.cv_results_['mean_test_score'].reshape(C_range.size, gamma_range.size)

plt.figure(figsize=(8, 6))
plt.imshow(scores, cmap='viridis')
plt.colorbar(shrink=0.7)
plt.xticks(np.arange(len(gamma_range)), ["%.2e" % gamma for gamma in gamma_range], rotation=90)
plt.yticks(np.arange(len(C_range)), ["%1.e" % C for C in C_range])
plt.title('Cross validation accuracy')
plt.xlabel(r'$\gamma$')
plt.ylabel('$C$')
plt.show()


## MLP

In [333]:
from sklearn.neural_network import MLPClassifier

In [377]:
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(10),activation='logistic', random_state=1,max_iter=3000)

In [378]:
clf.fit(x_train, y_train)

MLPClassifier(activation='logistic', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=10, learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=3000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [379]:
prob_pre=clf.predict_proba(x_test)
label_pre=clf.predict(x_test)

In [380]:
label_pre

array([1, 1, 0, ..., 1, 1, 1], dtype=int64)

In [381]:
mlp=roc_auc_score(y_test,label_pre)

In [382]:
mlp

0.9911092140455865

In [374]:
#test = scaler.transform(test.iloc[:,2:])
prob_pre=clf.predict(test)

In [375]:
clf.predict(test)

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

# features 

In [None]:
with open('train_60000.txt', 'rb') as file:
    test = pickle.load(file)

In [None]:
Source_s=test.Source
sink_s=test.Sink

In [None]:
print(Source_s.iloc[1],Source_s.iloc[30001])
print(sink_s.iloc[1],sink_s.iloc[30001])

In [None]:
shortest_path=[]
for i in tqdm(range(2000)):
    try:
        lenth=nx.shortest_path_length(graph,Source_s[i],sink_s[i])
        shortest_path.append(lenth)
    except:
        lenth = 0
        shortest_path.append(lenth)

In [None]:
shortest_path[:5]

In [None]:
test['Shortest_path'] = shortest_path

In [None]:
test[:5]

In [None]:
with open('16_test.txt', 'wb') as file:
    pickle.dump(test,file)

In [None]:
positive=sample_data[:30000]

In [None]:
negative=sample_data[-30000:]

In [None]:
train_60000=positive.append(negative)

In [None]:
len(train_60000)

In [None]:
train_60000

In [None]:
with open('train_60000.txt', 'wb') as file:
    pickle.dump(train_60000,file)

In [None]:
positive

In [None]:
sink=train_60000.Sink

In [None]:
source=train_60000.Source
source.iloc[0]
#len(source)

In [None]:
number = []
for i in tqdm(range(len(source))):
    Source_successors = graph.neighbors(source.iloc[i])
    number.append(len(list(Source_successors)))

In [None]:
train_60000['Source_following'] = number

In [None]:
in_=[]
for elem in sink:
    in_.append(graph.in_degree(elem))

In [None]:
train_60000['Sink_follows'] = in_

In [None]:
out=[]
for elem in sink:
    out.append(graph.out_degree(elem))

In [None]:
train_60000['Sink_following'] = out

In [None]:
in_=[]
for elem in source:
    in_.append(graph.in_degree(elem))

In [None]:
train_60000['Source_follows'] = in_

In [None]:
train_60000[:5]

In [None]:
shortest_path=[]
#sink_s = list(train_60000['Sink'])
nn=len(sink)
for i in tqdm(range(nn)):
    try:
        lenth=nx.shortest_path_length(graph,source.iloc[i],sink.iloc[0])
        shortest_path.append(lenth)
    except:
        lenth = 0
        shortest_path.append(lenth)

In [None]:
max(shortest_path)

In [None]:
train_60000['Shortest_path'] = shortest_path

In [None]:
train_60000[:5]

In [None]:
with open('Shortest_path.txt', 'wb') as file:
    pickle.dump(train_60000,file)

In [None]:
test = []
for i in tqdm(range(len(source))):
    p = len(list(nx.common_neighbors(un_graph, source[i], sink[i])))
    test.append([(str(source[i]), str(sink[i])), p])

In [None]:
with open('AAI.txt', 'rb') as file:
    AAI = pickle.load(file)

In [None]:
preds = nx.common_neighbor_centrality

In [None]:
with open('negative_samples.txt', 'rb') as file:
    test = pickle.load(file)

In [None]:
test

In [None]:
with open('samples_all.txt', 'rb') as file:
    test = pickle.load(file)

In [None]:
aa=test.Source

In [None]:
aa.iloc[30000:300019]

In [None]:
aa.iloc[0:19]