# Project Machine Learning 1
## Predict missing citations in the citation network
### Team: PHUNG
In this project we predict whether there is a citation between two papers using a classification model.

## I. Import libraries

In [None]:
# Data tools
import csv
import numpy as np
import re
import itertools
import time
# For textual features
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
# For graphical features
import networkx as nx
from community import community_louvain
# Classifiers
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
# keras modules
from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
from keras.callbacks import ModelCheckpoint
# Other ML tools
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

## II. Import the data
Our dataset contains 27,770 articles from the period 1992-2003. Each record holds six pieces of information about the paper: <br>
(1)  paper unique ID <br>
(2)  publication year (from 1992 to 2003) <br>
(3)  title <br>
(4)  authors <br>
(5)  name of journal <br>
(6)  abstract <br>
### Load paper information
The information is stored in the file *node_information.csv*. We use the `csv` library to load it:

In [None]:
# Load every paper record: one CSV row per paper.
# newline="" is what the csv module documentation asks for when opening a
# file for a reader, so that line breaks inside quoted fields reach the
# parser untouched.
with open("data/node_information.csv", "r", newline="") as f:
    node_info = list(csv.reader(f))

# Paper IDs (first column), in the same order as the rows of node_info.
IDs = [element[0] for element in node_info]

In [None]:
# Show the record stored at index 1 as a quick sanity check of the load.
print(node_info[1])

### Load training data
The training data consists of 615,512 rows, each contains a node pair with an associated value 1 if there is a citation between the 2 nodes, 0 if not. <br> 
Example: <br>
9510123 9502114 1 <br> 
9707075 9604178 1 <br> 
9312155 9506142 0 <br> 
...

In [None]:
# Each line holds the whitespace-separated fields of one labelled pair
# (source ID, target ID, label). Blank lines, such as a trailing newline at
# the end of the file, are skipped instead of raising an IndexError.
with open("data/training_set.txt", "r") as f:
    full_training_data = [line.split() for line in f if line.strip()]

In [None]:
# Number of labelled pairs, followed by the first three of them.
print(len(full_training_data))
print(full_training_data[:3])

### Load testing data
The testing data consists of 32,648 rows. Each row contains a node pair for which we have to predict whether there is a citation between the two nodes.

In [None]:
# Each line holds the whitespace-separated IDs of one pair to classify.
# Blank lines, such as a trailing newline at the end of the file, are
# skipped instead of raising an IndexError.
with open("data/testing_set.txt", "r") as f:
    full_testing_data = [line.split() for line in f if line.strip()]

In [None]:
# Number of pairs to classify, followed by the first three of them.
print(len(full_testing_data))
print(full_testing_data[:3])

### Split the training data into a train set and a test set
To assess our model, we need a test set for which the true output is known. Hence we split the training data into a train set and a test set. <br>
Here, we use 95% of the training data as the train set.

In [None]:
# Run if you want to shuffle the data (in place) before splitting it.
# A dedicated, seeded generator makes the resulting order reproducible, so
# features exported in one session still line up with the data after a
# kernel restart, and the global NumPy random state is left untouched.
# NOTE: If you change the seed and you want to export the features,
#       save the shuffled data as well to keep the new order of the data.
SHUFFLE_SEED = 42
np.random.RandomState(SHUFFLE_SEED).shuffle(full_training_data)

In [None]:
# Fraction of the labelled pairs that goes into the train set.
# NOTE: If you modify this, you ONLY have to recompute the citation graph
#       and its features; the other features do not depend on the split.
to_take = 0.95

# Index of the first pair that belongs to the held-out test set
n_split = int(to_take * len(full_training_data))

# Everything before the cut-off trains the model, the rest evaluates it
train_set, test_set = full_training_data[:n_split], full_training_data[n_split:]

## III. Extract the features
We are going to extract all the features for our model. <br>
### Textual features
The textual features contain: <br>
(1) Cosine similarity of abstracts <br>
(2) Number of overlapping words in the abstracts <br>
(3) Cosine similarity of titles <br>
(4) Number of overlapping words in the titles <br>
(5) Number of overlapping words between the target's title and the source's abstract
<br>
<br>
This step takes a while to run ...

In [None]:
# Tools used to tokenise the text
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()
vectorizer = TfidfVectorizer(stop_words="english")

# Transform titles and abstracts into TF-IDF matrices with one row per
# paper, in node_info order. The same vectorizer object is fitted twice, so
# after this cell it is fitted on the abstracts.
titles = [element[2] for element in node_info]
features_TFIDF_title = vectorizer.fit_transform(titles)

abstracts = [element[5] for element in node_info]
features_TFIDF_abstract = vectorizer.fit_transform(abstracts)


def _stemmed_token_set(text):
    """Return the set of stemmed tokens of *text* that are not stop words.

    The text is lower-cased and split on single spaces, stop words are
    dropped and the remaining tokens are stemmed.
    """
    return {stemmer.stem(token)
            for token in text.lower().split(" ")
            if token not in stpwds}


# Tokenise every paper once instead of once per pair it appears in.
_title_tokens = [_stemmed_token_set(title) for title in titles]
_abstract_tokens = [_stemmed_token_set(abstract) for abstract in abstracts]

# Paper ID -> row in node_info. setdefault keeps the first row of a
# duplicated ID, which is what IDs.index() would return, without scanning
# the whole list for every pair.
_row_of_id = {}
for _row, _paper_id in enumerate(IDs):
    _row_of_id.setdefault(_paper_id, _row)


def _textual_features(pairs, progress_message):
    """Compute the five textual features for every (source, target) pair.

    Args:
        pairs: sequence of rows whose first two items are the source and
            target paper IDs.
        progress_message: text printed after the running count every
            1000 pairs.

    Returns:
        A tuple of five lists aligned with *pairs*: title cosine
        similarity, abstract cosine similarity, title token overlap,
        abstract token overlap, and overlap between the target title and
        the source abstract.

    Raises:
        KeyError: if a pair references an ID that is not in IDs.
    """
    cosine_title = []
    cosine_abstract = []
    overlap_title = []
    overlap_abstract = []
    overlap_title_in_abstract = []

    for counter, data in enumerate(pairs, start=1):
        source_id = _row_of_id[data[0]]
        target_id = _row_of_id[data[1]]

        # Overlap title
        overlap_title.append(
            len(_title_tokens[source_id] & _title_tokens[target_id]))

        # Overlap abstract
        overlap_abstract.append(
            len(_abstract_tokens[source_id] & _abstract_tokens[target_id]))

        # Cosine similarity title
        cosine_title.append(
            cosine_similarity(features_TFIDF_title[source_id],
                              features_TFIDF_title[target_id])[0, 0])

        # Cosine similarity abstract
        cosine_abstract.append(
            cosine_similarity(features_TFIDF_abstract[source_id],
                              features_TFIDF_abstract[target_id])[0, 0])

        # Overlap words of target title in source abstract
        overlap_title_in_abstract.append(
            len(_abstract_tokens[source_id] & _title_tokens[target_id]))

        if counter % 1000 == 0:
            print(counter, progress_message)

    return (cosine_title, cosine_abstract, overlap_title,
            overlap_abstract, overlap_title_in_abstract)


# Every pair is processed (not just the first 1000) so that these lists
# have the same length as the features extracted in the other cells.
(cosine_title_out,
 cosine_abstract_out,
 overlap_title_out,
 overlap_abstract_out,
 overlap_title_in_abstract_out) = _textual_features(
    full_training_data, "training examples processsed")

(cosine_title_test_out,
 cosine_abstract_test_out,
 overlap_title_test_out,
 overlap_abstract_test_out,
 overlap_title_in_abstract_test_out) = _textual_features(
    full_testing_data, "testing examples processsed")

### Graphical features
For this part we are going to use the NetworkX library.
#### Citation graph's features
The expected features are: <br>
(6) Number of common neighbours<br>
(7) Link-based Jaccard coefficient<br>
(8) Adamic-Adar index<br>
(9) Preferential attachment<br>
(10) Difference in betweenness centrality<br>
(11) Difference in the number of in-links<br>
(12) Number of times target cited<br>
(13) Pagerank of source<br>
(14) Pagerank of target<br>
(15) Minimal distance*<br>
(16) Is the same cluster?<br>
##### Import the graphs
For this part we need 2 graphs, an undirected graph and a directed graph

In [None]:
# Two views of the citation network over the same set of papers:
# G ignores the direction of a citation, dG keeps it (source -> target).
G = nx.Graph()
dG = nx.DiGraph()

# Every paper becomes a node, whether or not it ends up with an edge
paper_ids = [paper[0] for paper in node_info]
G.add_nodes_from(paper_ids)
dG.add_nodes_from(paper_ids)

# Only the positively labelled pairs of the train set become edges
cited_pairs = [(pair[0], pair[1]) for pair in train_set if pair[2] == '1']
G.add_edges_from(cited_pairs)
dG.add_edges_from(cited_pairs)

##### Extract the features

In [None]:
# Calculate the Betweenness centrality
print('Calculating the Betweenness centrality...')
# the higher k is, the more exact the results are, can set k to 1000 but it would take 5-10 mins
btw = nx.betweenness_centrality(G, k = 100)
print('Completed!')

# Calculate the Pagerank of the network
print('Calculating the Pagerank...')
pr = nx.pagerank(G, alpha=0.9)
print('Completed!')

# Calculate the clustering of the network
print('Calculating the cluster...')
par =  community_louvain.best_partition(G)
print('Completed!')


def _distance_without_direct_edge(source, target):
    """Return the shortest-path length in G, ignoring a direct edge.

    If G contains the edge (source, target) it is removed for the duration
    of the search and always put back afterwards. Returns 0 when no path
    exists.
    """
    had_edge = G.has_edge(source, target)
    if had_edge:
        G.remove_edge(source, target)
    try:
        return nx.shortest_path_length(G, source=source, target=target)
    except nx.NetworkXNoPath:
        # Only "no path" maps to 0; any other error is a real problem and
        # must not be hidden.
        return 0
    finally:
        if had_edge:
            G.add_edge(source, target)


def _citation_graph_features(pairs, progress_message):
    """Compute the citation-graph features for every (source, target) pair.

    Args:
        pairs: sequence of rows whose first two items are the source and
            target paper IDs (nodes of G and dG).
        progress_message: text printed after the running count every
            1000 pairs.

    Returns:
        A tuple of eleven lists aligned with *pairs*, in this order:
        common neighbours, Jaccard coefficient, Adamic-Adar index,
        preferential attachment, betweenness difference, in-link
        difference, in-links of the target, PageRank of the source,
        PageRank of the target, shortest-path length, same-cluster flag.
    """
    common_nei = []
    jaccard_coef = []
    aa_index = []
    pref_att = []
    btw_diff = []
    in_link = []
    num_cite = []
    pr_source = []
    pr_target = []
    shortest_path = []
    same_cluster = []

    for counter, data in enumerate(pairs, start=1):
        source, target = data[0], data[1]
        pair = [(source, target)]

        # Common neighbours
        common_nei.append(len(list(nx.common_neighbors(G, source, target))))

        # Jaccard coefficient
        jaccard_coef.append(list(nx.jaccard_coefficient(G, pair))[0][2])

        # Adamic-Adar index
        aa_index.append(list(nx.adamic_adar_index(G, pair))[0][2])

        # Preferential attachment
        pref_att.append(list(nx.preferential_attachment(G, pair))[0][2])

        # Difference in the Betweenness centrality
        btw_diff.append(btw[target] - btw[source])

        # Number of in-links of each paper in the directed graph
        target_in_links = dG.in_degree(target)
        source_in_links = dG.in_degree(source)

        # Difference in the number of in-links
        in_link.append(target_in_links - source_in_links)

        # Number of times cited
        num_cite.append(target_in_links)

        # Pagerank of source
        pr_source.append(pr[source])

        # Pagerank of target
        pr_target.append(pr[target])

        # Is in the same cluster
        same_cluster.append(1 if par[source] == par[target] else 0)

        # Minimal distance*
        shortest_path.append(_distance_without_direct_edge(source, target))

        if counter % 1000 == 0:
            print(counter, progress_message)

    return (common_nei, jaccard_coef, aa_index, pref_att, btw_diff,
            in_link, num_cite, pr_source, pr_target, shortest_path,
            same_cluster)


# Start extracting the features
(common_nei_out,
 jaccard_coef_out,
 aa_index_out,
 pref_att_out,
 btw_diff_out,
 in_link_out,
 num_cite_out,
 pr_source_out,
 pr_target_out,
 shortest_path_out,
 same_cluster_out) = _citation_graph_features(
    full_training_data, "training examples processsed")

(common_nei_test_out,
 jaccard_coef_test_out,
 aa_index_test_out,
 pref_att_test_out,
 btw_diff_test_out,
 in_link_test_out,
 num_cite_test_out,
 pr_source_test_out,
 pr_target_test_out,
 shortest_path_test_out,
 same_cluster_test_out) = _citation_graph_features(
    full_testing_data, "testing examples processsed")


#### Author collaboration graph's features
The expected features are: <br>
(17) Number of common neighbours <br>
(18) Link-based Jaccard coefficient <br>
(19) Preferential attachment <br>
(20) Adamic-Adar index <br>
##### Clean and import the authors data
We remove all parenthesised text from the author information to get cleaner author names.

In [None]:
# RegEx for a parenthesised group that contains no nested parentheses.
# Raw strings keep the backslashes literal without relying on Python
# tolerating invalid escape sequences such as "\(".
my_regex = r"\([^()]*\)"
# RegEx to remove the spaces after a dot, e.g. A. Brown -> A.Brown
my_regex_2 = r"\.\s+"

# Rewrites the author field (column 3) of every record in place as a
# comma-joined list of cleaned names, and collects all distinct names.
author_list = set()
for paper in node_info:
    auths = re.sub(my_regex, "", paper[3])
    # Drop everything from a "(" that is still left after the substitution
    auths = auths.split("(")[0]

    cleaned = []
    for auth in auths.split(","):
        auth = re.sub(my_regex_2, ".", auth.strip())
        # Names of at most one character are discarded
        if len(auth) > 1:
            cleaned.append(auth)

    author_list.update(cleaned)
    paper[3] = ",".join(cleaned)

##### Import the graph
For this part we only need the undirected graph

In [None]:
# Collaboration graph: one node per cleaned author name. An edge joins two
# names that appear on the same paper and its weight counts how many times
# that pair was seen.
G_collab = nx.Graph()
G_collab.add_nodes_from(author_list)

for paper in node_info:
    names = paper[3].split(",")
    if len(names) < 2:
        # A single (or empty) author field yields no pair
        continue
    for first, second in itertools.combinations(names, 2):
        if not G_collab.has_edge(first, second):
            G_collab.add_edge(first, second, weight=1)
        else:
            G_collab[first][second]['weight'] += 1

In [None]:
# Number of nodes (distinct author names) in the collaboration graph.
G_collab.number_of_nodes()

##### Extract the features
In this part we consider a paper's neighbourhood to be the union of the neighbourhoods of all its authors.

In [None]:
# Paper ID -> row in node_info. setdefault keeps the first row of a
# duplicated ID, which is what IDs.index() would return, without scanning
# the whole list twice for every pair.
_row_of_id = {}
for _row, _paper_id in enumerate(IDs):
    _row_of_id.setdefault(_paper_id, _row)


def _authors_and_neighbours(paper_id):
    """Return the authors of a paper and the union of their neighbours.

    The authors are the non-empty names of the paper's comma-separated
    author field; the neighbours are taken from G_collab.
    """
    authors = [name
               for name in node_info[_row_of_id[paper_id]][3].split(",")
               if name != ""]
    neighbours = set()
    for name in authors:
        neighbours.update(G_collab.neighbors(name))
    return authors, neighbours


def _collaboration_features(pairs, progress_message):
    """Compute the collaboration-graph features for every pair of papers.

    Args:
        pairs: sequence of rows whose first two items are the source and
            target paper IDs.
        progress_message: text printed after the running count every
            1000 pairs.

    Returns:
        A tuple of four lists aligned with *pairs*: common neighbours,
        Jaccard coefficient, Adamic-Adar index, preferential attachment.

    Raises:
        KeyError: if a pair references an ID that is not in IDs.
    """
    comm_nei = []
    jaccard_coef = []
    adamic_adar = []
    pref_att = []

    for counter, data in enumerate(pairs, start=1):
        authorsA, neiborsA = _authors_and_neighbours(data[0])
        authorsB, neiborsB = _authors_and_neighbours(data[1])

        shared = neiborsA & neiborsB
        combined = neiborsA | neiborsB

        # Number of common neighbours
        comm_nei.append(len(shared))

        # Jaccard coefficient (0 when neither paper has any neighbour)
        if combined:
            jaccard_coef.append(len(shared) / float(len(combined)))
        else:
            jaccard_coef.append(0)

        # Preferential attachment
        pref_att.append(len(neiborsA) * len(neiborsB))

        # Adamic-Adar index, computed by hand because a paper stands for
        # all of its authors rather than for a single node.
        # NOTE(review): the sum runs over neiborsA & authorsB, whereas the
        # other features use neiborsA & neiborsB - confirm this is intended.
        aa = 0.
        for Z in neiborsA.intersection(authorsB):
            n_neighbours = len(G_collab[Z])
            # log(1) == 0, so names with a single neighbour are skipped
            if n_neighbours > 1:
                aa += 1. / np.log(n_neighbours)
        adamic_adar.append(aa)

        if counter % 1000 == 0:
            print(counter, progress_message)

    return comm_nei, jaccard_coef, adamic_adar, pref_att


(collab_comm_nei_out,
 collab_jaccard_coef_out,
 collab_aa_out,
 collab_pa_out) = _collaboration_features(
    full_training_data, "training examples processsed")

(collab_comm_nei_test_out,
 collab_jaccard_coef_test_out,
 collab_aa_test_out,
 collab_pa_test_out) = _collaboration_features(
    full_testing_data, "testing examples processsed")
    

##### Other features
Including: <br>
(21) Difference in publication year <br>
(22) Journal popularity of target <br>
(23) Is the same journal? <br>
(24) The number of common authors <br>
(25) Is self-cited? <br>

In [None]:
# For journal popularity: count how many papers name each journal ...
journal_dict = {}
for node in node_info:
    if node[4] != "":
        journal_dict[node[4]] = journal_dict.get(node[4], 0) + 1
# ... and give papers without a journal a popularity of zero.
journal_dict[""] = 0.

# Normalize the counts so that they sum to one. The guard avoids a
# ZeroDivisionError when no paper names a journal.
_n_journal_papers = sum(journal_dict.values())
if _n_journal_papers:
    factor = 1.0 / _n_journal_papers
    for k in journal_dict:
        journal_dict[k] = journal_dict[k] * factor

# Paper ID -> row in node_info. setdefault keeps the first row of a
# duplicated ID, which is what IDs.index() would return, without scanning
# the whole list twice for every pair.
_row_of_id = {}
for _row, _paper_id in enumerate(IDs):
    _row_of_id.setdefault(_paper_id, _row)


def _author_set(author_field):
    """Return the set of non-empty names in a comma-separated author field.

    An empty field yields an empty set; "".split(",") alone would yield
    [""] and make two papers without authors look like they share one.
    """
    return {name for name in author_field.split(",") if name}


def _metadata_features(pairs, progress_message):
    """Compute the year, journal and author features for every pair.

    Args:
        pairs: sequence of rows whose first two items are the source and
            target paper IDs.
        progress_message: text printed after the running count every
            1000 pairs.

    Returns:
        A tuple of five lists aligned with *pairs*: year difference
        (source minus target), journal popularity of the target,
        same-journal flag, number of common authors, self-citation flag.

    Raises:
        KeyError: if a pair references an ID that is not in IDs.
    """
    temp_diff = []
    journal_pop = []
    same_journ = []
    comm_auth = []
    self_cite = []

    for counter, data in enumerate(pairs, start=1):
        source_info = node_info[_row_of_id[data[0]]]
        target_info = node_info[_row_of_id[data[1]]]

        # Year's difference
        temp_diff.append(int(source_info[1]) - int(target_info[1]))

        # Journal's popularity of target
        journal_pop.append(journal_dict[target_info[4]])

        # Is same journal ?
        # NOTE(review): two papers with an empty journal field compare
        # equal here - confirm that this is the intended signal.
        same_journ.append(int(source_info[4] == target_info[4]))

        # Number of common authors
        n_common = len(_author_set(source_info[3]) & _author_set(target_info[3]))
        comm_auth.append(n_common)

        # Is self-cite?
        self_cite.append(int(n_common > 0))

        if counter % 1000 == 0:
            print(counter, progress_message)

    return temp_diff, journal_pop, same_journ, comm_auth, self_cite


(temp_diff_out,
 journal_pop_out,
 same_journ_out,
 comm_auth_out,
 self_cite_out) = _metadata_features(
    full_training_data, "training examples processsed")

(temp_diff_test_out,
 journal_pop_test_out,
 same_journ_test_out,
 comm_auth_test_out,
 self_cite_test_out) = _metadata_features(
    full_testing_data, "testing examples processsed")

### Load all the features

In [None]:
# Feature lists of the labelled pairs; each one becomes a column of the
# matrix, so a row describes one pair.
training_columns = [
    cosine_title_out,
    cosine_abstract_out,
    overlap_title_out,
    overlap_abstract_out,
    overlap_title_in_abstract_out,
    common_nei_out,
    jaccard_coef_out,
    aa_index_out,
    pref_att_out,
    btw_diff_out,
    in_link_out,
    num_cite_out,
    pr_source_out,
    pr_target_out,
    shortest_path_out,
    same_cluster_out,
    collab_comm_nei_out,
    collab_jaccard_coef_out,
    collab_aa_out,
    collab_pa_out,
    temp_diff_out,
    journal_pop_out,
    same_journ_out,
    comm_auth_out,
    self_cite_out,
]
features_data = np.column_stack(training_columns)

# Standardise every column before it is fed to the classifiers
features_data = preprocessing.scale(features_data)
print(features_data.shape)

## IV. Running the model
We will run our model.
### Define train set and test set

In [None]:
# The rows of features_data follow the order of full_training_data, so the
# cut-off used for train_set / test_set also separates their feature rows.
X_train, X_test = features_data[:n_split], features_data[n_split:]

# The label is the third field of each pair and is stored as text
y_train = [int(pair[2]) for pair in train_set]
y_test = [int(pair[2]) for pair in test_set]

### Sklearn classifiers
Run the following code if you want to use the sklearn classifiers.

In [None]:
# Optional alternative to the Keras model: this cell is kept commented out.
# Uncomment it to train and evaluate one of the scikit-learn classifiers on
# X_train / X_test instead.
# NOTE: the value printed under the label "precision" is accuracy_score.
# # Choose the classifier
# #classifier = svm.LinearSVC()
# #classifier = LogisticRegression()
# #classifier = AdaBoostClassifier()
# classifier = ExtraTreesClassifier()
# #classifier = RandomForestClassifier(n_jobs=1, n_estimators=500, criterion="entropy", max_features="log2", max_depth=10)

# # Fit the model
# start_time = time.time()
# classifier.fit(X_train,y_train)
# rtime = time.time() - start_time
# print("Training completed in ", rtime, " seconds.")

# # Export the result
# y_pred = list(classifier.predict(X_test))

# print("precision: ",accuracy_score(y_test, y_pred))
# print("f1 score: ",f1_score(y_test,y_pred , average='binary'))

### Using Keras NN model
In this case we use a sequential model with 3 hidden layers, the 'relu' activation function and an L2 regularizer.

In [None]:
# Define the model: three ReLU hidden layers with a light L2 penalty on
# their weights, followed by a single sigmoid output unit.
n_features = features_data.shape[1]
n_h1 = 30
n_h2 = 30
n_h3 = 30

# `units` is the Keras 2 name of the layer width; the Keras 1 keyword
# `output_dim` does not belong next to `kernel_regularizer`. Only the first
# layer needs the input size, the following ones infer it.
model = Sequential([
    Dense(units=n_h1, input_dim=n_features, activation='relu', kernel_regularizer=regularizers.l2(0.00001)),
    Dense(units=n_h2, activation='relu', kernel_regularizer=regularizers.l2(0.00001)),
    Dense(units=n_h3, activation='relu', kernel_regularizer=regularizers.l2(0.00001)),
    Dense(units=1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Set callback functions to early stop training and save the best model so far
# NOTE(review): some Keras releases report this metric as 'val_accuracy'
# rather than 'val_acc' - if no checkpoint file is written, check the name.
callbacks = [#EarlyStopping(monitor='val_loss', patience=2),
             ModelCheckpoint(filepath='keras_best_model.hdf5', monitor='val_acc', save_best_only=True)
            ]

# Fit the model. The labels are plain Python lists; they are converted to
# arrays because not every Keras version accepts lists as targets.
# NOTE(review): the held-out set is also used to pick the best checkpoint,
# so the scores printed below are optimistic.
trained_model = model.fit(X_train, np.asarray(y_train), epochs=50,
                          callbacks=callbacks,
                          batch_size=128,
                          validation_data=(X_test, np.asarray(y_test)))

# Restore the weights of the best checkpoint saved during training
model.load_weights("keras_best_model.hdf5")

# Export the result: round the predicted probabilities to 0/1 labels
y_pred = np.rint(model.predict(X_test)).ravel().astype(int)

# accuracy_score is an accuracy, so it is labelled as such
print("accuracy: ",accuracy_score(y_test, y_pred))
print("f1 score: ",f1_score(y_test,y_pred , average='binary'))

## V. Export the final result on the testing data to csv
### Get the features on the testing data

In [None]:
# Feature lists of the pairs to classify; each one becomes a column, in
# the same order as the columns of the training matrix.
testing_data_features = np.column_stack([
    cosine_title_test_out,
    cosine_abstract_test_out,
    overlap_title_test_out,
    overlap_abstract_test_out,
    overlap_title_in_abstract_test_out,
    common_nei_test_out,
    jaccard_coef_test_out,
    aa_index_test_out,
    pref_att_test_out,
    btw_diff_test_out,
    in_link_test_out,
    num_cite_test_out,
    pr_source_test_out,
    pr_target_test_out,
    shortest_path_test_out,
    same_cluster_test_out,
    collab_comm_nei_test_out,
    collab_jaccard_coef_test_out,
    collab_aa_test_out,
    collab_pa_test_out,
    temp_diff_test_out,
    journal_pop_test_out,
    same_journ_test_out,
    comm_auth_test_out,
    self_cite_test_out,
])

# Preprocess the features.
# The model was trained on features standardised with the mean and standard
# deviation of the training pairs, so the testing pairs must go through the
# same transformation. Scaling them with their own statistics would shift
# every column relative to what the model has seen. The unscaled training
# matrix is rebuilt here only to recover those statistics.
_training_raw = np.column_stack([
    cosine_title_out,
    cosine_abstract_out,
    overlap_title_out,
    overlap_abstract_out,
    overlap_title_in_abstract_out,
    common_nei_out,
    jaccard_coef_out,
    aa_index_out,
    pref_att_out,
    btw_diff_out,
    in_link_out,
    num_cite_out,
    pr_source_out,
    pr_target_out,
    shortest_path_out,
    same_cluster_out,
    collab_comm_nei_out,
    collab_jaccard_coef_out,
    collab_aa_out,
    collab_pa_out,
    temp_diff_out,
    journal_pop_out,
    same_journ_out,
    comm_auth_out,
    self_cite_out,
])
_training_scaler = preprocessing.StandardScaler().fit(_training_raw)
del _training_raw  # only the fitted statistics are needed from here on

testing_data_features = _training_scaler.transform(testing_data_features)
print(testing_data_features.shape)

### Run the model on the features
#### For sklearn classifiers

In [None]:
# y_pred_final = list(classifier.predict(testing_data_features))

#### For Keras

In [None]:
# Predicted probabilities for the testing pairs, rounded to 0/1 labels and
# flattened to a one-dimensional integer array.
testing_probabilities = model.predict(testing_data_features)
y_pred_final = np.rint(testing_probabilities).ravel().astype(int)

### Export the final result to the csv file 

In [None]:
# One "id,category" row per testing pair; the id is the position of the
# pair in the testing features.
final_prediction = zip(range(len(testing_data_features)), y_pred_final)

# newline="" is what the csv module documentation asks for when writing:
# the writer emits its own line endings, and letting the file object
# translate them again produces blank rows on some platforms.
with open("output/my_predictions.csv", "w", newline="") as pred1:
    csv_out = csv.writer(pred1)
    csv_out.writerow(["id","category"])
    csv_out.writerows(final_prediction)