In [0]:
import pandas as pd
import numpy as np
import csv
import random
import networkx as nx
from sklearn.linear_model import LogisticRegression
from operator import itemgetter

Section 1: Using Node IDs, Degree of each node pair and common neighbors between node pairs as features

In [0]:
#Code to Create Adjacency Matrix from training data.

all_nodes = []
matrix = np.zeros((4085,4085), dtype = int)
train = open("train.txt", "r")
for line in train:
  lis = line.split()
  all_nodes.append(int(lis[0]))
  for i in range(1, len(lis)):
    matrix[int(lis[0])][int(lis[i])] = 1
    matrix[int(lis[i])][int(lis[0])] = 1

np.savetxt('output.csv', matrix, delimiter=",")
all_nodes = list(set(all_nodes))
train.close()

In [70]:
print(len(all_nodes))
print(len(matrix))

4016
4085


In [0]:
#Read from training data as an adjacency list and add missing nodes manually
g = nx.read_adjlist("train.txt")
g.add_nodes_from([32, 1611, 2008, 2451, 2692, 2837])

In [0]:
#Extract all Negative Samples
negs = []
nodes = g.nodes
for i in range(len(matrix)):
  for j in range(i, len(matrix[i])):
    if i!= j and matrix[i][j] == 0 and str(i) in nodes and str(j) in nodes:
        negs.append([str(i), str(j), g.degree[str(i)], g.degree[str(j)], len(sorted(nx.common_neighbors(g, str(i), str(j))))])
negs = sorted(negs, key = itemgetter(4))

In [73]:
print(len(negs))

8035185


In [0]:
#Shuffle all negative samples
shuffled = random.sample(negs, len(negs))

In [0]:
#Choose around 20 times number of positive samples ~ 540K as negative samples after shuffling
negatives = pd.DataFrame.from_records(shuffled[:540000])

In [76]:
#Store all edge pairs from training data
edges = (g.edges)
print(len(edges))

26937


In [77]:
#Extracting Positive samples into a dataframe

lst = [[]*26937]
j = 0
for i in edges:
  lst.append([])
  node1 = i[0]
  node2 = i[1]
  lst[j].extend([node1, node2, g.degree[node1], g.degree[node2], len(sorted(nx.common_neighbors(g, node1, node2)))])
  j+=1
positives = pd.DataFrame.from_records(lst)
positives.drop(26937,inplace = True)
positives

Unnamed: 0,0,1,2,3,4
0,0,2860,2.0,2.0,1.0
1,0,3117,2.0,3.0,1.0
2,2860,3117,2.0,3.0,1.0
3,3117,2302,3.0,5.0,0.0
4,1,318,12.0,6.0,4.0
...,...,...,...,...,...
26932,2643,2950,2.0,2.0,1.0
26933,2678,3501,2.0,2.0,1.0
26934,2753,3443,3.0,3.0,2.0
26935,2991,3991,2.0,3.0,0.0


In [78]:
#Concatenate positive and negative samples into a single data frame
data = pd.concat([positives, negatives], ignore_index=True)
data

Unnamed: 0,0,1,2,3,4
0,0,2860,2.0,2.0,1.0
1,0,3117,2.0,3.0,1.0
2,2860,3117,2.0,3.0,1.0
3,3117,2302,3.0,5.0,0.0
4,1,318,12.0,6.0,4.0
...,...,...,...,...,...
566932,2394,3592,4.0,19.0,0.0
566933,1591,3330,6.0,8.0,0.0
566934,1628,2063,10.0,34.0,0.0
566935,1432,1739,34.0,5.0,0.0


In [79]:
#Manually creating labels for positive and negative samples
pos = ['1']*26937
neg = ['0']*540000
total = pos + neg

labels = pd.DataFrame.from_records(total)
labels

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
566932,0
566933,0
566934,0
566935,0


In [80]:
#Fitting the data to the logistic regression model
lr = LogisticRegression(class_weight="balanced")
lr.fit(data, labels)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [81]:
#Reading test data

with open('test-public.csv', newline='') as f:
    reader = csv.reader(f)
    test_data = list(reader)
test_data.pop(0)
print(test_data)

[['1', '1', '4021'], ['2', '2', '3795'], ['3', '3', '1627'], ['4', '4', '2206'], ['5', '4', '2575'], ['6', '4', '3952'], ['7', '5', '1262'], ['8', '5', '2717'], ['9', '6', '3305'], ['10', '6', '3880'], ['11', '9', '424'], ['12', '9', '1561'], ['13', '9', '4080'], ['14', '10', '3993'], ['15', '22', '249'], ['16', '22', '705'], ['17', '22', '2238'], ['18', '22', '2897'], ['19', '22', '4059'], ['20', '24', '693'], ['21', '24', '772'], ['22', '25', '240'], ['23', '26', '1135'], ['24', '28', '3847'], ['25', '29', '1624'], ['26', '30', '811'], ['27', '32', '2671'], ['28', '32', '2977'], ['29', '35', '3145'], ['30', '37', '2740'], ['31', '38', '1032'], ['32', '45', '543'], ['33', '45', '727'], ['34', '45', '1399'], ['35', '45', '1580'], ['36', '45', '2194'], ['37', '45', '2593'], ['38', '45', '2713'], ['39', '45', '3370'], ['40', '45', '3542'], ['41', '45', '3815'], ['42', '46', '1382'], ['43', '47', '2963'], ['44', '49', '358'], ['45', '51', '201'], ['46', '51', '428'], ['47', '51', '1222'],

In [82]:
#Adding corresponding features for test data
for i in test_data:
  node1 = i[1]
  node2 = i[2]

  if g.has_node(node1):
    i.append(g.degree[node1])
  else:
      i.append(0)
  if g.has_node(node2):
    i.append(g.degree[node2])
  else:
    i.append(0)
  
  if g.has_node(node1) and g.has_node(node2):
    i.append(len(sorted(nx.common_neighbors(g, node1, node2))))
  else:
    i.append(0)
  
print(test_data)

[['1', '1', '4021', 12, 3, 1], ['2', '2', '3795', 6, 51, 3], ['3', '3', '1627', 15, 5, 0], ['4', '4', '2206', 74, 3, 2], ['5', '4', '2575', 74, 3, 1], ['6', '4', '3952', 74, 39, 3], ['7', '5', '1262', 8, 78, 4], ['8', '5', '2717', 8, 62, 6], ['9', '6', '3305', 47, 6, 2], ['10', '6', '3880', 47, 27, 1], ['11', '9', '424', 47, 11, 3], ['12', '9', '1561', 47, 93, 2], ['13', '9', '4080', 47, 2, 0], ['14', '10', '3993', 55, 29, 4], ['15', '22', '249', 72, 48, 4], ['16', '22', '705', 72, 67, 51], ['17', '22', '2238', 72, 149, 52], ['18', '22', '2897', 72, 3, 0], ['19', '22', '4059', 72, 41, 1], ['20', '24', '693', 7, 5, 0], ['21', '24', '772', 7, 19, 2], ['22', '25', '240', 3, 8, 0], ['23', '26', '1135', 92, 46, 2], ['24', '28', '3847', 12, 9, 0], ['25', '29', '1624', 9, 3, 3], ['26', '30', '811', 12, 13, 0], ['27', '32', '2671', 0, 99, 0], ['28', '32', '2977', 0, 9, 0], ['29', '35', '3145', 2, 3, 0], ['30', '37', '2740', 4, 9, 2], ['31', '38', '1032', 13, 8, 0], ['32', '45', '543', 102, 55,

In [83]:
#Putting Test data vectors into a data frame
test = pd.DataFrame.from_records(test_data)
test.drop(0, inplace = True, axis = 1)
test.reset_index
test

Unnamed: 0,1,2,3,4,5
0,1,4021,12,3,1
1,2,3795,6,51,3
2,3,1627,15,5,0
3,4,2206,74,3,2
4,4,2575,74,3,1
...,...,...,...,...,...
1995,3864,3970,14,43,0
1996,3884,3921,25,9,0
1997,3946,3956,10,8,0
1998,3956,3969,8,5,1


In [0]:
#Calculating predictions as probabilities on the test data using created model
predictions = lr.predict_proba(test)

In [0]:
#Extracting predictions of the 1's class
predict = predictions[:,1].tolist()

In [0]:
#Creating IDs for output file
id = []
for i in range(1,2001):
  id.append(i)

In [87]:
#Putting predictions into a dataframe
pred = pd.DataFrame()
pred["Id"] = id
pred["Predicted"] = predict
pred

Unnamed: 0,Id,Predicted
0,1,0.160109
1,2,0.871417
2,3,0.048415
3,4,0.946296
4,5,0.391692
...,...,...
1995,1996,0.017066
1996,1997,0.108700
1997,1998,0.113085
1998,1999,0.755733


In [0]:
#Writing predictions data frame to output file
pred.to_csv("preds.csv", index = False)

Section 2: Using features from nodes.json as added features to above model

In [0]:
#Reading from nodes.json
import json
with open('nodes.json') as f:
  features = json.load(f)

In [90]:
#Creating a revised dictionary with keys as the IDs
reformed = {}
for i in features:
  reformed[str(i['id'])] = i
print(reformed)

{'0': {'first': 4, 'id': 0, 'keyword_0': 1, 'keyword_10': 1, 'keyword_13': 1, 'keyword_15': 1, 'keyword_2': 1, 'keyword_33': 1, 'keyword_34': 1, 'keyword_37': 1, 'keyword_38': 1, 'keyword_41': 1, 'keyword_43': 1, 'keyword_46': 1, 'keyword_49': 1, 'keyword_50': 1, 'keyword_52': 1, 'keyword_6': 1, 'keyword_8': 1, 'last': 0, 'num_papers': 6, 'venue_13': 1, 'venue_281': 1, 'venue_5': 1, 'venue_87': 1}, '1': {'first': 9, 'id': 1, 'keyword_0': 1, 'keyword_1': 1, 'keyword_10': 1, 'keyword_11': 1, 'keyword_13': 1, 'keyword_15': 1, 'keyword_17': 1, 'keyword_18': 1, 'keyword_2': 1, 'keyword_26': 1, 'keyword_27': 1, 'keyword_29': 1, 'keyword_31': 1, 'keyword_36': 1, 'keyword_37': 1, 'keyword_4': 1, 'keyword_40': 1, 'keyword_41': 1, 'keyword_42': 1, 'keyword_43': 1, 'keyword_46': 1, 'keyword_48': 1, 'keyword_49': 1, 'keyword_6': 1, 'keyword_7': 1, 'keyword_8': 1, 'last': 1, 'num_papers': 16, 'venue_1': 1, 'venue_13': 1, 'venue_2': 1, 'venue_27': 1, 'venue_40': 1, 'venue_5': 1, 'venue_57': 1, 'venu

In [0]:
#Creating a list of vectors of 0-52 keywords for each author
keywords = []
j = 0
for i in reformed.keys():
  keywords.append([])
  for k in range(53):
    if (('keyword_' + str(k)) in reformed[i]):
      keywords[j].append(1)
    else:
      keywords[j].append(0)
  j+=1    


In [0]:
#Creating a list of vectors of 0-347 venues for each author
venues = []
j = 0
for i in reformed.keys():
  venues.append([])
  for k in range(348):
    if (("venue_" + str(k)) in reformed[i]):
      venues[j].append(1)
    else:
      venues[j].append(0)
  j+=1    


In [0]:
#Positive samples with common keywords and common venues as added features

lst = [[]*26937]
j = 0
for i in edges:
  Vcount = 0
  Kcount = 0
  lst.append([])
  node1 = i[0]
  node2 = i[1]
  lst[j].extend([node1, node2, g.degree[node1], g.degree[node2], len(sorted(nx.common_neighbors(g, node1, node2)))])
  
  for k in range(348):
    if venues[int(node1)][k] == 1 and venues[int(node2)][k] == 1:
      Vcount += 1
  
  for k in range(53):
    if keywords[int(node1)][k] == 1 and keywords[int(node2)][k] == 1:
      Kcount += 1

  lst[j].append(Kcount)
  lst[j].append(Vcount)
  
  j+=1
positives = pd.DataFrame.from_records(lst)
positives.drop(26937,inplace = True)


In [0]:
#Negative samples with Common Keywords and Common Venues as added features

negs = []
nodes = g.nodes
for i in range(len(matrix)):
  for j in range(i, len(matrix[i])):
    if i!= j and matrix[i][j] == 0 and str(i) in nodes and str(j) in nodes:
      Vcount = 0
      Kcount = 0
      
      for k in range(348):
        if venues[i][k] == 1 and venues[j][k] == 1:
          Vcount += 1

      for k in range(53):
        if keywords[i][k] == 1 and keywords[j][k] == 1:
          Kcount += 1
      

      negs.append([str(i), str(j), g.degree[str(i)], g.degree[str(j)], len(sorted(nx.common_neighbors(g, str(i), str(j)))), Kcount, Vcount])
negs = sorted(negs, key = itemgetter(4))

In [95]:
shuffled = random.sample(negs, len(negs))
print(len(shuffled))

8035185


In [0]:
negatives = pd.DataFrame.from_records(shuffled[:540000])

In [97]:
data = pd.concat([positives, negatives], ignore_index=True)
data

Unnamed: 0,0,1,2,3,4,5,6
0,0,2860,2.0,2.0,1.0,14.0,1.0
1,0,3117,2.0,3.0,1.0,11.0,2.0
2,2860,3117,2.0,3.0,1.0,9.0,1.0
3,3117,2302,3.0,5.0,0.0,16.0,5.0
4,1,318,12.0,6.0,4.0,17.0,3.0
...,...,...,...,...,...,...,...
566932,1456,3515,12.0,19.0,0.0,15.0,3.0
566933,269,2058,2.0,13.0,0.0,10.0,0.0
566934,661,3117,7.0,3.0,0.0,13.0,0.0
566935,1340,1470,2.0,14.0,0.0,15.0,1.0


In [98]:
#Reading Test data
with open('test-public.csv', newline='') as f:
    reader = csv.reader(f)
    test_data = list(reader)
test_data.pop(0)
print(test_data)

[['1', '1', '4021'], ['2', '2', '3795'], ['3', '3', '1627'], ['4', '4', '2206'], ['5', '4', '2575'], ['6', '4', '3952'], ['7', '5', '1262'], ['8', '5', '2717'], ['9', '6', '3305'], ['10', '6', '3880'], ['11', '9', '424'], ['12', '9', '1561'], ['13', '9', '4080'], ['14', '10', '3993'], ['15', '22', '249'], ['16', '22', '705'], ['17', '22', '2238'], ['18', '22', '2897'], ['19', '22', '4059'], ['20', '24', '693'], ['21', '24', '772'], ['22', '25', '240'], ['23', '26', '1135'], ['24', '28', '3847'], ['25', '29', '1624'], ['26', '30', '811'], ['27', '32', '2671'], ['28', '32', '2977'], ['29', '35', '3145'], ['30', '37', '2740'], ['31', '38', '1032'], ['32', '45', '543'], ['33', '45', '727'], ['34', '45', '1399'], ['35', '45', '1580'], ['36', '45', '2194'], ['37', '45', '2593'], ['38', '45', '2713'], ['39', '45', '3370'], ['40', '45', '3542'], ['41', '45', '3815'], ['42', '46', '1382'], ['43', '47', '2963'], ['44', '49', '358'], ['45', '51', '201'], ['46', '51', '428'], ['47', '51', '1222'],

In [99]:
#Adding corresponding features to test data
for i in test_data:
  node1 = i[1]
  node2 = i[2]
  Vcount = 0
  Kcount = 0
  if g.has_node(node1):
    i.append(g.degree[node1])
  else:
      i.append(0)
  if g.has_node(node2):
    i.append(g.degree[node2])
  else:
    i.append(0)
  if g.has_node(node1) and g.has_node(node2):
    i.append(len(sorted(nx.common_neighbors(g, node1, node2))))
  else:
    i.append(0)
  
  for k in range(53):
    if keywords[int(node1)][k] == 1 and keywords[int(node2)][k] == 1:
      Kcount += 1

  for k in range(348):
    if venues[int(node1)][k] == 1 and venues[int(node2)][k] == 1:
      Vcount += 1

  i.append(Kcount)
  i.append(Vcount)
  
print(test_data)

[['1', '1', '4021', 12, 3, 1, 14, 1], ['2', '2', '3795', 6, 51, 3, 11, 0], ['3', '3', '1627', 15, 5, 0, 4, 3], ['4', '4', '2206', 74, 3, 2, 17, 4], ['5', '4', '2575', 74, 3, 1, 13, 2], ['6', '4', '3952', 74, 39, 3, 33, 10], ['7', '5', '1262', 8, 78, 4, 11, 2], ['8', '5', '2717', 8, 62, 6, 9, 2], ['9', '6', '3305', 47, 6, 2, 12, 1], ['10', '6', '3880', 47, 27, 1, 18, 0], ['11', '9', '424', 47, 11, 3, 30, 5], ['12', '9', '1561', 47, 93, 2, 30, 2], ['13', '9', '4080', 47, 2, 0, 11, 2], ['14', '10', '3993', 55, 29, 4, 10, 2], ['15', '22', '249', 72, 48, 4, 28, 5], ['16', '22', '705', 72, 67, 51, 31, 7], ['17', '22', '2238', 72, 149, 52, 32, 9], ['18', '22', '2897', 72, 3, 0, 7, 2], ['19', '22', '4059', 72, 41, 1, 32, 7], ['20', '24', '693', 7, 5, 0, 6, 1], ['21', '24', '772', 7, 19, 2, 11, 3], ['22', '25', '240', 3, 8, 0, 5, 0], ['23', '26', '1135', 92, 46, 2, 21, 6], ['24', '28', '3847', 12, 9, 0, 16, 3], ['25', '29', '1624', 9, 3, 3, 12, 1], ['26', '30', '811', 12, 13, 0, 16, 3], ['27', 

In [100]:
#Adding test data to a dataframe
test = pd.DataFrame.from_records(test_data)
test.drop(0, inplace = True, axis = 1)
test.reset_index

<bound method DataFrame.reset_index of          1     2   3   4  5   6  7
0        1  4021  12   3  1  14  1
1        2  3795   6  51  3  11  0
2        3  1627  15   5  0   4  3
3        4  2206  74   3  2  17  4
4        4  2575  74   3  1  13  2
...    ...   ...  ..  .. ..  .. ..
1995  3864  3970  14  43  0   9  2
1996  3884  3921  25   9  0  10  3
1997  3946  3956  10   8  0  12  0
1998  3956  3969   8   5  1  13  1
1999  4024  4059  24  41  0  18  3

[2000 rows x 7 columns]>

In [101]:
#Fitting the feature vectors to the logistic regression model and extracting predictions of 1's class
lr = LogisticRegression(class_weight="balanced", verbose = True)
lr.fit(data, labels)
predictions = lr.predict_proba(test)
predict = predictions[:,1].tolist()


  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.1s finished


In [0]:
#Creating output prediction file
pred = pd.DataFrame()
pred["Id"] = id
pred["Predicted"] = predict
pred
pred.to_csv("preds.csv", index = False)