In [1]:
import csv
import time
import os
import random
import numpy as np
import pandas as pd
import pickle as pk
import networkx as nx
from sklearn.model_selection import train_test_split

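#### Pose the problem as a supervised learning classification problem
Generate bad links (negative examples): node pairs that are not connected by an edge in the graph and whose shortest path length is greater than 2 (or that have no path between them at all).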

In [2]:
### Generating bad (negative) edges from the given graph
#
# A "bad" edge is a random ordered node pair (a, b) with a != b that is not an
# edge of the graph and whose shortest path a -> b is longer than 2 hops, or
# does not exist at all. The result is cached in data/missing_edges.p because
# sampling is slow.
#
# NOTE(review): sampling is unseeded, so a regenerated cache will contain a
# different set of pairs; seed `random` before this loop if reproducibility
# of the negatives matters.

# Number of negative pairs to draw.
N_MISSING_EDGES = 9437519
# Node ids are sampled uniformly from 1..MAX_NODE_ID (inclusive).
# Presumably the largest node id in the data set -- TODO confirm against EDA.
MAX_NODE_ID = 1862220

if not os.path.isfile('data/missing_edges.p'):
    if not os.path.isfile('data/new_train.csv'):
        # Without the edge list nothing can be generated; `missing_edges` stays undefined.
        print("Please Run the first 3 cells of EDA.ipynb to Generate the required files for further processing")
    else:
        graph = nx.read_edgelist('data/new_train.csv', delimiter=',', create_using=nx.DiGraph(), nodetype=int)

        # Lookup table of existing edges. Keys are converted to int because the
        # sampled (a, b) pairs below are ints; the raw strings yielded by
        # csv.reader would never compare equal to them, which made this check a no-op.
        edges = dict()
        with open('data/new_train.csv', 'r', newline='') as edge_file:
            for edge in csv.reader(edge_file):
                if len(edge) < 2:
                    # Skip blank / malformed rows instead of failing on edge[1].
                    continue
                edges[(int(edge[0]), int(edge[1]))] = 1

        missing_edges = set()
        while len(missing_edges) < N_MISSING_EDGES:
            a = random.randint(1, MAX_NODE_ID)
            b = random.randint(1, MAX_NODE_ID)
            if a == b or (a, b) in edges:
                continue
            try:
                if nx.shortest_path_length(graph, source=a, target=b) > 2:
                    missing_edges.add((a, b))
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                # No path a -> b, or one endpoint is not in the graph: the pair
                # is certainly not linked, so it is a valid negative example.
                # Only these two errors are caught so that KeyboardInterrupt
                # and real bugs are no longer silently swallowed.
                missing_edges.add((a, b))

        with open('data/missing_edges.p', 'wb') as cache_file:
            pk.dump(missing_edges, cache_file)
else:
    # NOTE: pickle.load executes arbitrary code from the file; only load a
    # cache that this notebook produced itself.
    with open('data/missing_edges.p', 'rb') as cache_file:
        missing_edges = pk.load(cache_file)

##### Train and Test split

In [6]:
# Train/test split of the positive (original) and negative (generated) links.
#
# First run: split both link sets 80:20 and write the four CSV files.
# Later runs (both *_orig.csv files already present): build the train and test
# graphs from the cached positive links and report how their node sets overlap.
#
# The split is regenerated unless BOTH cached files exist; previously a single
# leftover file sent execution to the load branch, which then failed on the
# missing one.
orig_split_cached = os.path.isfile('data/train_orig.csv') and os.path.isfile('data/test_orig.csv')

if not orig_split_cached:
    #reading total data df
    df_orig = pd.read_csv('data/train.csv')
    df_generated = pd.DataFrame(list(missing_edges), columns=['source_node', 'destination_node'])

    print("Number of nodes in the graph with edges", df_orig.shape[0])
    print("Number of nodes in the graph without edges", df_generated.shape[0])

    #Split data into Train: Test by 80:20
    #split original links and generated links separately because
    #only original data is required for creating graph and feature generation
    # Labels: 1 for original links, 0 for generated (missing) links.
    x_train_orig, x_test_orig, y_train_orig, y_test_orig = train_test_split(
        df_orig, np.ones(len(df_orig)), test_size=0.2, random_state=20)
    x_train_gener, x_test_gener, y_train_gener, y_test_gener = train_test_split(
        df_generated, np.zeros(len(df_generated)), test_size=0.2, random_state=20)

    print('='*55)
    print("Number of nodes in the train data graph with edges", x_train_orig.shape[0],"=",y_train_orig.shape[0])
    print("Number of nodes in the train data graph without edges", x_train_gener.shape[0],"=", y_train_gener.shape[0])
    print('='*55)
    print("Number of nodes in the test data graph with edges", x_test_orig.shape[0],"=",y_test_orig.shape[0])
    print("Number of nodes in the test data graph without edges", x_test_gener.shape[0],"=",y_test_gener.shape[0])

    # Written without header/index so the files can be read back as plain edge lists.
    x_train_orig.to_csv('data/train_orig.csv',header=False, index=False)
    x_test_orig.to_csv('data/test_orig.csv',header=False, index=False)
    x_train_gener.to_csv('data/train_gener.csv',header=False, index=False)
    x_test_gener.to_csv('data/test_gener.csv',header=False, index=False)

else:
    # Graphs are built from the original (positive) links only.
    # The generated negatives are not needed in this branch; free them if they
    # are loaded. Guarded so that re-running the cell does not raise NameError.
    try:
        del missing_edges
    except NameError:
        pass

    train_graph=nx.read_edgelist('data/train_orig.csv',delimiter=',',create_using=nx.DiGraph(),nodetype=int)
    test_graph=nx.read_edgelist('data/test_orig.csv',delimiter=',',create_using=nx.DiGraph(),nodetype=int)

    # nx.info() was removed in NetworkX 3.0; print the summary explicitly so
    # the cell works on both old and new versions.
    for graph_label, split_graph in (('train', train_graph), ('test', test_graph)):
        print('{} graph: {} with {} nodes and {} edges'.format(
            graph_label,
            type(split_graph).__name__,
            split_graph.number_of_nodes(),
            split_graph.number_of_edges()))

    # get unique nodes in test/train graphs
    # (set() accepts a single iterable; the stray '\n' argument raised TypeError)
    train_nodes = set(train_graph.nodes())
    test_nodes = set(test_graph.nodes())

    com_people = len(train_nodes.intersection(test_nodes))
    only_train_people = len(train_nodes - test_nodes)
    only_test_people = len(test_nodes - train_nodes)

    print(com_people,'\n people common in train & test -- ')
    print(only_train_people,' people only present in train')

    print(only_test_people,' people present only in test')
    # Share of test-graph nodes that never appear in the train graph.
    print('People in Test are {} %'.format(only_test_people/len(test_nodes)*100))

Number of nodes in the graph with edges 9437519
Number of nodes in the graph without edges 9437519
Number of nodes in the train data graph with edges 7550015 = 7550015
Number of nodes in the train data graph without edges 7550015 = 7550015
Number of nodes in the test data graph with edges 1887504 = 1887504
Number of nodes in the test data graph without edges 1887504 = 1887504


In [10]:
#final train and test data sets
#
# Stack the positive and negative splits into one feature frame and one label
# vector per split, then cache them as CSV. The work is skipped as soon as any
# of the four output files already exists.
#
# Requires x_*_orig / x_*_gener / y_* from the split cell, which defines them
# only in its generating branch (not when it loads a cached split).
final_dataset_files = ('data/train_x.csv', 'data/test_x.csv', 'data/train_y.csv', 'data/test_y.csv')

if not any(os.path.isfile(path) for path in final_dataset_files):

    # DataFrame.append was removed in pandas 2.0; pd.concat with
    # ignore_index=True yields the same frame with a fresh 0..n-1 index.
    X_train = pd.concat([x_train_orig, x_train_gener], ignore_index=True)
    y_train = np.concatenate((y_train_orig,y_train_gener))
    X_test = pd.concat([x_test_orig, x_test_gener], ignore_index=True)
    y_test = np.concatenate((y_test_orig,y_test_gener))

    print("Total data points in training data",X_train.shape)
    print("Total data points in testing data",X_test.shape)
    print("Shape of traget variable in train",y_train.shape)
    print("Shape of traget variable in test", y_test.shape)

    X_train.to_csv('data/train_x.csv',header=False,index=False)
    X_test.to_csv('data/test_x.csv',header=False,index=False)
    # Labels are float arrays (np.ones / np.zeros); store them as ints.
    pd.DataFrame(y_train.astype(int)).to_csv('data/train_y.csv',header=False,index=False)
    pd.DataFrame(y_test.astype(int)).to_csv('data/test_y.csv',header=False,index=False)

Total data points in training data (15100030, 2)
Total data points in testing data (3775008, 2)
Shape of traget variable in train (15100030,)
Shape of traget variable in test (3775008,)
