In [1]:
import networkx as nx
# https://networkx.github.io/documentation/stable/reference/index.html
import matplotlib.pyplot as plt
from matplotlib import pylab
import numpy as np
import pandas as pd
from collections import Counter

# Which nodes' profile do we have to fill?

In [4]:
# Sub-problem: fill the profiles of 'empty' nodes
df_empty = pd.read_csv('./challenge1/empty.csv', sep='\t', header='infer')


In [5]:
df_empty.shape

(475, 1)

In [6]:
# Get a list from a Series
empty = df_empty['name'].drop_duplicates().tolist()
len(empty)

475

# Profil data understanding

In [23]:
# We use pandas dataframes to load attributes
# Nodes are characterized by a list of colleges, a list of employers, a list of location
# The order means nothing. Nothing allow us to determine the current employer...
df_e = pd.read_csv('./challenge1/employer_with_60percent_of_nodes_remoded.csv', sep='\t', header='infer')
df_e.head()

Unnamed: 0,name,employer
0,U1313,discovery education
1,U1313,ctb mcgraw-hill
2,U1313,university of charleston university of south c...
3,U1313,south carolina department of education
4,U1313,chesapeake va and sumter sc


In [12]:
len(df_e[['name']])

923

In [13]:
# How many users have 1 and more declared employers?
len(df_e[['name']].drop_duplicates())

297

In [15]:
# load the graph
# The graph is an extraction from LinkedIn Social Network
G = nx.read_gexf("./challenge1/mediumLinkedin.gexf")
print("%d nodes have no employer attributes among the %d users in the graph" % (G.number_of_nodes()-len(df_e[['name']].drop_duplicates()), G.number_of_nodes()))

514 nodes have no employer attributes among the 811 users in the graph


In [16]:
# How many employers / user?
df_e.groupby('name').count().describe()

Unnamed: 0,employer
count,297.0
mean,3.107744
std,1.976682
min,1.0
25%,1.0
50%,3.0
75%,4.0
max,13.0


In [17]:
df_e['employer'].value_counts()

university of illinois at urbana-champaign                                       76
microsoft                                                                        15
google                                                                           15
measured progress                                                                 5
university of texas at austin                                                     5
amazon                                                                            5
university of illinois at chicago                                                 5
oracle                                                                            4
ets                                                                               4
state farm insurance                                                              4
microsoft research asia                                                           4
educational testing service                                                 

In [18]:
# Boolean row selection by values in a column
df_e.loc[df_e['employer'].isin(['google','google inc']),:].head()


Unnamed: 0,name,employer
30,U3895,google
126,U15289,google
136,U24080,google
145,U24046,google
161,U7151,google


In [31]:
new = df_e.loc[df_e['employer'].isin(['google','google inc']),:]
print(new)

       name employer
30    U3895   google
126  U15289   google
136  U24080   google
145  U24046   google
161   U7151   google
172  U27661   google
179  U24154   google
229   U8699   google
314   U2761   google
451   U2631   google
720   U8721   google
742  U14131   google
771   U4568   google
831  U27494   google
860  U27758   google


# Exploration of the graph : relational data understanding, focus on homophily

In [19]:
# networkx short summary of information for the graph g
print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 811
Number of edges: 1597
Average degree:   3.9383


### Homophily understanding
Test here the homophily in the graph for each attribute

In [33]:
for n in G.nodes(): # Loop through every node, in our data "n" will be the name of the person
    print(n, G.node[n]) # Access every node by its name, and then by the attribute "birth_year"


U27476 {'label': 'U27476'}
U4665 {'label': 'U4665'}
U1313 {'label': 'U1313'}
U8804 {'label': 'U8804'}
U14078 {'label': 'U14078'}
U9628 {'label': 'U9628'}
U9721 {'label': 'U9721'}
U2649 {'label': 'U2649'}
U27759 {'label': 'U27759'}
U7310 {'label': 'U7310'}
U22859 {'label': 'U22859'}
U1417 {'label': 'U1417'}
U25611 {'label': 'U25611'}
U24095 {'label': 'U24095'}
U3895 {'label': 'U3895'}
U13005 {'label': 'U13005'}
U2620 {'label': 'U2620'}
U5976 {'label': 'U5976'}
U4249 {'label': 'U4249'}
U9140 {'label': 'U9140'}
U16112 {'label': 'U16112'}
U27776 {'label': 'U27776'}
U27708 {'label': 'U27708'}
U15284 {'label': 'U15284'}
U24549 {'label': 'U24549'}
U18514 {'label': 'U18514'}
U15272 {'label': 'U15272'}
U3933 {'label': 'U3933'}
U15946 {'label': 'U15946'}
U25630 {'label': 'U25630'}
U25632 {'label': 'U25632'}
U22031 {'label': 'U22031'}
U11562 {'label': 'U11562'}
U22071 {'label': 'U22071'}
U15292 {'label': 'U15292'}
U7912 {'label': 'U7912'}
U22018 {'label': 'U22018'}
U18925 {'label': 'U18925'}
U713

# Strategy 0 to fill the empty profiles = your baseline: naive method

The assumption is that two connected nodes are likely to share the same attribute values. Here we choose the most frequently used attribute value among the neighbors.

In [35]:
naive_predicted_values={}

In [155]:
def naive_method(graph, empty, df):
    """   Predict the missing attribute with a simple but effective
    relational classifier. 
    
    The assumption is that two connected nodes are 
    likely to share the same attribute value. Here we chose the most frequently
    used attribute by the neighbors
    
    Parameters
    ----------
    graph : graph
       A networkx graph
    empty : list
       The nodes with empty attributes 
    df : pandas dataframe 
       Either location, employer or college dataframe. 

    Returns
    -------
    predicted_values : dict 
       A dict of attributes, either location, employer or college attributes. 
       key is a node (from empty), value is a list of attribute values. Here 
       only 1 value in the list.
     """
    predicted_values={}
    for n in empty:
        nbrs_attr_values=[] 
        for nbr in graph.neighbors(n):
            # attribute values for the node nbr
            attr = df_e.loc[df_e.name == nbr ,'employer'].values
            
            if attr.size > 0: #array not empty
                for val in attr:
                    nbrs_attr_values.append(val)
        predicted_values[n]=[]
        if nbrs_attr_values: # non empty list
            # count the number of occurrence each value and returns a dict
            cpt=Counter(nbrs_attr_values)
            
            # take the most represented attribute value among neighbors
            a,nb_occurrence=max(cpt.items(), key=lambda t: t[1])
            #print(a,nb_occurrence)
            predicted_values[n].append(a)
    return predicted_values


In [156]:
naive_predicted_values = naive_method(G, empty, df_e)

In [157]:
print(naive_predicted_values)

{'U27476': ['google'], 'U4665': [], 'U14078': ['university of illinois at urbana-champaign'], 'U9628': ['state farm'], 'U9721': ['champaign-urbana community fabrication laboratory'], 'U27759': [], 'U24095': ['stanford university'], 'U4249': ['copper creek church'], 'U27776': [], 'U27708': [], 'U15284': ['university of illinois at urbana-champaign'], 'U24549': [], 'U18514': ['ultimate software'], 'U15946': ['boston consulting group'], 'U25630': ['measured progress'], 'U25632': ['measured progress'], 'U22031': [], 'U7912': ['university of illinois at urbana-champaign'], 'U22018': [], 'U18925': ['university of illinois at urbana-champaign'], 'U7136': [], 'U3927': [], 'U1381': ['ets'], 'U22087': [], 'U27804': [], 'U7355': ['siemens healthcare'], 'U14024': ['university of illinois at urbana-champaign'], 'U27679': [], 'U27464': [], 'U24259': [], 'U18564': [], 'U11575': ["noah's ark lab huawei technologies co. ltd."], 'U14507': ['merge llc'], 'U27477': ['baidu inc.'], 'U24143': [], 'U22015': 

# Strategy 1 to fill the empty profiles

In [149]:
# use the information of neighbor's neighbors
def method1(graph, empty, df):
    predicted_values={}
    for n in empty:
        nbrs_attr_values={}
        
        for nbr in graph.neighbors(n):
            # attribute values for the node nbr
            attr = df_e.loc[df_e.name == nbr ,'employer'].values
            if attr.size > 0: #array not empty
                for val in attr:
                    if val not in nbrs_attr_values:
                        nbrs_attr_values[val] = 1
                    else:
                        nbrs_attr_values[val] += 1
            
            for nnbr in graph.neighbors(nbr):
                attr = df_e.loc[df_e.name == nnbr ,'employer'].values
                
                if attr.size > 0: #array not empty
                    for val in attr:
                        if val in nbrs_attr_values:
                            nbrs_attr_values[val] += 0.5
                            #print(nbrs_attr_values)
                                 
        predicted_values[n]=[]
        if nbrs_attr_values: # non empty list
            # count the number of occurrence each value and returns a dict
            cpt=Counter(nbrs_attr_values)
            
            # take the most represented attribute value among neighbors
            a,nb_occurrence=max(cpt.items(), key=lambda t: t[1])
            #print(nb_occurrence)
            predicted_values[n].append(a)
    return predicted_values

In [150]:
predicted_values1 = method1(G, empty, df_e)

# Strategy 2 to fill the empty profiles

In [169]:
def method2(graph, empty, df):
    predicted_values={}
    for n in empty:
        nbrs_attr_values={}
        
        for nbr in graph.neighbors(n):
            # attribute values for the node nbr
            attr = df_e.loc[df_e.name == nbr ,'employer'].values
            if attr.size > 0: #array not empty
                for val in attr:
                    if val not in nbrs_attr_values:
                        nbrs_attr_values[val] = 1
                    else:
                        nbrs_attr_values[val] += 1
            
            for nnbr in graph.neighbors(nbr):
                attr = df_e.loc[df_e.name == nnbr ,'employer'].values
                
                if attr.size > 0: #array not empty
                    for val in attr:
                        if val in nbrs_attr_values:
                            nbrs_attr_values[val] += 0.5
                            #print(nbrs_attr_values)
                        
        
            
                
        predicted_values[n]=[]
        if nbrs_attr_values: # non empty list
            # count the number of occurrence each value and returns a dict
            cpt=Counter(nbrs_attr_values)
            
            # take the most represented attribute value among neighbors
            L = sorted(cpt.items(),key=lambda item:item[1],reverse=True)[:2]
            #a,nb_occurrence=max(cpt.items(), key=lambda t: t[1])
            #print(nb_occurrence)
            a = []
            for l in L:
                a.append(l[0]) 
            predicted_values[n].append(a)
    return predicted_values

In [170]:
predicted_values2 = method2(G, empty, df_e)

# Strategy 3


In [174]:
def method3(graph, empty, df):

    predicted_values={}
    for n in empty:
        nbrs_attr_values=[] 
        for nbr in graph.neighbors(n):
            # attribute values for the node nbr
            attr = df_e.loc[df_e.name == nbr ,'employer'].values
            
            if attr.size > 0: #array not empty
                for val in attr:
                    nbrs_attr_values.append(val)
        predicted_values[n]=[]
        if nbrs_attr_values: # non empty list
            # count the number of occurrence each value and returns a dict
            cpt=Counter(nbrs_attr_values)
                       
            L = sorted(cpt.items(),key=lambda item:item[1],reverse=True)
            
            a = []
            for i,l in enumerate(L):
                print(l)
                while l[1] > 1:
                    a.append(l[0]) 
                    
            #print(a)
            predicted_values[n].append(a)
            
            
            
            
    return predicted_values


In [None]:
predicted_values3 = method3(G, empty, df_e)

('google', 1)
('at&amp;t labs inc.', 1)
('university of michigan', 1)
('uc san diego', 1)
('cisco systems', 1)
('qad inc', 1)
[]
('university of illinois at urbana-champaign', 1)
('software competitiveness international', 1)
('wexgroup', 1)
('intered', 1)
[]
('state farm', 1)
('illinois business consulting', 1)
('argonne national laboratory', 1)
('san diego gas &amp; electric', 1)
('university of illinois at urbana-champaign', 1)
('technische universit&#xe4;t darmstadt', 1)
('kraft foods', 1)
[]
('champaign-urbana community fabrication laboratory', 1)
('manplan inc', 1)
[]
('stanford university', 1)
[]
('copper creek church', 1)
('cv lloyde', 1)
('sony biotechnology inc.', 1)
('merge llc', 1)
('merge partners inc', 1)
('vineyard church', 1)
('state farm insurance', 1)
('hearts at home', 1)
('shapemaster inc.', 1)
('vizion interactive', 1)
('search fanatics', 1)
('click2rank consulting llc', 1)
('seo by jt', 1)
('innovadex', 1)
('church of harvest', 1)
('t-mobile', 1)
('mercy &amp; trut

# Evaluation


In [105]:
df_e_truth = pd.read_csv('./challenge1/groundtruth/employer.csv', sep='\t', header='infer')
df_e_truth.head()

Unnamed: 0,name,employer
0,U21998,illinois college advising corps
1,U21998,victoria amplifiers
2,U21998,university of illinois at chicago
3,U27476,ibm
4,U27476,nyse euronext


In [106]:
df_e_truth.loc[df_e_truth.name == 'U21998',df_e_truth.columns[1]].values

array(['illinois college advising corps', 'victoria amplifiers',
       'university of illinois at chicago'], dtype=object)

In [107]:
def evaluation_accuracy(groundtruth, pred):
    """    Compute the accuracy of your model.

     The accuracy is the proportion of true results.

    Parameters
    ----------
    groundtruth : pandas dataframe 
       Either location, employer or college dataframe. 
    pred : dict 
       A dict of attributes, either location, employer or college attributes. 
       key is a node, value is a list of attribute values. 

    Returns
    -------
    out : float
       Accuracy.
    """
    true_positive_prediction=0
    predicted=0
    for p_key, p_value in pred.items():
        attr = groundtruth.loc[groundtruth.name == p_key ,groundtruth.columns[1]].values
        # if prediction is empty, e.g. [], and so is the groundtruth
        # May happen, we count it as a true prediction
        if not p_value and attr.size == 0:
            true_positive_prediction+=1
            predicted+=1
        # counts the number of good prediction for node p_key
        # here len(p_value)=1 but we could have tried to predict more values
        # if p_value == [], we add 0
        true_positive_prediction += len([c for c in p_value if c in attr])
        predicted += len([c for c in p_value])
    return true_positive_prediction*100/predicted
 

In [63]:
result=evaluation_accuracy(df_e_truth, naive_predicted_values)
print("%f%% of the predictions are true" % result)
print("Very poor result!!! Try to do better!!!!")

27.397260% of the predictions are true
Very poor result!!! Try to do better!!!!


In [151]:
result=evaluation_accuracy(df_e_truth, predicted_values1)
print("%f%% of the predictions are true" % result)
print("Very poor result!!! Try to do better!!!!")

32.534247% of the predictions are true
Very poor result!!! Try to do better!!!!


In [171]:
result=evaluation_accuracy(df_e_truth, predicted_values2)
print("%f%% of the predictions are true" % result)
print("Very poor result!!! Try to do better!!!!")



19.863014% of the predictions are true
Very poor result!!! Try to do better!!!!


In [160]:
result=evaluation_accuracy(df_e_truth, predicted_values3)
print("%f%% of the predictions are true" % result)
print("Very poor result!!! Try to do better!!!!")

27.397260% of the predictions are true
Very poor result!!! Try to do better!!!!
