# Social Graph

[New York Social Diary](http://www.newyorksocialdiary.com/) provides a
fascinating lens onto New York's socially well-to-do.  The data forms a natural
social graph for New York's social elite.  

In [1]:
import csv
import re
import networkx as nx
import pandas as pd

In [2]:
def create_edge_tuple(List):
    a = []
    for x in List:
        for y in List:
            if x !=y:
                a.append(tuple(list(set([x,y]))))
        List.remove(x)
    return [item for item in list(set(a))]

In [3]:
input_file = csv.DictReader(open('captiondata1.csv'))

caption_list = []
name_sublist = []
name_list = []
titles = ['Mr.', 'Mrs.', 'Ms.', 'Dr.', 'Mayor', 'CEO', 'M.D.', 'AMC', 'AOHT','ANDRUS', 'AOHT', 'ASF', 'ASPCA',\
         'ACO', 'ACC', 'ABT', 'ACandC', 'AFIPO', 'ALS', 'ALSGNY', 'AAADTs', 'AIA', 'AIS',\
         'Actress', 'Actresses', 'Actor', 'Actors', 'Author', 'Authors', 'Bad', 'C.B.E.', 'COO'\
         'Board Member','Photographs','Benefit Chairman', 'Benefit Chairmen', 'Benefit Chairs','CCBF Chairman',\
         'CCBF Doctors', 'CNBC', 'CUNY Chancellor', 'CSHL President', 'CSUN President', 'President', \
         'Vice President',  'Cardiologist', 'Miss New York', 'New York', 'COO'\
          'Board Member','Photographs','Benefit Chairman', 'Benefit Chairmen', 'Benefit Chairs','CCBF Chairman',\
         'CCBF Doctors', 'CNBC', 'CUNY Chancellor', 'CSHL President', 'CSUN President', 'President', \
         'Vice President',  'Cardiologist', 'Miss New York', 'New York', 'COO'
         ]

for row in input_file:    
    caption = row["caption"].split('%')
    for caption_item in caption:
        if len(caption_item)<250:
            caption_item = caption_item.decode('utf-8').strip().replace('\n',' ').replace('\t',' ')
            for word in titles:
                if word in caption_item:
                    caption_item = re.sub(word,'',caption_item)
            caption_item = re.sub('[^A-Za-z\,\& \.]+', ' ', caption_item)      # remove all the special characters
            split_list = re.split(',|and |with |& ',caption_item)                    
            name_sublist = filter(None, split_list)
            name_sublist = [item.strip() for item in name_sublist]
            # remove whitespaces strings
            name_sublist_filter = filter(lambda name: name.strip() and len(name.split(' '))<=4 and name[0].isupper(),name_sublist) 
            if name_sublist_filter:
                # deal with husband and wife case
                new_list = []
                c = []
                for item in name_sublist_filter:
                    if len(item.split(' ')) ==1:
                        new_list.append(item)
                        #print new_list
                        continue
                    else:
                        last_name = item.split(' ')[-1]
                        b = [first_name+" "+last_name for first_name in new_list]
                        new_list = []
                        c.extend(b)
                        c.append(item)
                name_list.append(c)
                #print name_sublist_filter
        caption_list.append(caption_item)

In [4]:
#Draw a Graph
G = nx.Graph()
node_list = [item for x in name_list for item in x]
new_node_list = list(set(node_list))
G.add_nodes_from(new_node_list)
list_tuple = []
for item in name_list:
    a = create_edge_tuple(item)
    for x in a:
        list_tuple.append(x)
#print list_tuple
for node_pair in list_tuple:
    if G.has_edge(node_pair[0],node_pair[1]):
        G[node_pair[0]][node_pair[1]]['weight']+=1            
    else:
        G.add_edge(node_pair[0],node_pair[1],weight = 1)

## 1. degree
The simplest question to ask is "who is the most popular"?  The easiest way to
answer this question is to look at how many connections everyone has.  Return
the top 100 people and their degree.  Remember that if an edge of the graph has
weight 2, it counts for 2 in the degree.

In [5]:
# Question 1
degree_dict = G.degree()
table_degree = pd.Series(degree_dict)
sorted_table_degree = table_degree.order(ascending = False)
sorted_list_degree = []
for i in range(0,100):
    Index = sorted_table_degree.index[i]
    sorted_list_degree.append((str(sorted_table_degree.index[i]),sorted_table_degree[Index])) 

## 2. pagerank
A similar way to determine popularity is to look at their
[pagerank](http://en.wikipedia.org/wiki/PageRank).  Pagerank is used for web
ranking and was originally
[patented](http://patft.uspto.gov/netacgi/nph-Parser?patentnumber=6285999) by
Google and is essentially the stationary distribution of a [markov
chain](http://en.wikipedia.org/wiki/Markov_chain) implied by the social graph.

Use 0.85 as the damping parameter so that there is a 15% chance of jumping to
another vertex at random.

In [6]:
#Question 2
pagerank_dict = nx.pagerank(G,alpha=0.85, personalization=None, max_iter=100, tol=1e-06, nstart=None, weight='weight', dangling=None)
table_pagerank = pd.Series(pagerank_dict)
sorted_table_pagerank = table_pagerank.order(ascending = False)
sorted_list_pagerank = []
for i in range(0,100):
    Index = sorted_table_pagerank.index[i]
    sorted_list_pagerank.append((str(sorted_table_pagerank.index[i]),sorted_table_pagerank[Index]))
#print sorted_list_pagerank

## 3. best_friends
Another interesting question is who tend to co-occur with each other.  Give
us the 100 edges with the highest weights.

In [8]:
#Question 3
weights = G.edges(data = True)
L = []
for (n1,n2,w) in weights:
    t = (n1,n2,w['weight'])
    L.append(t)
df = pd.DataFrame(L, columns=['node1', 'node2', 'weight'])
sorted_df = df.sort(['weight'],ascending = False)

#print sorted_df[0:100]

best_friends = []
for name1,name2,weight in sorted_df[0:100].values:
    best_friends.append(((str(name1),str(name2)),weight))
print best_friends


[(('Gillian Miniter', 'Sylvester Miniter'), 75), (('Jamee Gregory', 'Peter Gregory'), 54), (('Bonnie Comley', 'Stewart Lane'), 51), (('Andrew Saffir', 'Daniel Benedict'), 51), (('Roric Tobin', 'Geoffrey Bradfield'), 46), (('Somers Farkas', 'Jonathan Farkas'), 40), (('Jay Diamond', 'Alexandra Lebenthal'), 37), (('Donald Tober', 'Barbara Tober'), 36), (('Martin Shafiroff', 'Jean Shafiroff'), 35), (('Chappy Morris', 'Melissa Morris'), 32), (('Campion Platt', 'Tatiana Platt'), 30), (('Chris Meigher', 'Grace Meigher'), 30), (('Lizzie Tisch', 'Jonathan Tisch'), 28), (('Peter Regna', 'Barbara Regna'), 27), (('Sessa von Richthofen', 'Richard Johnson'), 27), (('John Catsimatidis', 'Margo Catsimatidis'), 27), (('Wilbur Ross', 'Hilary Geary Ross'), 26), (('Arie Kopelman', 'Coco Kopelman'), 26), (('Deborah Norville', 'Karl Wellner'), 26), (('Elizabeth Stribling', 'Guy Robinson'), 24), (('Yaz Hernandez', 'Valentin Hernandez'), 24), (('Julia Koch', 'David Koch'), 24), (('Olivia Palermo', 'Johannes H