In [1]:
import random
from collections import Counter

import igraph
import pandas as pd


## Load graph

In [2]:
# Load the social graph from a GraphML file located next to the notebook.
# The format is passed explicitly rather than inferred from the extension.
g = igraph.read('pycon.graphml', format='graphml')

In [3]:
# One-line overview of the loaded graph: vertex/edge counts and attribute names.
g.summary()

'IGRAPH UN-- 494 1664 -- \n+ attr: city (v), education (v), followers (v), friends (v), gender (v), id (v), locale (v), name (v), status (v), work (v), work_employer (v)'

In [4]:
# Ensure the graph is undirected; the call mutates `g` (its return value is unused).
# The summary printed above already reports the graph as undirected ("UN--"),
# so this acts as a safeguard rather than an actual conversion for this file.
g.to_undirected()

In [5]:
# Restrict the analysis to the giant (largest) connected component so that the
# path-based measures used below (closeness, betweenness) run on a connected graph.
# NOTE(review): newer python-igraph releases appear to deprecate `clusters()` in
# favour of `connected_components()` — confirm against the installed version.
g_connected = g.clusters().giant()

In [6]:
# Same overview for the giant component only, to see how much of the graph was kept.
g_connected.summary()

'IGRAPH UN-- 391 1647 -- \n+ attr: city (v), education (v), followers (v), friends (v), gender (v), id (v), locale (v), name (v), status (v), work (v), work_employer (v)'

## "Important" persons

In [7]:
def get_top(g, metric, n=10):
    """Return the ``n`` highest-scoring vertices of ``g`` for one metric.

    Parameters
    ----------
    g : graph-like object
        Must expose a zero-argument method named ``metric`` that returns one
        score per vertex, and ``g.vs['name']`` giving the vertex names in the
        same order.
    metric : str
        Name of the method to call on ``g`` (e.g. ``'degree'``).
    n : int, optional
        Number of entries to keep (default 10).

    Returns
    -------
    list of (score, name) tuples
        Sorted in descending order. Whole tuples are compared, so equal
        scores are ordered by name, also descending.
    """
    scores = getattr(g, metric)()
    ranking = list(zip(scores, g.vs['name']))
    ranking.sort(reverse=True)
    return ranking[:n]

In [8]:
# Ten highest-degree vertices (n defaults to 10), as (score, name) pairs.
get_top(g_connected, 'degree')

[(58, 'Taras  Lyapun'),
 (51, 'Volodymyr Hotsyk'),
 (49, 'Mikhail Kashkin'),
 (47, 'Ksenya Baluk'),
 (40, 'Yehor Nazarkin'),
 (34, 'Miya Zheplinska'),
 (33, 'Paul Colomiets'),
 (33, 'Kyrylo Perevozchikov'),
 (32, 'Marianna Mavdryk'),
 (32, 'Alexander Lyabah')]

In [9]:
# Ten vertices with the highest closeness, as (score, name) pairs.
get_top(g_connected, 'closeness')

[(0.43869516310461193, 'Ksenya Baluk'),
 (0.4157782515991471, 'Volodymyr Hotsyk'),
 (0.4140127388535032, 'Taras  Lyapun'),
 (0.4118268215417107, 'Mikhail Kashkin'),
 (0.3951367781155015, 'Alexander Lyabah'),
 (0.3943377148634985, 'Yehor Nazarkin'),
 (0.39117352056168503, 'Miya Zheplinska'),
 (0.38961038961038963, 'Irina  Sulatskaya'),
 (0.38767395626242546, 'Marianna Mavdryk'),
 (0.3872889771598808, 'Igor Gor Lushchyk')]

In [10]:
# Ten vertices with the highest betweenness, as (score, name) pairs.
get_top(g_connected, 'betweenness')

[(10103.151498583247, 'Ksenya Baluk'),
 (7935.2898040095115, 'Taras  Lyapun'),
 (6217.986765676486, 'Mikhail Kashkin'),
 (5563.020852769037, 'Volodymyr Hotsyk'),
 (3687.125635951954, 'Alexander Lyabah'),
 (3509.7810585103975, 'Miya Zheplinska'),
 (3359.463606093812, 'Ira  Osoba'),
 (3227.7578038468246, 'Orysia Khimiak'),
 (2735.1826063529993, 'Marianna Mavdryk'),
 (2723.280343768928, 'Mariana Kira')]

In [11]:
# One column per centrality measure; each column lists the names of the
# top-10 vertices for that measure, best first (scores are dropped).
important_persons = pd.DataFrame({
    metric: [name for _, name in get_top(g_connected, metric)]
    for metric in (
        'degree',
        'closeness',
        'betweenness',
        'eigenvector_centrality',
        'pagerank',
    )
})

In [12]:
# Display the ranking table: row i holds the i-th ranked name under each measure.
important_persons

Unnamed: 0,betweenness,closeness,degree,eigenvector_centrality,pagerank
0,Ksenya Baluk,Ksenya Baluk,Taras Lyapun,Taras Lyapun,Taras Lyapun
1,Taras Lyapun,Volodymyr Hotsyk,Volodymyr Hotsyk,Volodymyr Hotsyk,Mikhail Kashkin
2,Mikhail Kashkin,Taras Lyapun,Mikhail Kashkin,Mikhail Kashkin,Volodymyr Hotsyk
3,Volodymyr Hotsyk,Mikhail Kashkin,Ksenya Baluk,Yehor Nazarkin,Ksenya Baluk
4,Alexander Lyabah,Alexander Lyabah,Yehor Nazarkin,Paul Colomiets,Yehor Nazarkin
5,Miya Zheplinska,Yehor Nazarkin,Miya Zheplinska,Ksenya Baluk,Miya Zheplinska
6,Ira Osoba,Miya Zheplinska,Paul Colomiets,Kyrylo Perevozchikov,Orysia Khimiak
7,Orysia Khimiak,Irina Sulatskaya,Kyrylo Perevozchikov,Alexander Lyabah,Aleksey Radchenko
8,Marianna Mavdryk,Marianna Mavdryk,Marianna Mavdryk,Igor Davydenko,Alexander Lyabah
9,Mariana Kira,Igor Gor Lushchyk,Alexander Lyabah,Vsevolod Solovyov,Kyrylo Perevozchikov


## Clusters

In [13]:
# Multilevel (Louvain-style) community detection on the giant component.
# The algorithm may visit vertices in random order, so the partition (and the
# sizes/modularity shown below) can change between runs. python-igraph draws
# its random numbers from Python's `random` module by default, so seeding it
# here keeps the result reproducible under Restart & Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
multilevel_communities = g_connected.community_multilevel()

In [14]:
# Number of vertices in each multilevel community.
multilevel_communities.sizes()

[28, 3, 6, 99, 76, 28, 60, 8, 22, 45, 13, 3]

In [15]:
# Modularity score of the multilevel partition.
multilevel_communities.modularity

0.524317363836808

In [16]:
# Walktrap community detection using random walks of 10 steps.
# The result is not cut into clusters yet; `as_clustering` is called on it in the next cell.
walktrap = g_connected.community_walktrap(steps=10)

In [17]:
# Cut the walktrap result into exactly 10 communities.
# NOTE(review): n=10 is hard-coded; the sizes below show one dominant cluster, so confirm the choice.
walktrap_communities = walktrap.as_clustering(n=10)

In [18]:
# Number of vertices in each of the 10 walktrap communities.
walktrap_communities.sizes()

[327, 2, 6, 3, 2, 6, 33, 8, 2, 2]

In [19]:
# Store each vertex's community label from both algorithms as vertex
# attributes, so they are included when the graph is written out below.
g_connected.vs['multilevel_communities'] = multilevel_communities.membership
g_connected.vs['walktrap_communities'] = walktrap_communities.membership

In [20]:
# Persist the annotated giant component; the format is stated explicitly,
# mirroring how the input graph was read.
g_connected.write('pycon_final.graphml', format='graphml')

## Investigate community

In [44]:
# Label of the community to investigate. It is compared against the walktrap
# membership below (not the multilevel one), i.e. it is an index into
# `walktrap_communities`.
COMMUNITY_NUMBER = 6

In [45]:
# Page likes per user; the `user` and `page` columns are used below.
user_likes = pd.read_csv('user_likes.csv', encoding='utf-8')

In [46]:
# Page metadata, used at the end to look up the selected page ids.
# NOTE(review): the table displayed at the end of the notebook contains an
# `Unnamed: 0` column, which suggests the CSV was saved with its index;
# consider `index_col=0` — confirm against the file.
pages = pd.read_csv('pages.csv', encoding='utf-8')

In [47]:
# User x page count matrix: one row per user, one column per page, each cell
# holding how many times that (user, page) pair occurs in `user_likes`.
user_page_likes = pd.crosstab(index=user_likes['user'], columns=user_likes['page'])

In [48]:
# Map every vertex of the giant component (by its `id` attribute) to its
# walktrap community label; both sequences are in vertex order.
clusters = pd.DataFrame({'id': g_connected.vs['id'], 'cluster': walktrap_communities.membership})

In [49]:
# Cast the vertex ids to int64 and make them the index, so rows can be aligned
# by id in the concat below.
# (presumably this matches the dtype of `user_likes['user']` — verify)
# NOTE: this cell consumes the `id` column, so it cannot be re-run without
# first re-running the cell that builds `clusters`.
clusters = clusters.astype({'id': 'int64'}).set_index('id')

In [50]:
# Put the page-like matrix and the cluster label side by side, aligned on the
# index (user id). No `join` is given, so pandas' default outer join applies:
# users missing from one side are kept with NaN in the other side's columns.
# `cluster` ends up as the last column, which the feature slicing below relies on.
users_info = pd.concat([user_page_likes, clusters], axis='columns')

In [51]:
from sklearn.feature_selection import SelectKBest, chi2

In [52]:
# Features: every column except the trailing `cluster` one; a missing like
# (user absent from the likes table) counts as 0.
X_train = users_info.iloc[:, :-1].fillna(0)
# Binary target: keep the label for members of the community of interest and
# collapse everything else (including users with no cluster, i.e. NaN) to -1.
y_train = users_info['cluster'].where(users_info['cluster'] == COMMUNITY_NUMBER, -1)

In [53]:
# Class balance of the binary target (community of interest vs. everyone else).
Counter(y_train)

Counter({-1.0: 443, 6.0: 33})

In [54]:
# Score every page column against the binary target with the chi-squared
# statistic and keep the 10 highest-scoring pages.
ch2 = SelectKBest(score_func=chi2, k=10)
ch2.fit(X_train, y_train)

SelectKBest(k=10, score_func=<function chi2 at 0x111a02400>)

In [55]:
# Look up the metadata of the selected pages: `get_support()` is a boolean mask
# over the feature columns, whose labels are page ids.
pages.loc[pages['id'].isin(X_train.columns[ch2.get_support()])]

Unnamed: 0.1,Unnamed: 0,category,id,name
2510,2510,education,1302106199823638,Бібліотека Українського католицького університету
15865,15865,education,205189619549019,UCU English Club
26684,26684,college & university,420728451292296,Український католицький університет UCuniversity
34976,34976,education,1438295956464171,Програма з комп'ютерних наук УКУ/ CS UCU
38954,38954,college & university,315310201979689,UCU International Office
41238,41238,education,1221107544577367,Мапа можливостей УКУ
41820,41820,local business,1813014238943812,Faculty of Applied Sciences of UCU
43044,43044,organization,226981620835395,Уряд Студентів УКУ/ UCU Student Government
58658,58658,computer company,713464742143141,Ukrainian Catholic Programmer ___ Український ...
61939,61939,education,102537843572610,Київський центр УКУ
