In [None]:
from wiki import WikiCorpus, PICKLE_FILE, CORPUS_DIR
from tqdm import tqdm

# Load the corpus from the pickle at PICKLE_FILE when possible; otherwise
# rebuild it from the raw corpus files, generate the 'all_users' network and
# write the pickle so that the next run can take the fast path.
try:
    corpus = WikiCorpus.from_pickle(PICKLE_FILE)
except Exception:
    # `except Exception` (rather than a bare `except:`) keeps the
    # "any load failure -> rebuild" fallback, but lets KeyboardInterrupt and
    # SystemExit propagate, so interrupting the load does not silently kick
    # off a full rebuild.
    corpus = WikiCorpus.from_corpus_files()
    corpus.generate_network('all_users', normalize_edge_weights=False)
    corpus.to_pickle(PICKLE_FILE)

100%|██████████| 26397/26397 [00:00<00:00, 301294.64it/s]

Opening pickle...
Opening corpus files...
Loading users...
Loading posts...





# Social Network Features

### Eigenvector centrality

In [None]:
import networkx as nx
import numpy

# Eigenvector centrality of every user in the 'all_users' network.
user_network = corpus.networks['all_users']
eigen_central = nx.eigenvector_centrality_numpy(user_network)
corpus.register_user_data('eigen_central', eigen_central)

# Binary version: True for users whose centrality is more than one standard
# deviation above the mean.
centrality_scores = list(eigen_central.values())
mean = numpy.mean(centrality_scores)
stddev = numpy.std(centrality_scores)
eigen_central_bin = {}
for user, score in eigen_central.items():
    eigen_central_bin[user] = score > mean + stddev
corpus.register_user_data('eigen_central_bin', eigen_central_bin)

### Community clustering (Louvain)

In [None]:
import community # https://github.com/taynaud/python-louvain

# Assign every user in the 'all_users' network to a community and store the
# community id as user data.
all_users_network = corpus.networks['all_users']
partition = community.best_partition(all_users_network)
corpus.register_user_data('community', partition)

We can visualize the Wikipedia network by treating each community cluster as its own node.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from math import log

# Collapse the user network into a graph with one node per community.
#   node weight = number of users assigned to the community
#   edge weight = summed weight of all user-level edges between two communities
#                 (edges inside one community become a self-loop)
user_network = corpus.networks['all_users']

# Count members per community first and create the nodes with their final
# weight. This avoids `Graph.node[...]`, which was removed in newer networkx
# releases, and works on old and new versions alike.
cluster_sizes = {cluster_id: 0 for cluster_id in set(partition.values())}
for user in user_network.nodes():
    cluster_sizes[partition[user]] += 1

clusters = nx.Graph()
for cluster_id, size in cluster_sizes.items():
    clusters.add_node(cluster_id, weight=size)

for (u, v), weight in nx.get_edge_attributes(user_network, 'weight').items():
    u, v = partition[u], partition[v]
    if clusters.has_edge(u, v):
        # `clusters[u][v]` is the edge-data dict in every networkx version
        # (`Graph.edge[u][v]` no longer exists in networkx 2.x).
        clusters[u][v]['weight'] += weight
    else:
        clusters.add_edge(u, v, weight=weight)

nodes, node_weights = zip(*nx.get_node_attributes(clusters, 'weight').items())
edges, edge_weights = zip(*nx.get_edge_attributes(clusters, 'weight').items())

# Scale so the largest node is drawn with size 300 and the heaviest edge with
# width 10. The maxima are computed once instead of once per element.
largest_node_weight = max(node_weights)
largest_edge_weight = max(edge_weights)
node_weights = [w / largest_node_weight * 300 for w in node_weights]
edge_weights = [w / largest_edge_weight * 10 for w in edge_weights]

pos = nx.random_layout(clusters)
nx.draw_networkx_nodes(clusters, pos, nodelist=list(nodes), node_size=node_weights)
nx.draw_networkx_labels(clusters, pos)
nx.draw_networkx_edges(clusters, pos, edgelist=list(edges), width=edge_weights)

plt.draw()

#  Coordination

For a user $b$ and a group of users $A$, let $S^A_b$ be the set of pairs of utterances $(u_a, u_b)$ where $u_b$ is uttered by $b$ in reply to the parent utterance $u_a$, uttered by some $a \in A$.

$\mathcal{E}_m(u)$ means that utterance $u$ exhibits some linguistic marker, $m$.

Following *Echoes of Power* we define the coordination of user $b$ towards a group $A$ (the *coordination given* by $b$) as follows:
$$
C^g_m(A,b) = P\big[\mathcal{E}_m(u_b) \mid \mathcal{E}_m(u_a) \land (u_a, u_b) \in S^A_b\big] -
P\big[\mathcal{E}_m(u_b) \mid (u_a, u_b) \in S^A_b\big]
$$

The probabilities are estimated by counting occurrences of $m$ in $S^A_b$ (here $[\cdot]$ is $1$ if the condition holds and $0$ otherwise):

$$
C^g_m(A,b) \approx
\frac{\sum_{(u_a,u_b)\in S^A_b}[\mathcal{E}_m(u_a) \land \mathcal{E}_m(u_b)]}{\sum_{(u_a,u_b)\in S^A_b}[\mathcal{E}_m(u_a)]} -
\frac{\sum_{(u_a,u_b)\in S^A_b}[\mathcal{E}_m(u_b)]}{|S^A_b|}
$$

$C^g_m(A,b)$ is defined for those $m$, $b$ and $A$ for which $\{(u_a, u_b) \in S^A_b \mid \mathcal{E}_m(u_a)\} \neq \varnothing$, i.e. $b$ has replied to at least one utterance from $A$ that exhibits $m$.

Likewise, we estimate the coordination of a group $A$ towards a user $b$ (the *coordination received* by $b$) as:

$$
C^r_m(A,b) \approx
\frac{\sum_{(u_b,u_a)\in S^b_A}[\mathcal{E}_m(u_b) \land \mathcal{E}_m(u_a)]}{\sum_{(u_b,u_a)\in S^b_A}[\mathcal{E}_m(u_b)]} -
\frac{\sum_{(u_b,u_a)\in S^b_A}[\mathcal{E}_m(u_a)]}{|S^b_A|}
$$

where $S^b_A$ is the set of pairs of utterances where a member of group $A$ is replying to an utterance of user $b$ (note that this is an entirely distinct set from $S^A_b$).

As before, $C^r_m(A,b)$ is defined if $\{(u_b, u_a) \in S^b_A \mid \mathcal{E}_m(u_b)\} \neq \varnothing $

In both cases, to aggregate over markers, we take the average of the marker-specific coordination measures for which $C^*_m(A,b)$ is defined.

First, we calculate each user's coordination (given and received) with respect to the general population:

In [None]:
# Coordination of every user with respect to the whole population. Only the
# 'agg3' entry of each result is stored as user data.
coord_given, coord_received = corpus.get_coordination()
population_coordination = (
    ('coord_given_all', coord_given),
    ('coord_received_all', coord_received),
)
for field_name, coordination in population_coordination:
    corpus.register_user_data(field_name, coordination['agg3'])
# The per-marker coordination measures could easily be registered here as well.

Next, we calculate users' coordination with respect to their Louvain sub-group:

In [None]:
# Group user ids by community in a single pass. Users whose 'community' value
# is None are skipped. Testing `is not None` (not truthiness) matters because
# community id 0 is a valid label.
community_members = {}
for user in corpus.users.values():
    user_community = user.data['community']
    if user_community is not None:
        community_members.setdefault(user_community, []).append(user.id)

# Number of communities actually present. The previous code used the largest
# id as the count, so `range(n_communities)` never reached the
# highest-numbered community.
n_communities = len(community_members)

ingroup_coord_given, ingroup_coord_received = {}, {}
for community_id in sorted(community_members):
    ingroup = community_members[community_id]
    print('Calculating coordination for community {} ({} people)...'.format(community_id, len(ingroup)))
    coord_given, coord_received = corpus.get_coordination(ingroup, ingroup)
    # Merge per key instead of calling `update` on the outer dict: a flat
    # update replaces the whole 'agg3' entry on every iteration, leaving only
    # the last community's users.
    # NOTE(review): assumes each value of the result is a {user: score}
    # mapping, as implied by `['agg3']` being passed to register_user_data.
    for merged, result in ((ingroup_coord_given, coord_given),
                           (ingroup_coord_received, coord_received)):
        for key, per_user in result.items():
            merged.setdefault(key, {}).update(per_user)
    # could also calculate for out-group coordination
corpus.register_user_data('coord_given_ingroup', ingroup_coord_given['agg3'])
corpus.register_user_data('coord_received_ingroup', ingroup_coord_received['agg3'])

# Linguistic Style Features

In [None]:
from corpus import markers
from collections import Counter, defaultdict
import re

# Per-user counters. The markup counters hold the number of *posts* containing
# the markup at least once; the token counters hold token totals.
post_count = Counter()
italics_count = Counter()
bold_count = Counter()
link_count = Counter()
function_words_count = Counter()
total_tokens = Counter()

# Wiki markup: ''italic'', '''bold''', '''''bold italic''''', [[link]].
# A run of *exactly* N apostrophes is matched via lookarounds so that bold
# markup is not also counted as italics (and vice versa).
_QUOTE_RUN = r"(?<!')'{%d}(?!')"
bold_italic_re = re.compile(_QUOTE_RUN % 5 + r".+?" + _QUOTE_RUN % 5)
bold_re = re.compile(_QUOTE_RUN % 3 + r".+?" + _QUOTE_RUN % 3)
italics_re = re.compile(_QUOTE_RUN % 2 + r".+?" + _QUOTE_RUN % 2)
# The brackets must be escaped: unescaped, "[[.+]]" is a character class
# followed by a literal "]", not a [[...]] matcher.
link_re = re.compile(r"\[\[.+?\]\]")

for post in corpus.posts.values():
    user = post.author_id
    text = post.clean_text
    post_count[user] += 1

    # Bold-italic markup counts towards both bold and italics, once per post.
    has_bold_italic = bold_italic_re.search(text) is not None
    if has_bold_italic or bold_re.search(text):
        bold_count[user] += 1
    if has_bold_italic or italics_re.search(text):
        italics_count[user] += 1
    if link_re.search(text):
        link_count[user] += 1

    # A token counts as a function word if it appears in any marker category.
    for t in post.get_tokens():
        token = t.lower()
        if any(token in markers[m] for m in markers):
            function_words_count[user] += 1
    # NOTE(review): reads `post.tokens` while the loop above uses
    # `post.get_tokens()` - presumably the same token list; confirm.
    total_tokens[user] += len(post.tokens)

def per_post(item_count, post_totals=None):
    """Convert per-user totals into per-post averages.

    Args:
        item_count: mapping of user -> total number of items for that user.
        post_totals: mapping of user -> number of posts. Defaults to the
            notebook-level `post_count` counter.

    Returns:
        A dict of user -> item_count / posts. Every user in `post_totals` gets
        an entry, so a user with posts but no items gets 0.0 rather than being
        left out. Users with no posts map to None.
    """
    if post_totals is None:
        post_totals = post_count
    # Users with posts come first; users known only to `item_count` are kept
    # as well so that no previously returned key disappears.
    users = list(post_totals)
    users.extend(user for user in item_count if user not in post_totals)
    averages = {}
    for user in users:
        n_posts = post_totals.get(user, 0)
        averages[user] = item_count.get(user, 0) / n_posts if n_posts else None
    return averages

# Store the raw post count, then every style feature as a per-post average.
corpus.register_user_data('post_count', post_count)
style_features = [
    ('italics_freq', italics_count),
    ('bold_freq', bold_count),
    ('link_freq', link_count),
    ('function_words_freq', function_words_count),
    ('avg_length_tokens', total_tokens),
]
for feature_name, feature_counts in style_features:
    corpus.register_user_data(feature_name, per_post(feature_counts))


In [None]:
# Persist the corpus with its newly registered user data, then export the
# user data as CSV into the corpus directory (empty blacklist).
user_data_csv = CORPUS_DIR + 'user_data.csv'
corpus.to_pickle(PICKLE_FILE)
corpus.export_user_data(user_data_csv, blacklist=[])