In [None]:
import random

import networkx as nx
import pandas as pd
from scipy.stats import pearsonr

# Read the raw election log from the spreadsheet. header=None keeps the first
# spreadsheet row as data instead of treating it as column names.
RAW_ELECTION_PATH = 'wikiElec.ElecBs3.xls'
election_data = pd.read_excel(RAW_ELECTION_PATH, header=None)

In [None]:
# Preview the first rows exactly as loaded, before the columns are named
print(election_data.head())

# 'Type' is the record tag the parsing step branches on; the other four columns
# hold that record's fields. Assigning .columns renames election_data in place.
column_names = ['Type', 'Value1', 'Value2', 'Value3', 'Value4']
election_data.columns = column_names

# Same preview again, now with the named columns
print(election_data.head())

In [None]:
import pandas as pd

# Parse the flat, one-record-per-row log into one dictionary per election.
# Record types handled (column 'Type'):
#   'E'  starts a new election block; Value1 is stored as 'ElectionSuccess'
#   'T'  closing time (Value1)
#   'U'  candidate id (Value1) and name (Value2)
#   'N'  nominator id (Value1) and name (Value2)
#   'V'  one vote: vote type (Value1), voter id (Value2), vote time (Value3),
#        voter name (Value4)
# Rows with any other type are ignored.

# Completed election blocks, in file order
clean_data_list = []

# Election block currently being filled in
current_election = {}


def _store_election(election):
    """Append a finished election block to clean_data_list; empty blocks are skipped."""
    if election:
        # An election without any 'V' rows would otherwise get NaN in the 'Votes'
        # column, and every later `for vote in election['Votes']` loop would fail
        # with "'float' object is not iterable".
        election.setdefault('Votes', [])
        clean_data_list.append(election)


# Iterating over each row in the dataset
for _, row in election_data.iterrows():
    record_type = row['Type']
    if record_type == 'E':
        # Start of a new election block: store the previous one first
        _store_election(current_election)
        current_election = {'ElectionSuccess': row['Value1']}
    elif record_type == 'T':
        current_election['ClosingTime'] = pd.to_datetime(row['Value1'])
    elif record_type == 'U':
        current_election['CandidateID'] = row['Value1']
        current_election['CandidateName'] = row['Value2']
    elif record_type == 'N':
        current_election['NominatorID'] = row['Value1']
        current_election['NominatorName'] = row['Value2']
    elif record_type == 'V':
        current_election.setdefault('Votes', []).append({
            'VoteType': int(row['Value1']),
            'VoterID': row['Value2'],
            'VoteTime': pd.to_datetime(row['Value3']),
            'VoterName': row['Value4']
        })

# The last block has no following 'E' row to trigger its storage
_store_election(current_election)

# One row per election; 'Votes' always holds a (possibly empty) list of vote dicts
clean_data = pd.DataFrame(clean_data_list)

print(clean_data.head())

In [None]:
# Build an election key of the form "<CandidateID>_<closing time as YYYYmmddHHMMSS>"
candidate_part = clean_data['CandidateID'].astype(str)
closing_part = clean_data['ClosingTime'].dt.strftime('%Y%m%d%H%M%S')
clean_data['ElectionID'] = candidate_part + '_' + closing_part

# Move the new column into the index (this modifies clean_data in place)
clean_data.set_index('ElectionID', inplace=True)

print(clean_data.head())


In [None]:
# Directed graph whose nodes are scoped to a single election: both voter and
# candidate node names carry the election's closing-time stamp as a suffix.
# NOTE(review): a later cell rebinds G to a graph keyed by plain IDs, so this
# graph appears to be unused afterwards - confirm whether it is still needed.
G = nx.DiGraph()

for _, election in clean_data.iterrows():
    # Every node belonging to this election shares the same suffix
    closing_stamp = election['ClosingTime'].strftime('%Y%m%d%H%M%S')
    candidate_node = f"{election['CandidateID']}_{closing_stamp}"

    # One edge voter -> candidate per vote of type 1 (assumed to mean "support")
    G.add_edges_from(
        (f"{vote['VoterID']}_{closing_stamp}", candidate_node)
        for vote in election['Votes']
        if vote['VoteType'] == 1
    )

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Flatten the nested 'Votes' lists of clean_data: one record per supportive
# vote (VoteType == 1), pairing the voter with the candidate voted for.
rows = [
    {'VoterID': vote['VoterID'], 'CandidateID': election['CandidateID']}
    for _, election in clean_data.iterrows()
    for vote in election['Votes']
    if vote['VoteType'] == 1
]

# Tabular form of the supportive votes
votes_df = pd.DataFrame(rows)

# Directed graph with one edge voter -> candidate per supportive vote;
# node names are the IDs rendered as strings.
G = nx.DiGraph()
G.add_edges_from(
    (str(vote_row['VoterID']), str(vote_row['CandidateID']))
    for _, vote_row in votes_df.iterrows()
)

print("Nodes:", G.number_of_nodes())
print("Edges:", G.number_of_edges())

# Draw the whole graph with a force-directed layout
draw_options = dict(
    with_labels=True,
    node_size=500,
    node_color="skyblue",
    font_size=8,
    font_weight="bold",
    edge_color="gray",
)
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G)
nx.draw(G, pos, **draw_options)
plt.title("Wikipedia RfA Voting Network")
plt.show()


In [None]:
# Plot a reproducible random sample of nodes so the figure stays readable.
SAMPLE_SIZE = 100   # upper bound on the number of nodes drawn
SAMPLE_SEED = 42    # fixes both the node sample and the layout

# random.sample() needs a sequence. G.nodes() is a set-like view, which
# Python >= 3.11 rejects with a TypeError, so materialise it as a list first.
all_nodes = list(G.nodes())

# Asking for more nodes than the graph has would raise ValueError
sample_size = min(SAMPLE_SIZE, len(all_nodes))

# A dedicated Random instance keeps the draw repeatable without touching the
# global random state used elsewhere.
random_nodes = random.Random(SAMPLE_SEED).sample(all_nodes, sample_size)

# Subgraph induced by the sampled nodes (only edges between sampled nodes remain)
subgraph = G.subgraph(random_nodes)

# Visualizing the subgraph
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(subgraph, seed=SAMPLE_SEED)
nx.draw(subgraph, pos, with_labels=True, node_size=500, node_color="skyblue", font_size=8, font_weight="bold", edge_color="gray")
# Report the number of nodes actually drawn rather than a hard-coded 100
plt.title(f"Subset of {sample_size} Nodes from the Wikipedia RfA Voting Network")
plt.show()

In [None]:
# Centrality measures for every node of the voting graph G.
# Each result is a dict mapping node -> score.
degree_centrality = nx.degree_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

# Betweenness is estimated from a random sample of source nodes instead of all
# of them. The sample size is capped at the number of nodes, because asking for
# more raises ValueError. The seed is fixed so the estimate, and every statistic
# derived from it, is the same on each run.
BETWEENNESS_SAMPLE_SIZE = 1000
BETWEENNESS_SEED = 42
betweenness_centrality = nx.betweenness_centrality(
    G,
    k=min(BETWEENNESS_SAMPLE_SIZE, G.number_of_nodes()),
    seed=BETWEENNESS_SEED,
)


In [None]:
# DataFrame for centrality measures: one row per node of G
graph_nodes = list(G.nodes())
centrality_df = pd.DataFrame({
    'Node': graph_nodes,
    'DegreeCentrality': [degree_centrality[node] for node in graph_nodes],
    'ClosenessCentrality': [closeness_centrality[node] for node in graph_nodes],
    'BetweennessCentrality': [betweenness_centrality[node] for node in graph_nodes]
})

# Lookup table: candidate ID (as a string, matching the node names) -> outcome
# of the FIRST election listed for that candidate in clean_data. Built once,
# instead of rescanning clean_data for every node.
success_by_candidate = {}
for candidate, outcome in zip(clean_data['CandidateID'].astype(str), clean_data['ElectionSuccess']):
    success_by_candidate.setdefault(candidate, outcome)  # keep the first occurrence


def map_election_success(node):
    """Return the election outcome recorded for candidate `node`.

    The lookup is by candidate ID, not by ElectionID: a candidate who stood in
    several elections is represented by the first one. Returns None when the
    node never appears as a candidate.
    """
    return success_by_candidate.get(node)


# pd.to_numeric turns the None placeholders into NaN and gives a numeric column;
# it raises if an outcome is not a number rather than hiding it.
centrality_df['ElectionSuccess'] = pd.to_numeric(centrality_df['Node'].apply(map_election_success))

# Filtering out nodes with no ElectionSuccess information (i.e. non-candidates)
centrality_df = centrality_df.dropna(subset=['ElectionSuccess'])

# Correlation analysis: Pearson r between each centrality and the outcome
degree_corr, _ = pearsonr(centrality_df['DegreeCentrality'], centrality_df['ElectionSuccess'])
closeness_corr, _ = pearsonr(centrality_df['ClosenessCentrality'], centrality_df['ElectionSuccess'])
betweenness_corr, _ = pearsonr(centrality_df['BetweennessCentrality'], centrality_df['ElectionSuccess'])

print(f"Correlation between Degree Centrality and Election Success: {degree_corr:.3f}")
print(f"Correlation between Closeness Centrality and Election Success: {closeness_corr:.3f}")
print(f"Correlation between Betweenness Centrality and Election Success: {betweenness_corr:.3f}")

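4. Question: do successful and unsuccessful elections differ in the average centrality and activity level of their supporting voters?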

In [None]:
from scipy.stats import ttest_ind

# Compare, election by election, how central and how active the supporters of
# successful and unsuccessful candidates are, using Welch's t-test.


def _mean_or_zero(values):
    """Return the arithmetic mean of `values`, or 0 when the list is empty."""
    return sum(values) / len(values) if values else 0


# Activity level of each voter = out-degree of its node in G
voter_activity = dict(G.out_degree())

# Per-election averages, keyed by the election's row label in clean_data.
# Keying by candidate ID would let a candidate's later election silently
# overwrite an earlier one, so repeat candidacies would drop out of the test.
election_centralities = {}

for election_label, election in clean_data.iterrows():
    candidate_id = str(election['CandidateID'])

    # An election without recorded votes may hold NaN here instead of a list
    votes = election['Votes'] if isinstance(election['Votes'], list) else []

    # Node names of the voters who supported this candidate
    # (vote type 1 is assumed to mean "support")
    supporters = [str(vote['VoterID']) for vote in votes if vote['VoteType'] == 1]

    # Averages over the supporters; voters missing from a centrality dict count
    # as 0, and an election with no supporters gets 0 for every average.
    election_centralities[election_label] = {
        'avg_degree': _mean_or_zero([degree_centrality.get(voter, 0) for voter in supporters]),
        'avg_closeness': _mean_or_zero([closeness_centrality.get(voter, 0) for voter in supporters]),
        'avg_betweenness': _mean_or_zero([betweenness_centrality.get(voter, 0) for voter in supporters]),
        'avg_activity': _mean_or_zero([voter_activity.get(voter, 0) for voter in supporters]),
        'success': election['ElectionSuccess'],
        'candidate_id': candidate_id,
    }

# Converting the dictionary to a DataFrame for easier analysis (one row per election)
elections_df = pd.DataFrame.from_dict(election_centralities, orient='index')

# Splitting data into successful and unsuccessful elections
successful = elections_df[elections_df['success'] == 1]
unsuccessful = elections_df[elections_df['success'] == 0]

# Welch's t-test (equal_var=False) for each centrality measure and for activity
results_degree = ttest_ind(successful['avg_degree'], unsuccessful['avg_degree'], equal_var=False)
results_closeness = ttest_ind(successful['avg_closeness'], unsuccessful['avg_closeness'], equal_var=False)
results_betweenness = ttest_ind(successful['avg_betweenness'], unsuccessful['avg_betweenness'], equal_var=False)
results_activity = ttest_ind(successful['avg_activity'], unsuccessful['avg_activity'], equal_var=False)

# Printing the results for all metrics
print("T-test results for Degree Centrality:", results_degree)
print("T-test results for Closeness Centrality:", results_closeness)
print("T-test results for Betweenness Centrality:", results_betweenness)
print("T-test results for Voter Activity Level:", results_activity)