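# Dataset Creation

This notebook reads the network measures and the simulation results from CSV files, merges them on the network name, and writes the combined table to `../data/dataset.csv`.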

## Imports

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Folders (relative to the notebook's working directory) whose entries are
# listed with os.listdir and read with pd.read_csv in the loading cells.
BASE_SIMULATION_DIR = '../data/simulations/'
BASE_MEASURE_DIR = '../data/measures/'
# Shorthands for the two outcome column names of the simulation table.
C = 'consensus_time'
F = 'opinion_change_frequency'

## Reading CSV files

### Measures

In [2]:
# Read every measures CSV found in BASE_MEASURE_DIR and concatenate them.
# os.listdir returns entries in arbitrary order and can include hidden entries
# or sub-directories (e.g. .ipynb_checkpoints), so the listing is sorted for a
# reproducible row order and restricted to regular, non-hidden files.
measures_file_list = []
for network in sorted(os.listdir(BASE_MEASURE_DIR)):
    measure_path = os.path.join(BASE_MEASURE_DIR, network)
    if network.startswith('.') or not os.path.isfile(measure_path):
        continue
    df = pd.read_csv(measure_path, index_col=0)
    # Tag each row with the file's base name (the text before the first dot).
    df.insert(0, 'type', network.split('.')[0])
    measures_file_list.append(df)

measures_df = pd.concat(measures_file_list)

# Column names at this point; the list still includes the inserted 'type' column.
FEATURES = list(measures_df.columns)

In [3]:
# Name the row index 'network' so it can later be used as a join key.
measures_df = measures_df.rename_axis(index='network')

In [4]:
# Keep only rows whose average shortest path length is not infinite and whose
# closeness is present. NaN never compares equal to anything (NaN != NaN is
# True), so a `!= np.nan` test keeps every row; `.notna()` is the real
# missing-value check.
path_not_inf = measures_df['average_shortest_path_lenght'] != np.inf
closeness_present = measures_df['closeness'].notna()
measures_df = measures_df[path_not_inf & closeness_present]
# Skip the first column (the 'type' label) so FEATURES holds only measure names.
FEATURES = list(measures_df.columns)[1:]
measures_df.describe()

Unnamed: 0,clustering,closeness,betweenness,average_shortest_path_lenght,eigenvector,assortativity,information_centrality,approximate_current_flow_betweenness_centrality,shannon_entropy,degree_variance
count,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0
mean,0.134973,0.302541,1522.851944,4.044704,0.230008,-0.018908,0.003183,0.007993,2.980314,40.39391
std,0.238953,0.10175,1151.472082,2.302944,0.173878,0.163531,0.000841,0.00672,1.210182,59.354528
min,0.008343,0.096122,548.311,2.095622,0.048632,-0.39402,0.001307,0.00323,0.377454,3.452053
25%,0.0235,0.295687,1002.345,3.00369,0.077912,-0.028069,0.002955,0.004963,2.621872,6.344289
50%,0.026425,0.313753,1101.9025,3.202805,0.101251,-0.005222,0.003437,0.005224,3.645626,14.142136
75%,0.032066,0.334649,1198.682,3.396364,0.400087,0.117568,0.003652,0.005649,3.831956,32.907699
max,0.652846,0.478556,4797.811,10.594622,0.621034,0.204217,0.00429,0.027219,4.027738,204.959142


### Simulations

In [5]:
# Read every simulation CSV found in BASE_SIMULATION_DIR and concatenate them.
# The files carry two header rows, which pandas loads as two-level columns.
# os.listdir returns entries in arbitrary order and can include hidden entries
# or sub-directories (e.g. .ipynb_checkpoints), so the listing is sorted for a
# reproducible row order and restricted to regular, non-hidden files.
simulations_file_list = []
for network in sorted(os.listdir(BASE_SIMULATION_DIR)):
    simulation_path = os.path.join(BASE_SIMULATION_DIR, network)
    if network.startswith('.') or not os.path.isfile(simulation_path):
        continue
    df = pd.read_csv(simulation_path, index_col=0, header=[0, 1])
    simulations_file_list.append(df)

simulations_df = pd.concat(simulations_file_list)

In [6]:
# Reshape from wide to long: move the outer column level into the row index,
# label the two index levels, then turn both levels into ordinary columns.
simulations_df = (
    simulations_df
    .stack(level=0, future_stack=True)
    .rename_axis(['network', 'initialization'])
    .reset_index()
)

In [7]:
# Preview the first five rows of the long-format simulation table.
simulations_df.head(n=5)

Unnamed: 0,network,initialization,consensus_time,opinion_change_frequency
0,barabasi_linear_60.edgelist,random,471.08,248.74
1,barabasi_linear_60.edgelist,direct,535.92,200.0
2,barabasi_linear_60.edgelist,inverse,323.41,386.6
3,barabasi_linear_91.edgelist,random,431.67,248.66
4,barabasi_linear_91.edgelist,direct,519.85,200.0


## Merging and creating the dataset

In [20]:
# Inner join on 'network' (the index name on the measures side, a column on the
# simulations side): only networks present in both tables are kept.
dataset = pd.merge(measures_df, simulations_df, how='inner', on='network')

In [21]:
# Integer-encode the 'type' label into a new 'groups' column.
# LabelEncoder is imported in the notebook's import cell.
dataset['groups'] = LabelEncoder().fit_transform(dataset['type'])

In [22]:
# Display the merged table (one row per network / initialization pair).
dataset

Unnamed: 0,network,type,clustering,closeness,betweenness,average_shortest_path_lenght,eigenvector,assortativity,information_centrality,approximate_current_flow_betweenness_centrality,shannon_entropy,degree_variance,initialization,consensus_time,opinion_change_frequency,groups
0,barabasi_linear_60.edgelist,barabasi_linear,0.032393,0.333205,1008.953,3.016906,0.082985,-0.012140,0.003442,0.005008,3.641398,28.577380,random,471.08,248.74,0
1,barabasi_linear_60.edgelist,barabasi_linear,0.032393,0.333205,1008.953,3.016906,0.082985,-0.012140,0.003442,0.005008,3.641398,28.577380,direct,535.92,200.00,0
2,barabasi_linear_60.edgelist,barabasi_linear,0.032393,0.333205,1008.953,3.016906,0.082985,-0.012140,0.003442,0.005008,3.641398,28.577380,inverse,323.41,386.60,0
3,barabasi_linear_91.edgelist,barabasi_linear,0.032772,0.335566,998.585,2.996170,0.081414,-0.026616,0.003427,0.004956,3.629732,31.464265,random,431.67,248.66,0
4,barabasi_linear_91.edgelist,barabasi_linear,0.032772,0.335566,998.585,2.996170,0.081414,-0.026616,0.003427,0.004956,3.629732,31.464265,direct,519.85,200.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1669,watts-strogatz_76.edgelist,watts-strogatz,0.645758,0.117242,3798.980,8.596960,0.429491,0.008234,0.001636,0.021184,0.596770,3.452053,direct,41172.93,7879.43,4
1670,watts-strogatz_76.edgelist,watts-strogatz,0.645758,0.117242,3798.980,8.596960,0.429491,0.008234,0.001636,0.021184,0.596770,3.452053,inverse,37483.28,6954.85,4
1671,watts-strogatz_69.edgelist,watts-strogatz,0.646363,0.116023,3854.486,8.707972,0.606427,-0.011195,0.001584,0.021171,0.560623,3.452053,random,13790.04,1911.82,4
1672,watts-strogatz_69.edgelist,watts-strogatz,0.646363,0.116023,3854.486,8.707972,0.606427,-0.011195,0.001584,0.021171,0.560623,3.452053,direct,43281.68,7024.76,4


In [23]:
# Persist the merged table; index=False leaves the row index out of the file.
dataset.to_csv(path_or_buf='../data/dataset.csv', index=False)