# Dataset Creation

## Imports

In [6]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Directories whose entries are listed and read one file at a time below.
BASE_SIMULATION_DIR = '../data/simulations/'
BASE_MEASURE_DIR = '../data/measures/'
# Short aliases for the two simulation outcome column names.
C = 'consensus_time'
F = 'opinion_change_frequency'

## Reading CSV files

### Measures

In [7]:
# Load every measures file and stack them into a single frame.
# Entries are sorted so the row order is reproducible (os.listdir returns
# names in arbitrary order); hidden entries and sub-directories such as
# `.ipynb_checkpoints` are skipped because pd.read_csv cannot parse them.
measures_file_list = []
for file_name in sorted(os.listdir(BASE_MEASURE_DIR)):
    measure_path = os.path.join(BASE_MEASURE_DIR, file_name)
    if file_name.startswith('.') or not os.path.isfile(measure_path):
        continue
    # The first CSV column becomes the row index.
    type_measures = pd.read_csv(measure_path, index_col=0)
    # Tag each row with its source file name (the text before the first dot).
    type_measures.insert(0, 'type', file_name.split('.')[0])
    measures_file_list.append(type_measures)

measures_df = pd.concat(measures_file_list)

# 'type' was inserted at position 0 and is a label, not a measured feature.
FEATURES = list(measures_df.columns)[1:]

In [8]:
# Name the row index 'network'; the merge further down joins on that name.
measures_df = measures_df.rename_axis(index='network')

In [9]:
# Discard rows that have no closeness value.
measures_df = measures_df.dropna(subset=['closeness'])
# Every column after the leading 'type' column is treated as a feature.
FEATURES = measures_df.columns[1:].tolist()
measures_df.describe()

Unnamed: 0,clustering,closeness,n
count,197.0,197.0,197.0
mean,0.518071,0.697597,1259.989848
std,0.279386,0.141018,142.531987
min,0.019483,0.381968,1013.0
25%,0.299717,0.58816,1146.0
50%,0.518177,0.674908,1269.0
75%,0.766184,0.810564,1387.0
max,0.994529,0.994563,1493.0


In [10]:
# Persist the filtered measures table.
# NOTE(review): index=False omits the 'network' index from the file, so rows in
# measures.csv cannot be traced back to a network — confirm this is intended.
measures_df.to_csv('../data/measures.csv', index=False)

### Simulations

In [81]:
# Load every simulation result file and stack them into a single frame.
# Entries are sorted so the row order is reproducible (os.listdir returns
# names in arbitrary order); hidden entries and sub-directories such as
# `.ipynb_checkpoints` are skipped because pd.read_csv cannot parse them.
simulations_file_list = []
for file_name in sorted(os.listdir(BASE_SIMULATION_DIR)):
    simulation_path = os.path.join(BASE_SIMULATION_DIR, file_name)
    if file_name.startswith('.') or not os.path.isfile(simulation_path):
        continue
    # Two header rows yield two-level columns; the first column is the row index.
    simulations_file_list.append(
        pd.read_csv(simulation_path, index_col=0, header=[0, 1])
    )

simulations_df = pd.concat(simulations_file_list)

In [82]:
# Move the outer column level into the rows, then expose both index levels
# as ordinary 'network' and 'initialization' columns.
simulations_df = (
    simulations_df
    .stack(level=0, future_stack=True)
    .rename_axis(['network', 'initialization'])
    .reset_index()
)

In [83]:
# Preview the first rows of the reshaped simulation table.
simulations_df.head()

Unnamed: 0,network,initialization,consensus_time,opinion_change_frequency
0,erdos_renyi_78.edgelist,random,5.93,343.76
1,erdos_renyi_100.edgelist,random,6.7,360.48
2,erdos_renyi_153.edgelist,random,10.2,365.38
3,erdos_renyi_0.edgelist,random,4.04,305.61
4,erdos_renyi_151.edgelist,random,4.95,310.04


### Merging and creating the dataset

In [84]:
# Inner join on 'network': networks absent from either table are dropped.
dataset = pd.merge(measures_df, simulations_df, on='network', how='inner')

In [85]:
# Integer-encode the 'type' column into a 'groups' column.
dataset['groups'] = LabelEncoder().fit_transform(dataset['type'])

In [86]:
# Display the merged dataset (pandas elides the middle rows when rendering).
dataset

Unnamed: 0,network,type,clustering,closeness,n,initialization,consensus_time,opinion_change_frequency,groups
0,erdos_renyi_78.edgelist,erdos_renyi,0.475252,0.655912,1474.0,random,5.93,343.76,0
1,erdos_renyi_100.edgelist,erdos_renyi,0.436546,0.639639,1480.0,random,6.70,360.48,0
2,erdos_renyi_153.edgelist,erdos_renyi,0.309121,0.591442,1333.0,random,10.20,365.38,0
3,erdos_renyi_0.edgelist,erdos_renyi,0.644705,0.737895,1415.0,random,4.04,305.61,0
4,erdos_renyi_151.edgelist,erdos_renyi,0.563176,0.696041,1281.0,random,4.95,310.04,0
...,...,...,...,...,...,...,...,...,...
192,erdos_renyi_50.edgelist,erdos_renyi,0.925410,0.930627,1397.0,random,2.23,299.40,0
193,erdos_renyi_69.edgelist,erdos_renyi,0.946267,0.949052,1071.0,random,2.08,246.75,0
194,erdos_renyi_7.edgelist,erdos_renyi,0.083608,0.521646,1330.0,random,42.25,344.00,0
195,erdos_renyi_160.edgelist,erdos_renyi,0.940409,0.943801,1209.0,random,1.94,284.21,0


In [87]:
# Write the merged dataset to disk without the positional row index.
dataset.to_csv('../data/dataset.csv', index=False)