# Prediction of Dynamic Variables in Sznajd Model

### Imports

In [41]:
import pandas as pd
import os
from tqdm import tqdm
import numpy as np

BASE_SIMULATION_DIR = 'data/simulations/'
BASE_MEASURE_DIR = 'data/measures/'
C = 'consensus_time'
F = 'opinion_change_frequency'

## Reading CSV files

### Measures

In [2]:
measures_file_list = []
for network in os.listdir(BASE_MEASURE_DIR):
    measures_file_list.append(pd.read_csv(BASE_MEASURE_DIR + network, index_col=0))

measures_df = pd.concat(measures_file_list)

FEATURES = list(measures_df.columns)

In [3]:
measures_df = measures_df.rename_axis(['network'])
measures_df

Unnamed: 0_level_0,clustering,closeness,betweenness,average_shortest_path_lenght,eigenvector,assortativity,information_centrality,approximate_current_flow_betweenness_centrality
network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
barabasi_linear_1.edgelist,0.030791,0.336370,994.336,2.987672,0.086131,-0.045484,0.003411,0.005010
barabasi_linear_0.edgelist,0.030984,0.336728,992.753,2.984506,0.083921,-0.041009,0.003408,0.004998
barabasi_linear_7.edgelist,0.031743,0.334444,1003.160,3.005320,0.074283,-0.028950,0.003429,0.004873
barabasi_linear_8.edgelist,0.030694,0.335118,999.887,2.998774,0.078044,-0.035307,0.003420,0.005012
barabasi_linear_4.edgelist,0.030858,0.332497,1012.013,3.023026,0.086636,-0.005945,0.003436,0.004902
...,...,...,...,...,...,...,...,...
watts-strogatz_2.edgelist,0.648514,0.103962,4426.792,9.852584,0.533460,-0.019618,0.001387,0.024015
watts-strogatz_3.edgelist,0.649342,0.105674,4267.552,9.534104,0.501900,0.004032,0.001458,0.024564
watts-strogatz_5.edgelist,0.653446,0.096471,4754.780,10.508560,0.418047,0.009770,0.001283,0.027294
watts-strogatz_7.edgelist,0.645121,0.119526,3732.701,8.464402,0.514369,-0.010666,0.001624,0.022314


### Simulations

In [4]:
simulations_file_list = []
for network in os.listdir(BASE_SIMULATION_DIR):
    simulations_file_list.append(pd.read_csv(BASE_SIMULATION_DIR + network, index_col=0, header=[0,1]))

simulations_df = pd.concat(simulations_file_list)

In [5]:
simulations_df = simulations_df.stack(level=0, future_stack=True).rename_axis(['network','initialization']).reset_index(1)
simulations_df

Unnamed: 0_level_0,initialization,consensus_time,opinion_change_frequency
network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
barabasi_linear_0.edgelist,random,619.0,196.0
barabasi_linear_0.edgelist,direct,264.0,199.0
barabasi_linear_0.edgelist,inverse,871.0,1577.0
barabasi_nonlinear_15_0.edgelist,random,79.0,204.0
barabasi_nonlinear_15_0.edgelist,direct,51.0,199.0
barabasi_nonlinear_15_0.edgelist,inverse,52.0,323.0
erdos_renyi_0.edgelist,random,1000.0,199.0
erdos_renyi_0.edgelist,direct,1000.0,216.0
erdos_renyi_0.edgelist,inverse,353.0,273.0
waxman_0.edgelist,random,501.0,239.0


### Merging and creating the dataset

In [6]:
dataset = measures_df.merge(simulations_df, on='network', how='right')

## Training and prediction

In [27]:
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import clone

In [8]:
X = dataset[FEATURES]
y = dataset[C]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [22]:
model = DecisionTreeRegressor()

### Cross Validation

In [35]:
cross_validation = KFold(n_splits=5)
splits = cross_validation.split(X, y=y)

r2_score_list = []

for train_index, val_index in tqdm(splits):
    # Divide dataset de acordo com os splits da validação cruzada
    _X_train, _X_val = X.iloc[train_index], X.iloc[val_index]
    _y_train, _y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Intanciate and train regressor
    regressor = clone(model)
    regressor.fit(_X_train, _y_train)
    
    # Predict X value
    y_val_pred = regressor.predict(_X_val)
    
    # Compute R2 Score
    score = r2_score(_y_val, y_val_pred)
    r2_score_list.append(score)

5it [00:00, 57.97it/s]


In [42]:
print(f'Mean R2 Score: {np.mean(r2_score_list)} +- {np.std(r2_score_list)}')

Mean R2 Score: -301.40228252266616 +- 602.304178372336


### Testing

In [39]:
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
score = r2_score(y_test, y_test_pred)
print(f'R2 Score on test: {score}')

R2 Score on test: -1.0138996473963724
