# Tests
The objective here is to summarize and summarize results from different response variables, initialization methods and different models using R2

In [1]:
import pandas as pd
import os
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from collections import Counter

CONSENSUS_TIME = 'consensus_time'
OPINION_CHANGE_FREQUENCY = 'opinion_change_frequency'

FEATURES = [
    'clustering', 'closeness', 'betweenness',
    'average_shortest_path_lenght', 'eigenvector', 'assortativity',
    'information_centrality',
    'approximate_current_flow_betweenness_centrality', 'shannon_entropy',
    'degree_variance'
]

In [2]:
dataset = pd.read_csv('../data/dataset.csv')

## Imports

In [3]:
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, GroupKFold, StratifiedGroupKFold, StratifiedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import PoissonRegressor, TweedieRegressor, LinearRegression
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
import optuna

## Metrics

In [4]:
p = 1
from sklearn.metrics import r2_score

def adjusted_r2_score(y: np.array, y_hat: np.array):
    """
    Computes R² score. It decreases if many variables are introduced.
    n: number of observations in sample 
    p: number of independent variables
    """
    n = len(y_hat)
    return 1 - ((1 - r2_score(y, y_hat)) * ((n - 1)/(n - p - 1)))

## Linear Regression CV

In [31]:
def LinearRegressionCV(X, y):
    cross_validation = KFold(n_splits=10)

    splits = cross_validation.split(
        X, 
        y=y
    )

    score_list = []
    feature_list = []
    
    regressor = LinearRegression()
    feature_selector = SequentialFeatureSelector(
        regressor,
        # tol=0.01,
        n_features_to_select=2,
        direction='forward'
    )
    
    for train_index, val_index in splits:
        _X_train, _X_val = X.iloc[train_index], X.iloc[val_index]
        _y_train, _y_val = y.iloc[train_index], y.iloc[val_index]
        
        # Feature selection
        sfs = clone(feature_selector) 
        sfs.fit(_X_train, _y_train)
        selected_subset = np.array(FEATURES)[sfs.get_support()]
        p = len(selected_subset)

        # Evaluation
        model = clone(regressor)
        model.fit(_X_train[selected_subset], _y_train)
        y_val_pred = model.predict(_X_val[selected_subset])
        
        score = adjusted_r2_score(np.exp(_y_val), np.exp(y_val_pred))
        score_list.append(score)

        for selected_feature in selected_subset:
            feature_list.append(str(selected_feature))

    cv_subset = []
    for feature, _ in Counter(feature_list).most_common():
        cv_subset.append(feature)

    print(f'Mean Score: {np.mean(score_list):.3f}')

    if np.mean(score_list) < 0.9:
        print(score_list)
    
    return cv_subset[:2]

In [32]:
feature_scaler = StandardScaler()

## Tests

In [36]:
results_dict = {}

for response_variable in [OPINION_CHANGE_FREQUENCY, CONSENSUS_TIME]:
    for initialization in ['random', 'direct', 'inverse']:
        # Prepare dataset
        filtered_dataset = dataset[dataset['initialization'] == initialization]
        X = filtered_dataset[FEATURES]
        y = np.log(filtered_dataset[response_variable])
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2
        )

        # Scale and standardize
        feature_scaler.fit(X_train[FEATURES])
        X_train[FEATURES] = feature_scaler.transform(X_train[FEATURES])
        X_test[FEATURES] = feature_scaler.transform(X_test[FEATURES])
        
        # Select variables using Forward Selection
        selected_subset = LinearRegressionCV(X_train, y_train)

        # Train and predict
        model = LinearRegression()
        model.fit(X_train[selected_subset], y_train)
        y_train_pred = model.predict(X_train[selected_subset])
        train_score = adjusted_r2_score(y_train_pred, y_train)
        y_test_pred = model.predict(X_test[selected_subset])
        test_score = adjusted_r2_score(y_test_pred, y_test)

        # Save
        instance_dict = {
            'r2_train': train_score,
            'r2_test': test_score,
            'selected_features': selected_subset 
        }
        results_dict[(response_variable, initialization)] = instance_dict


Mean Score: 0.955
Mean Score: 0.972
Mean Score: 0.976
Mean Score: 0.976
Mean Score: 0.981
Mean Score: 0.738
[0.38428672555993626, 0.9190513903036354, 0.9460086912117606, 0.7495679392807035, 0.9527271520716553, 0.9422435128796287, 0.8506331243610112, -0.08578033136141716, 0.7932705944554319, 0.9251997682439684]


In [37]:
pd.DataFrame(results_dict).T

Unnamed: 0,Unnamed: 1,r2_train,r2_test,selected_features
opinion_change_frequency,random,0.989096,0.994756,"[clustering, approximate_current_flow_betweenn..."
opinion_change_frequency,direct,0.9971,0.995814,"[clustering, eigenvector]"
opinion_change_frequency,inverse,0.994974,0.993011,"[clustering, eigenvector]"
consensus_time,random,0.993358,0.993846,"[closeness, degree_variance]"
consensus_time,direct,0.990412,0.990563,"[closeness, shannon_entropy]"
consensus_time,inverse,0.992224,0.991881,"[closeness, betweenness]"
