# Modeling

In [None]:
# Install the packages listed in requirements.txt with the same interpreter
# that runs this notebook kernel (sys.executable), so the packages are
# installed for the Python that will import them.
import sys
!{sys.executable} -m pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, ensemble
from sklearn.linear_model._base import LinearModel
from sklearn.ensemble._forest import RandomForestRegressor, RandomForestClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from typing import List, Callable
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, explained_variance_score, max_error
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.base import _is_fitted

## Data Loading

Load the training, test, and validation splits of the dataset.

In [None]:
# Read the three dataset splits. Each file is comma-separated and its first
# row holds the column names.
df_train = pd.read_csv(
    'data/interaction_train_set.csv',
    sep=',',
    header=0,
)
df_test = pd.read_csv(
    'data/interaction_test_set.csv',
    sep=',',
    header=0,
)
df_val = pd.read_csv(
    'data/interaction_val_set.csv',
    sep=',',
    header=0,
)

# Render every split in the notebook output, in train / test / validation order.
for split_frame in (df_train, df_test, df_val):
    display(split_frame)

In [None]:
# Standardize every feature column (zero mean, unit variance) before PCA.
# Rows must stay samples and columns must stay features: transposing the
# frame would standardize each sample across its five features and make PCA
# treat the five features as the observations.
scaled_data = preprocessing.scale(df_train[['reciprocity', 'multiplexity', 'closeness', 'sentiment','interactionFrequency']])

In [None]:
# Fit a PCA that keeps all components (n_components is left at its default)
# and project the scaled data onto the fitted components.
pca = PCA()
pca.fit(scaled_data)
pca_data = pca.transform(scaled_data)

In [None]:
# Share of the total variance captured by each principal component,
# expressed in percent and rounded to one decimal place.
per_var = np.round(100 * pca.explained_variance_ratio_, 1)
# Tick labels for the scree plot: PC1, PC2, ... one per component.
labels = [f'PC{rank}' for rank, _ in enumerate(per_var, start=1)]

In [None]:
# Scree plot: one bar per principal component, bar height is the percentage
# of variance that component explains.
plt.bar(range(1, len(per_var) + 1), per_var, tick_label=labels)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Percentage of Explained Variance')
plt.show()

In [None]:
# Plot the target (tieStrength) against each of the five features,
# one panel per feature in a single row.
pp = sns.pairplot(
    df_train,
    x_vars=['reciprocity', 'multiplexity', 'closeness', 'sentiment', 'interactionFrequency'],
    y_vars=['tieStrength'],
)

## Train Model

In [None]:

# Feature columns used as model inputs.
x_params = [
    'reciprocity',
    'multiplexity',
    'closeness',
    'sentiment',
    'interactionFrequency',
]
# Column holding the value the models predict.
y_params = "tieStrength"

In [None]:
def _score_predictions(model_name: str, set_name: str, y_true, y_pred, metrics: List[Callable]) -> dict:
    """Build one result row: identifying labels plus one entry per metric.

    Each metric is called as ``metric(y_true, y_pred)`` and stored under the
    metric function's ``__name__``.
    """
    row = {"model_name": model_name, "set_name": set_name}
    for metric in metrics:
        row[metric.__name__] = metric(y_true, y_pred)
    return row


def train_model(model, train: pd.DataFrame, validation: pd.DataFrame, x: List[str], y: str, metrics: List[Callable], pca=None):
    """Fit ``model`` on the training split and score it on both splits.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        train: Frame containing the training rows.
        validation: Frame containing the validation rows.
        x: Names of the feature columns.
        y: Name of the target column.
        metrics: Functions called as ``metric(y_true, y_pred)``; each result
            is stored under the function's ``__name__``.
        pca: Optional transformer applied to the features of both splits.
            If it is not fitted yet it is fitted on the training features;
            otherwise it is reused as is.

    Returns:
        A list with two dicts, the validation row first and the training row
        second. Each holds ``model_name``, ``set_name`` and one key per metric.
    """
    train_x = train[x].to_numpy()
    train_y = train[y].to_numpy()
    validation_x = validation[x].to_numpy()
    validation_y = validation[y].to_numpy()

    if pca is not None:
        # Fit the projection on training data only, so the validation split
        # never influences it; an already fitted transformer is reused.
        # NOTE(review): _is_fitted is an underscore-prefixed scikit-learn
        # helper, so it may change between releases.
        if _is_fitted(pca):
            train_x = pca.transform(train_x)
        else:
            train_x = pca.fit_transform(train_x)
        validation_x = pca.transform(validation_x)

    model.fit(train_x, train_y)
    model_name = model.__class__.__name__

    validation_row = _score_predictions(
        model_name, "validation", validation_y, model.predict(validation_x), metrics
    )
    training_row = _score_predictions(
        model_name, "training", train_y, model.predict(train_x), metrics
    )
    return [validation_row, training_row]


In [None]:
# Candidate regressors; every one is trained and scored through the same routine.
models = [
    linear_model.LinearRegression(),
    ensemble.RandomForestRegressor(),
    linear_model.Lasso(),
    linear_model.ElasticNet(),
    linear_model.Ridge(),
    linear_model.PoissonRegressor(),
    # Multi-layer perceptron with hidden layers of 5, 4 and 3 units,
    # optimized with the L-BFGS solver.
    MLPRegressor(
        activation='relu',
        alpha=0.0001,
        batch_size='auto',
        beta_1=0.9,
        beta_2=0.999,
        early_stopping=False,
        epsilon=1e-08,
        hidden_layer_sizes=(5, 4, 3,),
        learning_rate='constant',
        learning_rate_init=0.001,
        max_fun=15000,
        max_iter=200,
        momentum=0.9,
        n_iter_no_change=10,
        nesterovs_momentum=True,
        power_t=0.5,
        random_state=None,
        shuffle=True,
        solver='lbfgs',
        tol=0.0001,
        validation_fraction=0.1,
        verbose=False,
        warm_start=False,
    ),
    # Support vector regression with a degree-3 polynomial kernel.
    SVR(
        kernel="poly",
        C=100,
        gamma="auto",
        degree=3,
        epsilon=0.1,
        coef0=1,
    ),
]
# Score functions; each is called with the true and the predicted targets and
# reported under its function name.
metrics = [
    mean_squared_error,
    root_mean_squared_error,
    mean_absolute_error,
    explained_variance_score,
    max_error,
]

In [None]:

def train_models(model, train: pd.DataFrame, validation: pd.DataFrame, x: List[str], y: str, metrics: List[Callable], pca=None):
    """Train and score every estimator in ``model``.

    Args:
        model: Iterable of estimators (the name is singular for backward
            compatibility with existing callers).
        train: Frame containing the training rows.
        validation: Frame containing the validation rows.
        x: Names of the feature columns.
        y: Name of the target column.
        metrics: Functions called as ``metric(y_true, y_pred)``.
        pca: Optional transformer forwarded unchanged to ``train_model``
            for every estimator.

    Returns:
        A DataFrame with the rows returned by ``train_model`` for each
        estimator, in iteration order.
    """
    results = []
    # Iterate the argument itself; looping over the notebook-level `models`
    # list would silently ignore whatever the caller passed in.
    for estimator in model:
        results.extend(train_model(estimator, train, validation, x, y, metrics, pca))
    return pd.DataFrame(results)
        
# Fit the candidate models on the training split and collect their training
# and validation scores in one table.
model_results = train_models(
    model=models,
    train=df_train,
    validation=df_val,
    x=x_params,
    y=y_params,
    metrics=metrics,
)
model_results

## Evaluation

In [None]:
def eval_model(model, test: pd.DataFrame, x: List[str], y: str, metrics: List[Callable], pca=None):
    """Score an already fitted ``model`` on the test split.

    Args:
        model: Estimator exposing ``predict(X)``; it is not fitted here.
        test: Frame containing the test rows.
        x: Names of the feature columns.
        y: Name of the target column.
        metrics: Functions called as ``metric(y_true, y_pred)``; each result
            is stored under the function's ``__name__``.
        pca: Optional transformer; when given, its ``transform`` is applied
            to the test features before predicting.

    Returns:
        A one-element list holding a dict with ``model_name``, ``set_name``
        (always ``"test"``) and one key per metric.
    """
    test_x = test[x].to_numpy()
    test_y = test[y].to_numpy()

    # Identity check: `!= None` would go through the object's own comparison.
    if pca is not None:
        test_x = pca.transform(test_x)

    predicted_y = model.predict(test_x)
    test_results = {"model_name": model.__class__.__name__, "set_name": "test"}
    for metric in metrics:
        test_results[metric.__name__] = metric(test_y, predicted_y)
    return [test_results]



In [None]:
def eval_models(model, test: pd.DataFrame, x: List[str], y: str, metrics: List[Callable], pca=None):
    """Score every estimator in ``model`` on the test split.

    Args:
        model: Iterable of already fitted estimators (the name is singular
            for backward compatibility with existing callers).
        test: Frame containing the test rows.
        x: Names of the feature columns.
        y: Name of the target column.
        metrics: Functions called as ``metric(y_true, y_pred)``.
        pca: Optional transformer forwarded unchanged to ``eval_model``.

    Returns:
        A DataFrame with one row per estimator, in iteration order.
    """
    results = []
    # Iterate the argument itself; looping over the notebook-level `models`
    # list would silently ignore whatever the caller passed in.
    for estimator in model:
        results.extend(eval_model(estimator, test, x, y, metrics, pca))
    return pd.DataFrame(results)

# Score the models on the held-out test split.
eval_results = eval_models(
    model=models,
    test=df_test,
    x=x_params,
    y=y_params,
    metrics=metrics,
)
eval_results
        

In [None]:
# Stack the test scores on top of the training/validation scores and order
# the combined table by model name.
results = pd.concat([eval_results, model_results]).sort_values("model_name")
results

## Visualization

In [None]:
# Use the model name as the row index so plots are labelled per model.
grouped_results = results.set_index(keys=["model_name"])
grouped_results

In [None]:

# Bar chart of the root mean squared error per model, with one bar for each
# data split (set_name values become the columns after the pivot).
(
    grouped_results[['set_name', 'root_mean_squared_error']]
    .pivot(columns='set_name')
    .plot.bar()
)

In [None]:
# Summary statistics of the test split's columns.
df_test.describe()