# Linear Regression Model for Cues Aggregation

In [1]:
import os
# Move the working directory one level up before the project imports below.
# The path is relative, so re-running this cell climbs one more level each
# time: run it exactly once per kernel session (Restart Kernel -> Run All).
# NOTE(review): presumably the notebook sits one level below the project root,
# where `dice` and the relative data paths are expected to resolve — confirm.
os.chdir("../")
from dice import Inputs
from dice.constants import Dimensions
from dice.misc import Table
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Folder handed to `Inputs(...)` when loading the cues (relative path).
INPUTS_FOLDER = "scratch/conceptnet"
# Only annotation rows whose `source_1` column equals this value are kept.
INPUTS_SOURCE = "ConceptNet"
# CSV file of annotations, read with `pd.read_csv` (relative path).
ANNOTATION_PATH = "data/annotation/ppref_gold_all_2019_08_12.csv"

## 1. Data Pre-Processing

Gathering "features".

In [3]:
# Load the cue values and arrange them as a feature matrix with one column
# per cue class (columns are named after the cue classes).
inputs = Inputs(INPUTS_FOLDER)
detective = inputs.get_detective()
cues = {cls.__name__: d for cls, d in detective.cues.items()}
df_cues = pd.DataFrame(cues)

# Keep only complete annotation rows for the configured source.
# A boolean mask is used instead of `.where(...)`: `.where` first blanks the
# rejected rows with NaN, which upcasts integer columns such as `index_1` /
# `index_2` to float before they are used as row labels. The mask selects
# the same rows and leaves the dtypes alone.
df_annotation_raw = pd.read_csv(ANNOTATION_PATH)
source_mask = df_annotation_raw["source_1"] == INPUTS_SOURCE
df_annotation = df_annotation_raw[source_mask].dropna()

# Features are needed for every item appearing on either side of a pair.
indices = sorted(set(df_annotation["index_1"]).union(df_annotation["index_2"]))
X = df_cues.loc[indices]
X.shape



(399, 7)

Building target values from pairwise preferences.

In [4]:
# Target frame: same row labels as X, one column per dimension.
# Every target starts at 0.5 and is overwritten with 0 or 1 whenever an
# annotation row expresses a preference between its two items.
y = pd.DataFrame(index=X.index)
default = [.5] * X.shape[0]
for dimension in Dimensions.iter():
    y[dimension] = default
    for _, row in df_annotation.iterrows():
        i1 = row["index_1"]
        i2 = row["index_2"]
        # Assign with a single `.loc[row, column]` call. The chained form
        # `y.loc[i][dimension] = v` writes into a temporary object, which
        # triggers SettingWithCopy and is silently lost under pandas
        # Copy-on-Write.
        if row[dimension] > 3:
            y.loc[i1, dimension] = 0
            y.loc[i2, dimension] = 1
        elif row[dimension] < 3:
            y.loc[i1, dimension] = 1
            y.loc[i2, dimension] = 0
        # A rating of exactly 3 leaves both targets unchanged.
y.shape

(399, 4)

Splitting into train/test sets.

In [5]:
# Positional split without shuffling: the first 66% of the rows are used for
# training, the remainder for testing.
split = int(.66 * len(X))

X_train, y_train = X.iloc[:split], y.iloc[:split]
X_test, y_test = X.iloc[split:], y.iloc[split:]

for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(label, part.shape)

X_train (263, 7)
X_test (136, 7)
y_train (263, 4)
y_test (136, 4)


## 2. Regression

Linear regression.

In [6]:
# Fit one linear model per dimension, recording its train/test scores and
# its fitted parameters (one weight per feature column plus the intercept).
feature_names = list(X.columns) + ["Intercept"]
table = [["Dimension", "Train score", "Test score"]]
coefs = {}
for dimension in Dimensions.iter():
    target_train = y_train[dimension]
    target_test = y_test[dimension]

    model = LinearRegression()
    model.fit(X_train, target_train)

    fitted = list(model.coef_) + [model.intercept_]
    coefs[dimension] = dict(zip(feature_names, fitted))

    train_score = round(model.score(X_train, target_train), 3)
    test_score = round(model.score(X_test, target_test), 3)
    table.append([Dimensions.label(dimension), str(train_score), str(test_score)])

print(Table(table))
# Show the coefficients with readable dimension labels as column headers.
pd.DataFrame(coefs).rename(columns={
    dimension: Dimensions.label(dimension) for dimension in Dimensions.iter()
})

 Dimension | Train score | Test score
---------- | ----------- | ----------
 Plausible |    0.021    |   -0.087  
  Typical  |    0.068    |   -0.092  
Remarkable |    0.075    |    0.053  
  Salient  |    0.045    |    0.059  


Unnamed: 0,Plausible,Typical,Remarkable,Salient
ContradictionCue,0.054704,-0.001074958,-0.3154162,-0.197963
EntailmentCue,0.171114,-0.3267008,0.008643962,0.194674
EntropyCue,0.073193,0.02613345,-0.04412262,-0.06467
ImplicationCue,-8.563788,-23.8687,54.22901,-16.660352
Intercept,8.976385,24.45182,-53.68325,17.175023
JointCue,761462.151479,-2105292.0,2702355.0,391185.526867
NecessityCue,-3.804746,-21.8102,34.83016,26.387986
SufficiencyCue,-0.43858,0.667121,-0.3262629,-0.158179


Printing coefficients in a copy/pastable format.

In [7]:
# Emit one `COEF_<DIMENSION>_<NAME> = value` line per fitted parameter; the
# "CUE" substring is stripped from the upper-cased parameter name.
for dimension, weights in coefs.items():
    prefix = "COEF_" + Dimensions.label(dimension).upper() + "_"
    for cue, weight in weights.items():
        suffix = cue.upper().replace("CUE", "")
        print(prefix + suffix, "=", weight)

COEF_PLAUSIBLE_ENTROPY = 0.07319310712581154
COEF_PLAUSIBLE_JOINT = 761462.151478788
COEF_PLAUSIBLE_NECESSITY = -3.804745786732383
COEF_PLAUSIBLE_SUFFICIENCY = -0.4385799410229083
COEF_PLAUSIBLE_IMPLICATION = -8.563788354368626
COEF_PLAUSIBLE_CONTRADICTION = 0.054703906062059104
COEF_PLAUSIBLE_ENTAILMENT = 0.1711135775403818
COEF_PLAUSIBLE_INTERCEPT = 8.976384686940419
COEF_TYPICAL_ENTROPY = 0.026133454201674688
COEF_TYPICAL_JOINT = -2105291.702746237
COEF_TYPICAL_NECESSITY = -21.810199417310287
COEF_TYPICAL_SUFFICIENCY = 0.6671210307395086
COEF_TYPICAL_IMPLICATION = -23.868703607996395
COEF_TYPICAL_CONTRADICTION = -0.0010749581269919872
COEF_TYPICAL_ENTAILMENT = -0.3267008303082548
COEF_TYPICAL_INTERCEPT = 24.451817729183173
COEF_REMARKABLE_ENTROPY = -0.044122616549911835
COEF_REMARKABLE_JOINT = 2702355.3362207045
COEF_REMARKABLE_NECESSITY = 34.830161827936536
COEF_REMARKABLE_SUFFICIENCY = -0.326262857648544
COEF_REMARKABLE_IMPLICATION = 54.22900918760752
COEF_REMARKABLE_CONTRADICTION

## 3. Regressors Comparison

In [8]:
# Seed shared by every stochastic regressor so that the score tables below
# can be reproduced after Restart Kernel -> Run All.
RANDOM_STATE = 42

regressors = (
    ("MLP", MLPRegressor),
    ("RF", RandomForestRegressor),
    ("LinReg", LinearRegression),
)


def _empty_scores():
    """Return a {dimension label: {regressor name: None}} scaffold."""
    return {
        Dimensions.label(dimension): {name: None for name, _ in regressors}
        for dimension in Dimensions.iter()
    }


train = _empty_scores()
test = _empty_scores()

for name, cls in regressors:
    # Pass the seed only to estimators that expose a `random_state`
    # parameter (LinearRegression, for instance, does not).
    params = {}
    if "random_state" in cls().get_params():
        params["random_state"] = RANDOM_STATE
    for dimension in Dimensions.iter():
        model = cls(**params)
        model.fit(X_train, y_train[dimension])
        label = Dimensions.label(dimension)
        train[label][name] = model.score(X_train, y_train[dimension])
        test[label][name] = model.score(X_test, y_test[dimension])

print("Train R2")
print(pd.DataFrame(train))
print("\nTest R2")
print(pd.DataFrame(test))

Train R2
        Plausible   Typical  Remarkable   Salient
LinReg   0.020625  0.067768    0.074897  0.045058
MLP      0.014458  0.041509    0.032982  0.027860
RF       0.802374  0.792857    0.796818  0.822277

Test R2
        Plausible   Typical  Remarkable   Salient
LinReg  -0.087163 -0.092273    0.052678  0.059447
MLP     -0.072432 -0.082921    0.077157  0.022396
RF      -0.219514 -0.252401   -0.174938 -0.107835


