In [1]:
%%capture
from featureeng import preprocessing

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import gc
import scipy.stats

import matplotlib.pyplot as plt
%matplotlib inline

## Load Dataset

In [3]:
# Read the ratings table from disk
ratings = pd.read_csv("data/ratings.csv", encoding="ISO-8859-1")
# ratings = ratings[:20]  # optional: restrict to a few rows for a quick dry run

# Column names of the three rating dimensions
# (presumably MOS = mean opinion score, Std = its standard deviation -- confirm against the data description)
mos_columns = ["MOS_Complexity", "MOS_Understandability", "MOS_Lexical_difficulty"]
std_columns = ["Std_Complexity", "Std_Understandability", "Std_Lexical_difficulty"]

# Input variable: the sentences
texts = ratings["Sentence"].values
# Output variables: one array per rating dimension
y1mos, y2mos, y3mos = (ratings[column].values for column in mos_columns)
y1std, y2std, y3std = (ratings[column].values for column in std_columns)

# The table itself is no longer needed; drop it and trigger a collection
del ratings
gc.collect()

4

## Feature Engineering
Note: this step takes approximately 1 hour to run.

In [4]:
%%time
feats1, feats2, feats3, feats4, feats5, feats6 = preprocessing(texts)

CPU times: user 51min 55s, sys: 5min 14s, total: 57min 9s
Wall time: 50min 41s


In [5]:
# Shape of every feature group, in order feats1..feats6
tuple(group.shape for group in (feats1, feats2, feats3, feats4, feats5, feats6))

((1000, 384), (1000, 21), (1000, 56), (1000, 38), (1000, 3), (1000, 20))

In [6]:
# Random stand-ins for each feature group: every xrndK has the same shape as
# featsK and is used further down to replace that group in the model input.
#
# The generator is seeded so that the noise -- and every number derived from
# it below -- is identical on each "Restart Kernel -> Run All".
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)


def _random_row_normalized(shape, rng):
    """Return uniform random noise of the given 2-D shape whose rows sum to 1.

    Parameters
    ----------
    shape : tuple of int
        Shape ``(n_rows, n_cols)`` of the matrix to generate.
    rng : numpy.random.Generator
        Source of randomness; values are drawn uniformly from [0, 1).

    Returns
    -------
    numpy.ndarray
        Array of the requested shape in which every row sums to 1.
    """
    noise = rng.random(size=shape)
    # keepdims=True keeps the row sums as a column so they broadcast per row
    return noise / noise.sum(axis=1, keepdims=True)


xrnd1 = _random_row_normalized(feats1.shape, rng)
xrnd2 = _random_row_normalized(feats2.shape, rng)
xrnd3 = _random_row_normalized(feats3.shape, rng)
xrnd4 = _random_row_normalized(feats4.shape, rng)
xrnd5 = _random_row_normalized(feats5.shape, rng)
xrnd6 = _random_row_normalized(feats6.shape, rng)

In [7]:
# Real feature groups and their random replacements, in matching order
real_groups = (feats1, feats2, feats3, feats4, feats5, feats6)
noise_groups = (xrnd1, xrnd2, xrnd3, xrnd4, xrnd5, xrnd6)


def _with_group_replaced(position):
    """Stack all feature groups column-wise, using noise for the group at `position`."""
    return np.hstack([
        noise_groups[i] if i == position else real_groups[i]
        for i in range(len(real_groups))
    ])


# xinputs0: all real features (baseline)
xinputs0 = np.hstack(real_groups)
# xinputsK: feature group K replaced by its random counterpart
xinputs1, xinputs2, xinputs3, xinputs4, xinputs5, xinputs6 = (
    _with_group_replaced(position) for position in range(len(real_groups))
)

tuple(
    x.shape
    for x in (xinputs0, xinputs1, xinputs2, xinputs3, xinputs4, xinputs5, xinputs6)
)

((1000, 522),
 (1000, 522),
 (1000, 522),
 (1000, 522),
 (1000, 522),
 (1000, 522),
 (1000, 522))

## Load Model

In [8]:
# Restore the saved model from disk.
# compile=False: the model is only used for prediction in this notebook.
MODEL_PATH = "best-model-370c-1-1"
model = tf.keras.models.load_model(MODEL_PATH, compile=False)

2022-06-13 20:12:41.352683: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Inference

In [9]:
# Run the model once on the baseline input (index 0) and once per input in
# which a single feature group was replaced by noise (indices 1..6).
ablation_inputs = (
    xinputs0, xinputs1, xinputs2, xinputs3, xinputs4, xinputs5, xinputs6,
)
y0_pred, y1_pred, y2_pred, y3_pred, y4_pred, y5_pred, y6_pred = (
    model.predict(x) for x in ablation_inputs
)

# Keep only the 'mos' output of each prediction
y0, y1, y2, y3, y4, y5, y6 = (
    pred['mos']
    for pred in (y0_pred, y1_pred, y2_pred, y3_pred, y4_pred, y5_pred, y6_pred)
)

## Loss Functions

In [32]:
# Ground-truth targets, one column per rating dimension
y_true = np.c_[y1mos, y2mos, y3mos]


def _mse_per_target(y_hat):
    """Mean squared error of `y_hat` against the targets, one value per column."""
    return np.power(y_true - y_hat, 2).mean(axis=0)


# l0 is the baseline loss; l1..l6 are the losses with one feature group randomised
l0, l1, l2, l3, l4, l5, l6 = (
    _mse_per_target(y_hat) for y_hat in (y0, y1, y2, y3, y4, y5, y6)
)

# Report how much each loss changes relative to the baseline
print(" Complexity | Understandability | Lexical")
for row_label, ablated_loss in (
    ("  semantic: ", l1),
    ("    syntax: ", l2),
    ("  pos tags: ", l3),
    ("morph tags: ", l4),
    (" phonetics: ", l5),
    (" morphemes: ", l6),
):
    print(row_label + '  '.join([f"{x:5.3f}" for x in ablated_loss - l0]))

 Complexity | Understandability | Lexical
  semantic: 0.694  0.986  0.996
    syntax: 2.273  1.333  1.138
  pos tags: 0.103  0.089  0.066
morph tags: 0.085  0.172  0.085
 phonetics: 0.301  0.191  0.360
 morphemes: 0.130  0.313  0.146


### Correlations

In [11]:
# Pairwise correlations between the three target columns
mos_targets = np.c_[y1mos, y2mos, y3mos]
np.corrcoef(mos_targets, rowvar=False)

array([[1.        , 0.89598348, 0.90518689],
       [0.89598348, 1.        , 0.93461015],
       [0.90518689, 0.93461015, 1.        ]])

In [12]:
# Pairwise correlations between the baseline model outputs (columns of y0)
corr_baseline = np.corrcoef(y0, rowvar=False)
corr_baseline

array([[1.        , 0.8985984 , 0.90813378],
       [0.8985984 , 1.        , 0.93350992],
       [0.90813378, 0.93350992, 1.        ]])

In [13]:
# semantic features randomised: change in output correlations vs. baseline
corr_ablated = np.corrcoef(y1, rowvar=False)
corr_baseline = np.corrcoef(y0, rowvar=False)
np.round(corr_ablated - corr_baseline, 4)

array([[ 0.    , -0.012 ,  0.0289],
       [-0.012 ,  0.    , -0.0429],
       [ 0.0289, -0.0429,  0.    ]])

In [14]:
# syntactic features (node vs token distance) randomised: change in output correlations vs. baseline
corr_ablated = np.corrcoef(y2, rowvar=False)
corr_baseline = np.corrcoef(y0, rowvar=False)
np.round(corr_ablated - corr_baseline, 4)

array([[ 0.    , -0.1709, -0.1549],
       [-0.1709,  0.    , -0.1442],
       [-0.1549, -0.1442,  0.    ]])

In [15]:
# pos tag features randomised: change in output correlations vs. baseline
corr_ablated = np.corrcoef(y3, rowvar=False)
corr_baseline = np.corrcoef(y0, rowvar=False)
np.round(corr_ablated - corr_baseline, 4)

array([[ 0.    , -0.0213, -0.0211],
       [-0.0213,  0.    , -0.0254],
       [-0.0211, -0.0254, -0.    ]])

In [16]:
# morph tag features randomised: change in output correlations vs. baseline
corr_ablated = np.corrcoef(y4, rowvar=False)
corr_baseline = np.corrcoef(y0, rowvar=False)
np.round(corr_ablated - corr_baseline, 4)

array([[ 0.    , -0.0177, -0.0232],
       [-0.0177,  0.    , -0.0258],
       [-0.0232, -0.0258,  0.    ]])

In [17]:
# phonetic features (consonant clusters) randomised: change in output correlations vs. baseline
corr_ablated = np.corrcoef(y5, rowvar=False)
corr_baseline = np.corrcoef(y0, rowvar=False)
np.round(corr_ablated - corr_baseline, 4)

array([[ 0.    , -0.0337, -0.0453],
       [-0.0337,  0.    , -0.03  ],
       [-0.0453, -0.03  , -0.    ]])

In [18]:
# morpheme/lexeme features randomised: change in output correlations vs. baseline
corr_ablated = np.corrcoef(y6, rowvar=False)
corr_baseline = np.corrcoef(y0, rowvar=False)
np.round(corr_ablated - corr_baseline, 4)

array([[ 0.    , -0.0271, -0.0232],
       [-0.0271,  0.    , -0.0259],
       [-0.0232, -0.0259,  0.    ]])