## Explorative Analysis
- [x] preprocess text data with `preprocess.py`

In [1]:
import numpy as np
import pandas as pd
import scipy.stats

### Load data

In [2]:
with open('data/preprocessed.npy', 'rb') as fp:
    feats1 = np.load(fp)
    feats2 = np.load(fp)
    feats3 = np.load(fp)
    feats4 = np.load(fp)
    feats5 = np.load(fp)
    feats6 = np.load(fp)
    feats7 = np.load(fp)
    feats8 = np.load(fp)
    feats9 = np.load(fp)

In [3]:
(
    feats1.shape, feats2.shape, feats3.shape, feats4.shape, feats5.shape, 
    feats6.shape, feats7.shape, feats8.shape, feats9.shape
)

((1000, 384),
 (1000, 21),
 (1000, 16),
 (1000, 47),
 (1000, 3),
 (1000, 20),
 (1000, 6),
 (1000, 2),
 (1000, 4))

In [4]:
df = pd.read_csv("data/ratings.csv", encoding="ISO-8859-1")
texts = df["Sentence"].values
y1mos = df["MOS_Complexity"].values
y1std = df["Std_Complexity"].values
y2mos = df["MOS_Understandability"].values
y2std = df["Std_Understandability"].values
y3mos = df["MOS_Lexical_difficulty"].values
y3std = df["Std_Lexical_difficulty"].values

In [5]:
y_train = np.c_[y1mos, y2mos, y3mos]
# y_train = np.c_[y1mos, y2mos, y3mos, y1std, y2std, y3std]  # not good

### Feature Importance : Random Forest

In [6]:
import sklearn.ensemble

model = sklearn.ensemble.RandomForestRegressor(
    n_estimators=100,
    max_depth=16,
    min_samples_leaf=10,
    bootstrap=True, oob_score=True, max_samples=0.5,
    random_state=42
)

In [7]:
np.random.seed(42)

xinputs = np.hstack([
    feats1,
    feats2, feats3, feats4, feats5, feats6, feats7, feats8, feats9,
    np.random.random((feats1.shape[0], 1))
])

# all
xnames = [f"sbert_{j}" for j in range(feats1.shape[1])] \
    + [f"nodedist_{j}" for j in range(feats2.shape[1])] \
    + [f"postag_{j}" for j in range(feats3.shape[1])] \
    + [f"morphtag_{j}" for j in range(feats4.shape[1])] \
    + [f"consonant_{j}" for j in range(feats5.shape[1])] \
    + [f"lexemes_{j}" for j in range(feats6.shape[1])] \
    + [f"frequency_{j}" for j in range(feats7.shape[1])] \
    + [f"length_{j}" for j in range(feats8.shape[1])] \
    + [f"bigram_{j}" for j in range(feats9.shape[1])] \
    + ["RANDOM"]

# without SBert
# xnames = [f"nodedist_{j}" for j in range(feats2.shape[1])] \
#     + [f"postag_{j}" for j in range(feats3.shape[1])] \
#     + [f"morphtag_{j}" for j in range(feats4.shape[1])] \
#     + [f"consonant_{j}" for j in range(feats5.shape[1])] \
#     + [f"lexemes_{j}" for j in range(feats6.shape[1])] \
#     + [f"frequency_{j}" for j in range(feats7.shape[1])] \
#     + [f"length_{j}" for j in range(feats8.shape[1])] \
#     + [f"bigram_{j}" for j in range(feats9.shape[1])] \
#     + ["RANDOM"]

# SBert Only
# xnames = [f"sbert_{j}" for j in range(feats1.shape[1])] + ["RANDOM"]


In [8]:
%%time
model.fit(X=xinputs, y=y_train)

CPU times: user 8.87 s, sys: 24 ms, total: 8.9 s
Wall time: 8.93 s


RandomForestRegressor(max_depth=16, max_samples=0.5, min_samples_leaf=10,
                      oob_score=True, random_state=42)

In [9]:
# R^2 score (1.0 is best)
print(f"R^2: {model.score(X=xinputs, y=y_train)}")

y_pred = model.predict(xinputs)
print(f"RMSE all: {np.power(y_pred - y_train, 2).mean()}")
print(f"RMSE y_1: {np.power(y_pred[:, 0] - y_train[:, 0], 2).mean()}")

R^2: 0.7563259445026507
RMSE all: 0.32705008899765414
RMSE y_1: 0.28836883243544986


In [10]:
df_fi = pd.DataFrame(index=xnames, data=model.feature_importances_, columns=["fi"])
df_fi = df_fi.sort_values(by="fi", ascending=False)
cutoff = df_fi.loc["RANDOM"].values[0]

In [11]:
selected = df_fi[df_fi["fi"] > cutoff]
# num = len([c for c in selected.index if "sbert_" in c]) 
num = len(selected)
num, num / len(df_fi)

(437, 0.8670634920634921)

In [12]:
dropped = df_fi[df_fi["fi"] <= cutoff]
# num = len([c for c in dropped.index if "sbert_" in c])
num = len(dropped)
num, num / len(df_fi)

(67, 0.13293650793650794)

In [13]:
model2 = sklearn.ensemble.RandomForestRegressor(
    n_estimators=100,
    max_depth=16,
    min_samples_leaf=10,
    bootstrap=True, oob_score=True, max_samples=0.5,
    random_state=42
)

mask = (df_fi["fi"] > cutoff).values
model2.fit(X=xinputs[:, mask], y=y_train)

print(f"R^2: {model2.score(X=xinputs[:, mask], y=y_train)}")
y_pred = model2.predict(xinputs[:, mask])
print(f"RMSE all: {np.power(y_pred - y_train, 2).mean()}")
print(f"RMSE y_1: {np.power(y_pred[:, 0] - y_train[:, 0], 2).mean()}")

R^2: 0.7208961656463994
RMSE all: 0.37524443863173307
RMSE y_1: 0.34186694269033346


In [14]:
print(f"{100 * len([c for c in selected.index if 'sbert_' in c]) / feats1.shape[1] : 7.2f}%")
print(f"{100 * len([c for c in selected.index if 'nodedist_' in c]) / feats2.shape[1] : 7.2f}%")
print(f"{100 * len([c for c in selected.index if 'postag_' in c]) / feats3.shape[1] : 7.2f}%")
print(f"{100 * len([c for c in selected.index if 'morphtag_' in c]) / feats4.shape[1] : 7.2f}%")
print(f"{100 * len([c for c in selected.index if 'consonant_' in c]) / feats5.shape[1] : 7.2f}%")
print(f"{100 * len([c for c in selected.index if 'lexemes_' in c]) / feats6.shape[1] : 7.2f}%")
print(f"{100 * len([c for c in selected.index if 'frequency_' in c]) / feats7.shape[1] : 7.2f}%")
print(f"{100 * len([c for c in selected.index if 'length_' in c]) / feats8.shape[1] : 7.2f}%")
print(f"{100 * len([c for c in selected.index if 'bigram_' in c]) / feats9.shape[1] : 7.2f}%")

  93.49%
  80.95%
  68.75%
  40.43%
 100.00%
  80.00%
 100.00%
 100.00%
 100.00%


Results [y1,y2,3]
- [x] all: r2=0.7563259445026507, selected=437, dropped=67, reduced=0.7208961656463994
- [x] Without SBert: r2=0.7078522887174549, selected=51, dropped=69, reduced=0.6226863472323011
- [x] SBert Only: r2=0.6136377682314099, selected=214, dropped=171, reduced=0.5900620624883309

Results RMSE y_1
- [x] without std: RMSE=0.28836883243544986, reduced=0.34186694269033346
- [x] with std: RMSE=0.29265417823113765, reduced=0.575173784933369

In [15]:
# string length is the single most important feature
# selected.head(10)

In [16]:
mask = [s for s in selected.index if "sbert" not in s]
selected.loc[mask].head(10)

Unnamed: 0,fi
length_1,0.604292
frequency_2,0.015181
bigram_0,0.005832
postag_11,0.005643
length_0,0.004229
morphtag_37,0.003366
bigram_2,0.002851
bigram_1,0.002799
bigram_3,0.002039
postag_14,0.001793


In [17]:
# [c for c in selected.index if 'frequency_' in c]
# plt.hist(feats8[:,0]);

In [18]:
mask = ["consonant" in i or "RANDOM" == i for i in df_fi.index]
df_fi[mask]

Unnamed: 0,fi
consonant_2,0.000701
consonant_1,0.00042
consonant_0,0.000283
RANDOM,9.3e-05


In [19]:
mask = ["length" in i or "RANDOM" == i for i in df_fi.index]
df_fi[mask]

Unnamed: 0,fi
length_1,0.604292
length_0,0.004229
RANDOM,9.3e-05


In [20]:
mask = ["bigram" in i or "RANDOM" == i for i in df_fi.index]
df_fi[mask]

Unnamed: 0,fi
bigram_0,0.005832
bigram_2,0.002851
bigram_1,0.002799
bigram_3,0.002039
RANDOM,9.3e-05


In [21]:
# mask = ["lexeme" in i or "RANDOM" == i for i in df_fi.index]
# df_fi[mask]

In [22]:
# mask = ["morphtag" in i or "RANDOM" == i for i in df_fi.index]
# df_fi[mask]

In [23]:
# MORPHTAGS = ['PunctType=Brck', 'PunctType=Comm', 'PunctType=Peri', 'AdpType=Post', 'AdpType=Prep', 'AdpType=Circ', 'PartType=Res', 'PartType=Vbp', 'PartType=Inf', 'PronType=Art', 'PronType=Dem', 'PronType=Ind', 'PronType=Prs', 'PronType=Rel', 'PronType=Int', 'ConjType=Comp', 'Foreign=Yes', 'Hyph=Yes', 'NumType=Card', 'Polarity=Neg', 'Poss=Yes', 'Reflex=Yes', 'Variant=Short', 'VerbForm=Fin', 'VerbForm=Inf', 'VerbForm=Part', 'Mood=Ind', 'Mood=Imp', 'Aspect=Perf', 'VerbType=Mod', 'Gender=Fem', 'Gender=Masc', 'Gender=Neut', 'Number=Sing', 'Number=Plur', 'Person=1', 'Person=2', 'Person=3', 'Case=Nom', 'Case=Dat', 'Case=Gen', 'Case=Acc', 'Degree=Pos', 'Degree=Cmp', 'Degree=Sup', 'Tense=Pres', 'Tense=Past']
# idx = [int(c.split("_")[1]) for c in dropped.index if 'morphtag_' in c]
# np.array(MORPHTAGS)[idx]