In [1]:
import numpy as np
import pandas as pd

from data_loader import load_sts_train
from features import lexical_features, syntactic_features, combined_features
from models import (
    build_feature_matrix, train_val_split,
    make_ridge_model, make_svr_model, make_rf_model,
    fit_and_eval
)

# Load the training pairs from the local path below.
train_dir = "./train"  # adjust path
train_df = load_sts_train(train_dir)

# Preview the first rows, then show how many pairs each source contributes.
display(train_df.head())
source_counts = train_df["source"].value_counts()
print(source_counts)


Unnamed: 0,source,s1,s2,score
0,MSRpar,But other sources close to the sale said Viven...,But other sources close to the sale said Viven...,4.0
1,MSRpar,Micron has declared its first quarterly profit...,Micron's numbers also marked the first quarter...,3.75
2,MSRpar,The fines are part of failed Republican effort...,"Perry said he backs the Senate's efforts, incl...",2.8
3,MSRpar,"The American Anglican Council, which represent...","The American Anglican Council, which represent...",3.4
4,MSRpar,The tech-loaded Nasdaq composite rose 20.96 po...,The technology-laced Nasdaq Composite Index <....,2.4


source
MSRpar         750
MSRvid         750
SMTeuroparl    734
Name: count, dtype: int64


In [2]:
# Build one feature matrix per feature family from the same training frame.
# Extractors are evaluated in order: lexical-only, syntactic-only, combined.
# The targets are kept from the lexical call only; the other two are discarded.
feature_extractors = (lexical_features, syntactic_features, combined_features)
(X_lex, y), (X_syn, _), (X_comb, _) = (
    build_feature_matrix(train_df, extractor) for extractor in feature_extractors
)


In [3]:
# Split every feature view into train / validation partitions.
# The label splits of all three calls are kept (instead of being thrown away)
# so that their alignment can be verified below.
X_lex_tr, X_lex_val, y_tr, y_val = train_val_split(X_lex, y)
X_syn_tr, X_syn_val, y_syn_tr, y_syn_val = train_val_split(X_syn, y)
X_comb_tr, X_comb_val, y_comb_tr, y_comb_val = train_val_split(X_comb, y)

# Downstream cells pair y_tr / y_val (from the lexical split) with the
# syntactic and combined matrices. That is only valid when each call to
# train_val_split partitions the rows identically, so fail loudly if not.
for view_name, view_y_tr, view_y_val in (
    ("syntactic", y_syn_tr, y_syn_val),
    ("combined", y_comb_tr, y_comb_val),
):
    assert np.array_equal(np.asarray(view_y_tr), np.asarray(y_tr)), (
        f"train split of the {view_name} features is not aligned with y_tr"
    )
    assert np.array_equal(np.asarray(view_y_val), np.asarray(y_val)), (
        f"validation split of the {view_name} features is not aligned with y_val"
    )


In [4]:
# Lexical-only

# One fresh model object per regressor family.
ridge_lex, svr_lex, rf_lex = make_ridge_model(), make_svr_model(), make_rf_model()

# Fit and evaluate in the order Ridge -> SVR -> RF on the lexical split;
# each run yields a (best model, validation Pearson) pair.
lex_runs = [
    fit_and_eval(estimator, X_lex_tr, y_tr, X_lex_val, y_val, label)
    for estimator, label in (
        (ridge_lex, "Ridge-lex"),
        (svr_lex, "SVR-lex"),
        (rf_lex, "RF-lex"),
    )
]
(best_ridge_lex, p_ridge_lex), (best_svr_lex, p_svr_lex), (best_rf_lex, p_rf_lex) = lex_runs


[Ridge-lex] best params: {'model__alpha': 0.1}
[Ridge-lex] Val Pearson: 0.6377
[SVR-lex] best params: {'model__C': 10.0, 'model__epsilon': 0.2, 'model__gamma': 'scale'}
[SVR-lex] Val Pearson: 0.7157
[RF-lex] best params: {'max_depth': 10, 'min_samples_leaf': 3, 'n_estimators': 300}
[RF-lex] Val Pearson: 0.7395


In [5]:
# Syntactic-only

# One fresh model object per regressor family.
ridge_syn, svr_syn, rf_syn = make_ridge_model(), make_svr_model(), make_rf_model()

# Fit and evaluate in the order Ridge -> SVR -> RF on the syntactic split;
# each run yields a (best model, validation Pearson) pair.
syn_runs = [
    fit_and_eval(estimator, X_syn_tr, y_tr, X_syn_val, y_val, label)
    for estimator, label in (
        (ridge_syn, "Ridge-syn"),
        (svr_syn, "SVR-syn"),
        (rf_syn, "RF-syn"),
    )
]
(best_ridge_syn, p_ridge_syn), (best_svr_syn, p_svr_syn), (best_rf_syn, p_rf_syn) = syn_runs


[Ridge-syn] best params: {'model__alpha': 10.0}
[Ridge-syn] Val Pearson: 0.2954
[SVR-syn] best params: {'model__C': 1.0, 'model__epsilon': 0.2, 'model__gamma': 'scale'}
[SVR-syn] Val Pearson: 0.5013
[RF-syn] best params: {'max_depth': 20, 'min_samples_leaf': 3, 'n_estimators': 300}
[RF-syn] Val Pearson: 0.5923


In [6]:
# Combined

# One fresh model object per regressor family.
ridge_comb, svr_comb, rf_comb = make_ridge_model(), make_svr_model(), make_rf_model()

# Fit and evaluate in the order Ridge -> SVR -> RF on the combined split;
# each run yields a (best model, validation Pearson) pair.
comb_runs = [
    fit_and_eval(estimator, X_comb_tr, y_tr, X_comb_val, y_val, label)
    for estimator, label in (
        (ridge_comb, "Ridge-comb"),
        (svr_comb, "SVR-comb"),
        (rf_comb, "RF-comb"),
    )
]
(best_ridge_comb, p_ridge_comb), (best_svr_comb, p_svr_comb), (best_rf_comb, p_rf_comb) = comb_runs


[Ridge-comb] best params: {'model__alpha': 1.0}
[Ridge-comb] Val Pearson: 0.6970
[SVR-comb] best params: {'model__C': 1.0, 'model__epsilon': 0.2, 'model__gamma': 'scale'}
[SVR-comb] Val Pearson: 0.7331
[RF-comb] best params: {'max_depth': 10, 'min_samples_leaf': 1, 'n_estimators': 300}
[RF-comb] Val Pearson: 0.7704


In [7]:
# Validation Pearson for every (regressor, feature family) pair, in the order
# lexical -> syntactic -> combined.
val_scores = {
    "Ridge-lex": p_ridge_lex,
    "SVR-lex": p_svr_lex,
    "RF-lex": p_rf_lex,
    "Ridge-syn": p_ridge_syn,
    "SVR-syn": p_svr_syn,
    "RF-syn": p_rf_syn,
    "Ridge-comb": p_ridge_comb,
    "SVR-comb": p_svr_comb,
    "RF-comb": p_rf_comb,
}

# Two-column leaderboard, best validation score first.
leaderboard = pd.DataFrame(
    {
        "model": list(val_scores.keys()),
        "val_pearson": list(val_scores.values()),
    }
)
results = leaderboard.sort_values("val_pearson", ascending=False)

display(results)


Unnamed: 0,model,val_pearson
8,RF-comb,0.770361
2,RF-lex,0.739545
7,SVR-comb,0.73307
1,SVR-lex,0.715694
6,Ridge-comb,0.697038
0,Ridge-lex,0.637704
5,RF-syn,0.5923
4,SVR-syn,0.501339
3,Ridge-syn,0.295447


In [8]:
# Final fit on the full training set (no train/validation split).
# X_lex / X_syn / X_comb and y were already built from the complete train_df
# earlier in this notebook, so reuse them here instead of running feature
# extraction a second time on identical input.
X_lex_full, y_full = X_lex, y
X_syn_full = X_syn
X_comb_full = X_comb

# Rebuild the chosen model configs as fresh, unfitted objects.
# NOTE(review): on the validation split RF scored higher than SVR for both the
# lexical (0.7395 vs 0.7157) and the combined (0.7704 vs 0.7331) features, yet
# SVR is used for those two systems here -- confirm this choice is intentional.
svr_lex_full = make_svr_model()
rf_syn_full = make_rf_model()
svr_comb_full = make_svr_model()

# Fit on all training data.
# Presumably each object is a hyper-parameter search wrapper (it exposes
# `best_estimator_` after fitting), so the search is re-run on all rows and the
# selected parameters may differ from the validation run -- TODO confirm.
svr_lex_full.fit(X_lex_full, y_full)
rf_syn_full.fit(X_syn_full, y_full)
svr_comb_full.fit(X_comb_full, y_full)

# Keep only the best fitted estimator of each system for test-time prediction.
best_lex_model = svr_lex_full.best_estimator_
best_syn_model = rf_syn_full.best_estimator_
best_comb_model = svr_comb_full.best_estimator_


3. Test time: evaluate all three systems
Case A – you have gold scores for test (e.g. for analysis, not submission)

In [9]:
from data_loader import load_sts_test_with_gs
from features import lexical_features, syntactic_features, combined_features
from models import build_feature_matrix
from eval_utils import pearson_corr

# 1) Load the test pairs together with their gold scores
test_df = load_sts_test_with_gs("test-gold")

# 2) One feature matrix per system; the gold scores are kept from the
#    lexical call only (order: lexical, syntactic, combined)
(X_lex_test, y_test), (X_syn_test, _), (X_comb_test, _) = (
    build_feature_matrix(test_df, extractor, has_score=True)
    for extractor in (lexical_features, syntactic_features, combined_features)
)

# 3) Predict with the three fully-trained models
y_pred_lex = best_lex_model.predict(X_lex_test)
y_pred_syn = best_syn_model.predict(X_syn_test)
y_pred_comb = best_comb_model.predict(X_comb_test)

# 4) Report the Pearson correlation of each system against the gold scores
for heading, predictions in (
    ("Test Pearson - lexical:   ", y_pred_lex),
    ("Test Pearson - syntactic: ", y_pred_syn),
    ("Test Pearson - combined:  ", y_pred_comb),
):
    print(heading, pearson_corr(y_test, predictions))


Test Pearson - lexical:    0.7310691492416485
Test Pearson - syntactic:  0.3487595458918245
Test Pearson - combined:   0.7594571355899375


Case B – the test set has no gold scores (typical submission setting)

In [10]:
from data_loader import load_sts_test
from features import lexical_features, syntactic_features, combined_features
from models import build_feature_matrix

# 1) Load test inputs only
test_nolabel_df = load_sts_test("test-gold")   # columns: source, id, s1, s2

# 2) Build feature matrices; with has_score=False the call's result is used
#    directly as the feature matrix (no target is unpacked)
X_lex_test = build_feature_matrix(test_nolabel_df, lexical_features, has_score=False)
X_syn_test = build_feature_matrix(test_nolabel_df, syntactic_features, has_score=False)
X_comb_test = build_feature_matrix(test_nolabel_df, combined_features, has_score=False)

# 3) Predict and store each system's scores as a new column on the test frame
for column, estimator, features in (
    ("pred_lex", best_lex_model, X_lex_test),
    ("pred_syn", best_syn_model, X_syn_test),
    ("pred_comb", best_comb_model, X_comb_test),
):
    test_nolabel_df[column] = estimator.predict(features)


In [11]:
# Write one submission file per source, holding the combined-system predictions.
for src in ("MSRpar", "MSRvid", "SMTeuroparl"):
    out_path = f"STS.output.{src}.mySystem.txt"

    # Rows belonging to this source, restricted to the combined-model scores.
    is_src = test_nolabel_df["source"] == src
    scores = test_nolabel_df.loc[is_src, "pred_comb"]

    # Just scores, one per line, no header/index
    scores.to_csv(out_path, sep="\t", header=False, index=False)

    print("Wrote:", out_path)


Wrote: STS.output.MSRpar.mySystem.txt
Wrote: STS.output.MSRvid.mySystem.txt
Wrote: STS.output.SMTeuroparl.mySystem.txt
