In [1]:
import sent2vec
from nltk.tokenize import TweetTokenizer
import numpy as np
import random
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import re
from sklearn.metrics.pairwise import cosine_similarity as cos
from scipy.stats import pearsonr, spearmanr

In [2]:
# load sentence embedding model
model_path = '/Users/sakoju/Documents/10715/Fall2018/Project/sent2vec/pre-trained-models/'
model_wi_1 = sent2vec.Sent2vecModel()
model_wi_1.load_model(model_path + 'wiki_bigrams.bin')

In [4]:
def block_embedding(sents, model):
    """
    sents: array, n sentences,
    model: embedding model
    """
    tknzr = TweetTokenizer()
    n = len(sents)
    tokenized_sents = []
    for i in range (n):
        tokenized_sents.append(' '.join(tknzr.tokenize(sents[i])).lower())
    emb = model.embed_sentences(tokenized_sents)
    return emb

In [5]:
# load movie reviews and preprocessing
mr_file_neg = open('/Users/sakoju/Documents/10715/Fall2018/Project/sent2vec/datasets/rt-polarity.neg', encoding="latin-1")
mr_sent_neg = mr_file_neg.readlines()
mr_file_neg.close()
mr_file_pos = open('/Users/sakoju/Documents/10715/Fall2018/Project/sent2vec/datasets/rt-polarity.pos', encoding="latin-1")
mr_sent_pos = mr_file_pos.readlines()
mr_file_pos.close()
mr_sent_neg = np.array(mr_sent_neg)
mr_sent_pos = np.array(mr_sent_pos)
mr_y_neg = np.zeros_like(mr_sent_neg)
for i in range(len(mr_y_neg)):
    mr_y_neg[i] = 0
mr_y_pos = np.ones_like(mr_sent_pos)
mr_sent = np.concatenate((mr_sent_pos, mr_sent_neg))
mr_y = np.concatenate((mr_y_pos, mr_y_neg))

random.seed(2)
random.shuffle(mr_sent)
random.seed(2)
random.shuffle(mr_y)

In [6]:
# embedding movie review
mr_x = block_embedding(mr_sent, model_wi_1)

In [7]:
# logistic regression to classify the movie review
def nestedCV(X, Y, Cs, innercv, outercv):
    """
    Nested Cross Validation to select the best hyperparameters
    and evaluate the logistic regression model.
    :param X: n by d array, input features
    :param Y: n by 1 array, labels
    :param Cs: List or Array of candidates parameters for penalty in LR
    :param innercv: int, fold of the inner cross validation
    :param outercv: int, fold of the outer cross validation
    :return: average score of cross validation
    """
    clf_inner = GridSearchCV(estimator=LogisticRegression(), param_grid=Cs, cv=innercv)
    clf_inner.fit(X, Y)
    C_best = clf_inner.best_params_['C']
    clf_outer = LogisticRegression(C=C_best)
    scores = cross_val_score(clf_outer, X, Y, cv=outercv)
    return scores.mean()

In [8]:
# classify the movie reviews and see the accuracy
sc = StandardScaler()
mr_x_std = sc.fit_transform(mr_x)

# create penalty coefficients candidates in logistic regression
C_candidates = dict(C=np.arange(5, 10, 1))

# nested CV for logistic regression
score = nestedCV(mr_x_std, mr_y, C_candidates, 3, 3)
print(score)



0.7597073719752392


In [11]:
# evaluate STS using cosine similarity
# and compare the results with the gold standard.
# sentsets: sentence datasets:
#           deft-forum, deft-news, headlines, images, OnWM, tweet-news
def STS_eval(sentset, model):
    """
    Evaluate the similarities of 
    :param sentset: string, sentence dataset
    :param model: sentence embedding model
    :return: cosine similarity, of all pairs of sentences
             pearson & spearman coefficients compared to gold standard
    """
    sent_file = open('sts-en-test-gs-2014/STS.input.'+sentset+'.txt')
    sent_data = sent_file.readlines()
    sent_file.close()
    gs_file = open('sts-en-test-gs-2014/STS.gs.'+sentset+'.txt')
    gs_data = np.array(gs_file.readlines(), dtype=float)
    gs_file.close()
    splited_sent = []
    n = len(sent_data)
    for i in range(n):
        splited_sent.append(re.split(r'\t+', sent_data[i]))
    splited_sent = np.array(splited_sent)
    sent_1 = splited_sent[:,0]
    sent_2 = splited_sent[:,1]
    x_1 = block_embedding(sent_1, model)
    x_2 = block_embedding(sent_2, model)
    cosine = []

    for i in range(n):
        v1 = x_1[i]
        v2 = x_2[i]
        cos_i = cos([v1], [v2])
        cosine.append(cos_i[0][0])
    
    print(np.shape(cosine))
    print(np.shape(gs_data))
    pearson = pearsonr(cosine, gs_data)
    spearman = spearmanr(cosine, gs_data)
    
    return cosine, gs_data, pearson, spearman

In [12]:
cos_news, gs_news, pearson_news, spearman_news = STS_eval('deft-news', model_wi_1)

print('spearman correlation with gs:',  spearman_news[0])
print('pearson correlation with gs:', pearson_news[0])

(300,)
(300,)
spearman correlation with gs: 0.648515617391334
pearson correlation with gs: 0.704973999525268
