# Informal Word Identification (IWI)

The IWI component labels each word as informal or not. In the rest of the pipeline, the system attempts to paraphrase only the informal words.

We derive our dataset from a lexical substitution dataset called Concepts in Context (CoInCo) (Kremer et al., 2014). CoInCo is an all-words lexical substitution dataset, in which every word that could be substituted is manually annotated. The corpus is sampled from the newswire and fiction genres of the Manually Annotated Sub-Corpus (MASC). The targets (the words that are going to be substituted) are used to build the informal word identification dataset, while the substitution candidates are further processed for the academic paraphrase ranking task. Out of the 2,474 sentences in the CoInCo dataset, 1,608 are used for training and 866 for testing.

We automatically generate an IWI dataset from the CoInCo dataset as follows. For each non-academic target word, we determine whether its substitution candidates include at least one academic word. If so, the target word is labelled as informal. All academic target words, and all words without such a substitution, are labelled as formal.

In [None]:
import random, pickle, re
from collections import Counter
import xml.etree.ElementTree as ET

import pandas as pd
import numpy as np

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import spacy
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# seed NumPy's global random generator; np.random.rand below draws from it
np.random.seed(42) # to replicate the results
# spaCy English pipeline, used by get_ner_tag further down
nlp = spacy.load('en_core_web_sm')

specify the paths to the following resources

In [None]:
# Placeholder locations of the external resources; replace each with a real path.
# NOTE(review): the COCA_ALL and GLOVE_PATH placeholders are not wrapped in
# '<...>' like the others -- cosmetic only, since every value must be replaced.
CoInCo = '<path-to-coinco.xml>'  # CoInCo corpus, parsed with ElementTree below
COCA_ALL = 'path-to-(COCA)allWords.xlsx>'  # read with sheet 'list'; 'word' and 'COCA-All' columns are used
COMPILED_LIST = '<path-to-academic_keyphrases.xlsx>'  # its 'phrase' column becomes academic_list
COCA_LIST = '<path-to-(COCA)acadCore.xlsx>'  # its 'word' column becomes coca_list
NAWL = '<path-to-NAWL_Headwords.txt>'  # whitespace-separated words, becomes nawl_list
ACL_FREQ = '<path-to-academic_unigrams.pkl>' # obtained while compiling the resources
BEAUTIFUL_DATA = '<path-to-(beautiful_data)count_1w.txt>'  # tab-separated "word<TAB>count" lines
GLOVE_PATH = 'path-to-glove.840B.300d.txt'  # GloVe vectors in text format

# positions of the child elements inside each sentence element of the CoInCo XML,
# as they are indexed by the parsing code below
PRECONTEXT = 0
TARGETSENTENCE = 1
POSTCONTEXT = 2
TOKENS = 3

parse the XML CoInCo dataset

In [None]:
# parse the CoInCo XML file; `root` is iterated sentence element by sentence element below
tree = ET.parse(CoInCo)
root = tree.getroot()

In [None]:
# stripped target-sentence text of every sentence element, in document order
sentences = [sent_elem[TARGETSENTENCE].text.strip() for sent_elem in root]

perform train test split over the list of available sentences in the CoInCo dataset

In [None]:
# shuffle in place with a dedicated generator (fixed seed 9), then cut 65% / 35%
random.Random(9).shuffle(sentences)
n_train_sentences = int(0.65 * len(sentences))
train_sentences, test_sentences = sentences[:n_train_sentences], sentences[n_train_sentences:]

In [None]:
# carve the training sentences into 80% fitting (t) and 20% validation (v)
n_fit_sentences = int(0.8 * len(train_sentences))
t_sentences, v_sentences = train_sentences[:n_fit_sentences], train_sentences[n_fit_sentences:]

assign every annotated token of the CoInCo dataset to the train (t), validation (v) or test split, according to the split of its sentence

In [None]:
t_d = dict()
v_d = dict()
test_d = dict()

# (sentence set, token dictionary) for each split, checked in the same order as
# before: t, then v, then test. Sets make each membership test O(1) instead of
# a scan over the sentence list for every single token.
split_targets = (
    (set(t_sentences), t_d),
    (set(v_sentences), v_d),
    (set(test_sentences), test_d),
)

for child in root:
    # sentence-level fields are the same for every token of the sentence
    target_sentence = child[TARGETSENTENCE].text.strip()
    # an empty context element has text None; treat it as an empty string
    precontext = (child[PRECONTEXT].text or '').strip()
    postcontext = (child[POSTCONTEXT].text or '').strip()

    # first split whose sentences contain this target sentence
    split_d = next((d for sents, d in split_targets if target_sentence in sents), None)
    if split_d is None:
        continue

    for token in child[TOKENS]:
        token_id = token.get('id')
        # tokens carrying the id 'XXX' are not recorded
        if token_id == 'XXX':
            continue

        split_d[token_id] = {
            # this key was previously misspelled as 'precontenxt'
            'precontext': precontext,
            'postcontext': postcontext,
            'wordform': token.get('wordform'),
            'lemma': token.get('lemma'),
            'posMASC': token.get('posMASC'),
            'posTT': token.get('posTT'),
            'problematic': token.get('problematic'),
            # (lemma, pos, freq) of every substitution nested under the token
            'substitutions': [
                (subst.get('lemma'), subst.get('pos'), subst.get('freq'))
                for substitutions in token
                for subst in substitutions
            ],
            'targetsentence': target_sentence,
        }

In [None]:
# full training split: tokens of t followed by tokens of v
# (on a shared token id the v entry replaces the t entry)
train_d = {**t_d, **v_d}

In [None]:
len(t_d), len(v_d), len(train_d), len(test_d)

load the compiled list of academic phrases

In [None]:
# read the compiled academic key-phrase workbook ('<sheet-name>' is a placeholder
# to be replaced) and keep its 'phrase' column as a plain Python list
academic_df = pd.read_excel(COMPILED_LIST, sheet_name='<sheet-name>')
academic_list = academic_df.phrase.tolist()

load COCA academic list

In [None]:
# read sheet 'list' of the COCA academic workbook and keep its 'word' column as a list
coca_df = pd.read_excel(COCA_LIST, sheet_name='list')
coca_list = coca_df.word.tolist()

load NAWL list

In [None]:
# the NAWL file is read whole and split on any whitespace into headwords
with open(NAWL, 'r') as nawl_file:
    nawl_list = nawl_file.read().split()

load GloVe vectors file

In [None]:
# NOTE(review): gensim documents datapath() as resolving names inside its own
# test-data directory -- presumably GLOVE_PATH has to be an absolute path; confirm.
glove_file = datapath(GLOVE_PATH)
# temporary file that receives the word2vec-format copy of the GloVe vectors
tmp_file = get_tmpfile("test_word2vec.txt")
# convert the GloVe text file to word2vec text format (return value is not used)
# NOTE(review): glove2word2vec is reported as deprecated in gensim 4 -- check the
# installed version before relying on it.
_ = glove2word2vec(glove_file, tmp_file)
# load the converted vectors; `model` is what get_word_embedding queries
model = KeyedVectors.load_word2vec_format(tmp_file)

In [None]:
# load the pickled academic frequency table; the feature code below indexes it
# with a tuple of the lemma's whitespace-separated parts
# NOTE: unpickling can execute arbitrary code -- only load a file you produced yourself.
with open(ACL_FREQ, 'rb') as f:
    acl_freq = pickle.load(f)

load Beautiful Data corpus

In [None]:
# word -> count from the Beautiful Data unigram file ("word<TAB>count" per line)
beatiful_data_freq = Counter()
with open(BEAUTIFUL_DATA, 'r') as f:
    # stream the file line by line instead of holding it in memory twice
    for line in f:
        line = line.strip()
        if not line:
            # tolerate blank lines
            continue
        word, freq = line.split('\t')
        # store the count as a number; it was previously kept as the raw text,
        # which mixed strings with the integer 0 returned for unseen words
        beatiful_data_freq[word] = int(freq)

load COCA all words list

In [None]:
# sheet 'list' of the COCA all-words workbook; the feature code below reads its
# 'word' and 'COCA-All' columns
allwords_df = pd.read_excel(COCA_ALL, sheet_name='list')

identify the formal and informal words

In [None]:
# build the lookup sets once: membership in a set is O(1), whereas the lists
# were scanned for every token and every substitution
coca_set = set(coca_list)
nawl_set = set(nawl_list)
academic_set = set(academic_list)
# substitutions are checked against the COCA and NAWL lists only
academic_subst_set = coca_set | nawl_set

informal_t_d = dict()
non_informal_t_d = dict()
for token_id, token in t_d.items():
    lemma = token['lemma']
    # NOTE(review): as written, a lemma counts as academic only when it occurs
    # in ALL three lists. The description "each non-academic target word"
    # suggests membership in ANY list may have been meant. The condition is kept
    # unchanged here because altering it changes the labels of the dataset, and
    # it has to be changed for every split at once -- confirm before editing.
    in_every_list = lemma in coca_set and lemma in nawl_set and lemma in academic_set
    has_academic_subst = any(subst[0] in academic_subst_set for subst in token['substitutions'])

    # informal: not academic itself, but at least one substitution is academic
    if not in_every_list and has_academic_subst:
        informal_t_d[token_id] = token
    else:
        non_informal_t_d[token_id] = token

identify the formal and informal words

In [None]:
# build the lookup sets once: membership in a set is O(1), whereas the lists
# were scanned for every token and every substitution
coca_set = set(coca_list)
nawl_set = set(nawl_list)
academic_set = set(academic_list)
# substitutions are checked against the COCA and NAWL lists only
academic_subst_set = coca_set | nawl_set

informal_v_d = dict()
non_informal_v_d = dict()
for token_id, token in v_d.items():
    lemma = token['lemma']
    # NOTE(review): as written, a lemma counts as academic only when it occurs
    # in ALL three lists. The description "each non-academic target word"
    # suggests membership in ANY list may have been meant. The condition is kept
    # unchanged here because altering it changes the labels of the dataset, and
    # it has to be changed for every split at once -- confirm before editing.
    in_every_list = lemma in coca_set and lemma in nawl_set and lemma in academic_set
    has_academic_subst = any(subst[0] in academic_subst_set for subst in token['substitutions'])

    # informal: not academic itself, but at least one substitution is academic
    if not in_every_list and has_academic_subst:
        informal_v_d[token_id] = token
    else:
        non_informal_v_d[token_id] = token

merge the informal and non-informal words of the t and v splits into the full training split

In [None]:
# full training split = t entries followed by v entries, per label
informal_train_d = {**informal_t_d, **informal_v_d}
non_informal_train_d = {**non_informal_t_d, **non_informal_v_d}

In [None]:
print(len(informal_t_d), len(non_informal_t_d))
print(len(informal_v_d), len(non_informal_v_d))
print(len(informal_train_d), len(non_informal_train_d))

In [None]:
# one fixed random 300-dimensional vector, returned by get_word_embedding for
# every word it cannot look up; drawn from NumPy's global generator
UNK_embed = np.random.rand(300,)

obtain the word embedding

In [None]:
def get_word_embedding(word):
    """Return the embedding of `word`, or `UNK_embed` if it is out of vocabulary.

    `model` is indexed directly. The former `model.wv[word]` relied on a `wv`
    attribute of KeyedVectors that newer gensim releases no longer provide;
    together with the bare `except`, that failure was swallowed and every word
    silently received `UNK_embed`.

    Parameters
    ----------
    word : str
        The word to look up.

    Returns
    -------
    The vector stored in `model` for `word`, or the shared `UNK_embed` vector.
    """
    try:
        return model[word]
    except KeyError:
        # only a missing vocabulary entry falls back; other errors surface
        return UNK_embed

compute the sentence embedding - average of all the word embeddings of the words present in the sentence

In [None]:
def get_sentence_embedding(sentence):
    """Return the mean of the word embeddings of the words in `sentence`.

    Words are the maximal runs of ASCII letters and digits; every other
    character acts as a separator. Each word is embedded with
    `get_word_embedding`.

    Parameters
    ----------
    sentence : str
        The raw sentence text.

    Returns
    -------
    The element-wise mean of the word vectors. If the sentence contains no
    alphanumeric word at all, `UNK_embed` is returned; averaging an empty list
    would otherwise produce NaN and poison the features derived from it.
    """
    # drop special characters by keeping only the alphanumeric runs
    word_list = re.findall(r"[a-zA-Z0-9]+", sentence)
    if not word_list:
        return UNK_embed

    words_embed = [get_word_embedding(word) for word in word_list]
    return np.mean(words_embed, axis=0)

In [None]:
def get_ner_tag(lemma, sentence):
    """Return the label of the first entity in `sentence` whose text equals `lemma`.

    The sentence is run through the `nlp` pipeline; 'UNK' is returned when no
    entity text matches `lemma` exactly.
    """
    matching_labels = (entity.label_ for entity in nlp(sentence).ents if entity.text == lemma)
    return next(matching_labels, 'UNK')

compute the features from the resources

In [None]:
# index the general COCA frequencies by word once (first row wins, exactly as
# the former per-token `.iloc[0]` scan did), so each token costs one dict lookup.
# A missing 'word' / 'COCA-All' column now fails here instead of silently
# turning the whole feature into zeros.
coca_general_freq = (
    allwords_df.drop_duplicates(subset='word', keep='first')
    .set_index('word')['COCA-All']
    .to_dict()
)

features = list()
# one row per token of the t split: informal tokens first (y = 1), then the rest (y = 0)
for token_d, label in ((informal_t_d, 1), (non_informal_t_d, 0)):
    for token_id in token_d:
        token = token_d[token_id]
        lemma = token['lemma']
        t = list()

        # freq_beautiful_data: the Counter yields 0 for unseen lemmas; int()
        # normalises counts that may have been stored as text
        t.append(int(beatiful_data_freq[lemma]))

        # freq_coca_general: 0 when the lemma is not in the COCA table
        t.append(coca_general_freq.get(lemma, 0))

        # freq_acl: the table is keyed by a tuple of the lemma's parts
        try:
            t.append(acl_freq[tuple(lemma.split())])
        except KeyError:
            t.append(0)

        # cos_target: cosine similarity between the sentence and the lemma
        sentence_embed = get_sentence_embedding(token['targetsentence'])
        word_embed = get_word_embedding(lemma)
        t.append(np.dot(sentence_embed, word_embed)/(np.linalg.norm(sentence_embed)*np.linalg.norm(word_embed)))

        # euclidean_distance
        t.append(np.linalg.norm(sentence_embed-word_embed))

        # posMASC_tag
        t.append(token['posMASC'])

        # is_problematic
        t.append(1 if token['problematic']=='yes' else 0)

        # word_length
        t.append(len(lemma))

        # count_vowel
        t.append(sum(map(lemma.lower().count, 'aeiou')))

        # y
        t.append(label)

        features.append(t)

In [None]:
# specify the feature columns and the target variable
train_features_cols = ['freq_beautiful', 'freq_coca_general', 'freq_acl', 'cos_target', 'euclidean_distance', 'posMASC', 'is_problematic', 'word_length', 'count_vowel']
y_feature = ['y']
df_t = pd.DataFrame(features, columns=train_features_cols+y_feature)

# shuffle the dataframe
df_t = df_t.sample(frac=1).reset_index(drop=True)

label_encoder = LabelEncoder()
# label encode the POS tag
df_t['posMASC_le'] = label_encoder.fit_transform(df_t.posMASC)

In [None]:
df_t

In [None]:
df_t.shape

compute the features from the resources

In [None]:
# index the general COCA frequencies by word once (first row wins, exactly as
# the former per-token `.iloc[0]` scan did), so each token costs one dict lookup.
# A missing 'word' / 'COCA-All' column now fails here instead of silently
# turning the whole feature into zeros.
coca_general_freq = (
    allwords_df.drop_duplicates(subset='word', keep='first')
    .set_index('word')['COCA-All']
    .to_dict()
)

features = list()
# one row per token of the v split: informal tokens first (y = 1), then the rest (y = 0)
for token_d, label in ((informal_v_d, 1), (non_informal_v_d, 0)):
    for token_id in token_d:
        token = token_d[token_id]
        lemma = token['lemma']
        t = list()

        # freq_beautiful_data: the Counter yields 0 for unseen lemmas; int()
        # normalises counts that may have been stored as text
        t.append(int(beatiful_data_freq[lemma]))

        # freq_coca_general: 0 when the lemma is not in the COCA table
        t.append(coca_general_freq.get(lemma, 0))

        # freq_acl: the table is keyed by a tuple of the lemma's parts
        try:
            t.append(acl_freq[tuple(lemma.split())])
        except KeyError:
            t.append(0)

        # cos_target: cosine similarity between the sentence and the lemma
        sentence_embed = get_sentence_embedding(token['targetsentence'])
        word_embed = get_word_embedding(lemma)
        t.append(np.dot(sentence_embed, word_embed)/(np.linalg.norm(sentence_embed)*np.linalg.norm(word_embed)))

        # euclidean_distance
        t.append(np.linalg.norm(sentence_embed-word_embed))

        # posMASC
        t.append(token['posMASC'])

        # is_problematic
        t.append(1 if token['problematic']=='yes' else 0)

        # word_length
        t.append(len(lemma))

        # count_vowel
        t.append(sum(map(lemma.lower().count, 'aeiou')))

        # y
        t.append(label)

        features.append(t)

In [None]:
df_val = pd.DataFrame(features, columns=train_features_cols+y_feature)

# shuffle the dataframe
df_val = df_val.sample(frac=1).reset_index(drop=True)

# label encode the POS tag with the tag vocabulary of the full training split
# (train_d = t + v), i.e. the vocabulary df_train is encoded with. Fitting an
# encoder on the validation rows alone can assign different integers to the
# same tag whenever the validation split lacks one of the training tags.
train_pos_encoder = LabelEncoder().fit([token['posMASC'] for token in train_d.values()])
df_val['posMASC_le'] = train_pos_encoder.transform(df_val.posMASC)

In [None]:
df_val

In [None]:
df_val.shape

compute the features from the resources

In [None]:
# index the general COCA frequencies by word once (first row wins, exactly as
# the former per-token `.iloc[0]` scan did), so each token costs one dict lookup.
# A missing 'word' / 'COCA-All' column now fails here instead of silently
# turning the whole feature into zeros.
coca_general_freq = (
    allwords_df.drop_duplicates(subset='word', keep='first')
    .set_index('word')['COCA-All']
    .to_dict()
)

features = list()
# one row per token of the full training split: informal tokens first (y = 1),
# then the rest (y = 0)
for token_d, label in ((informal_train_d, 1), (non_informal_train_d, 0)):
    for token_id in token_d:
        token = token_d[token_id]
        lemma = token['lemma']
        t = list()

        # freq_beautiful_data: the Counter yields 0 for unseen lemmas; int()
        # normalises counts that may have been stored as text
        t.append(int(beatiful_data_freq[lemma]))

        # freq_coca_general: 0 when the lemma is not in the COCA table
        t.append(coca_general_freq.get(lemma, 0))

        # freq_acl: the table is keyed by a tuple of the lemma's parts
        try:
            t.append(acl_freq[tuple(lemma.split())])
        except KeyError:
            t.append(0)

        # cos_target: cosine similarity between the sentence and the lemma
        sentence_embed = get_sentence_embedding(token['targetsentence'])
        word_embed = get_word_embedding(lemma)
        t.append(np.dot(sentence_embed, word_embed)/(np.linalg.norm(sentence_embed)*np.linalg.norm(word_embed)))

        # euclidean_distance
        t.append(np.linalg.norm(sentence_embed-word_embed))

        # posMASC_tag
        t.append(token['posMASC'])

        # is_problematic
        t.append(1 if token['problematic']=='yes' else 0)

        # word_length
        t.append(len(lemma))

        # count_vowel
        t.append(sum(map(lemma.lower().count, 'aeiou')))

        # y
        t.append(label)

        features.append(t)

In [None]:
# `features` holds the rows of the full training split (t + v) at this point
df_train = pd.DataFrame(features, columns=train_features_cols+y_feature)

# shuffle the dataframe (no random_state is passed to sample())
df_train = df_train.sample(frac=1).reset_index(drop=True)

# label encode the POS tag
# NOTE: the classifiers below are fitted on these codes, so any dataframe they
# predict on has to be encoded with df_train's tag vocabulary
df_train['posMASC_le'] = label_encoder.fit_transform(df_train.posMASC)

In [None]:
df_train

In [None]:
df_train.shape

In [None]:
# update the train features to perform the various experiments
# columns fed to the classifiers; same as train_features_cols except that the
# raw 'posMASC' tag is replaced by its integer encoding 'posMASC_le'
train_features = ['freq_beautiful', 'freq_coca_general', 'freq_acl', 'cos_target', 'euclidean_distance', 'posMASC_le', 'is_problematic', 'word_length', 'count_vowel']

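Classifiers (provided by scikit-learn) to perform Informal Word Identification (IWI). Each cell below rebinds `clf`, so the evaluation cells use whichever classifier cell was run last.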

In [None]:
clf = LogisticRegression()
# pass the target as a 1-D array: df_train[y_feature] is a one-column DataFrame,
# which scikit-learn only accepts with a DataConversionWarning
clf.fit(df_train[train_features], df_train[y_feature].values.ravel())

In [None]:
clf = GaussianNB()
# pass the target as a 1-D array: df_train[y_feature] is a one-column DataFrame,
# which scikit-learn only accepts with a DataConversionWarning
clf.fit(df_train[train_features], df_train[y_feature].values.ravel())

In [None]:
clf = SVC(kernel='rbf')
# pass the target as a 1-D array: df_train[y_feature] is a one-column DataFrame,
# which scikit-learn only accepts with a DataConversionWarning
clf.fit(df_train[train_features], df_train[y_feature].values.ravel())

In [None]:
clf = DecisionTreeClassifier()
# pass the target as a 1-D array: df_train[y_feature] is a one-column DataFrame,
# which scikit-learn only accepts with a DataConversionWarning
clf.fit(df_train[train_features], df_train[y_feature].values.ravel())

In [None]:
clf = RandomForestClassifier(n_estimators=10)
# pass the target as a 1-D array: df_train[y_feature] is a one-column DataFrame,
# which scikit-learn only accepts with a DataConversionWarning
clf.fit(df_train[train_features], df_train[y_feature].values.ravel())

In [None]:
clf = GradientBoostingClassifier(n_estimators=600)
# pass the target as a 1-D array: df_train[y_feature] is a one-column DataFrame,
# which scikit-learn only accepts with a DataConversionWarning
clf.fit(df_train[train_features], df_train[y_feature].values.ravel())

In [None]:
clf

In [None]:
train_features

In [None]:
# hard 0/1 predictions of the current `clf` on the validation rows
# NOTE(review): the classifier cells above fit on df_train, which is built from
# the t and v splits together, so df_val is not unseen data for them -- confirm
# whether fitting on df_t was intended for this validation run.
y_pred = clf.predict(df_val[train_features])

In [None]:
# validation scores of the hard predictions; the positive class is y == 1 (informal)
print('Accuracy : ', metrics.accuracy_score(df_val[y_feature], y_pred))
print('Precision : ', metrics.precision_score(df_val[y_feature], y_pred))
print('Recall : ', metrics.recall_score(df_val[y_feature], y_pred))
print('F-Measure : ', metrics.f1_score(df_val[y_feature], y_pred))

In [None]:
# A precision-recall curve needs a continuous score per row. Feeding it the
# hard 0/1 predictions collapses the curve to a single operating point.
X_val = df_val[train_features]
y_val = df_val[y_feature].values.ravel()
if hasattr(clf, 'predict_proba'):
    # probability assigned to the positive class (y == 1)
    y_score = clf.predict_proba(X_val)[:, list(clf.classes_).index(1)]
else:
    # e.g. SVC without probability estimates: use the signed margin instead
    y_score = clf.decision_function(X_val)

precision, recall, thresholds = metrics.precision_recall_curve(y_val, y_score)
f1 = metrics.f1_score(y_val, y_pred)
auc = metrics.auc(recall, precision)
ap = metrics.average_precision_score(y_val, y_score)

# the no-skill baseline of a PR curve is the share of positive rows, not 0.5
no_skill = y_val.mean()
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='no skill')
plt.plot(recall, precision, marker='.', label='AUC={:.3f}, AP={:.3f}, F1={:.3f}'.format(auc, ap, f1))
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

identify the formal and informal words

In [None]:
# build the lookup sets once: membership in a set is O(1), whereas the lists
# were scanned for every token and every substitution
coca_set = set(coca_list)
nawl_set = set(nawl_list)
academic_set = set(academic_list)
# substitutions are checked against the COCA and NAWL lists only
academic_subst_set = coca_set | nawl_set

informal_test_d = dict()
non_informal_test_d = dict()
for token_id, token in test_d.items():
    lemma = token['lemma']
    # NOTE(review): as written, a lemma counts as academic only when it occurs
    # in ALL three lists. The description "each non-academic target word"
    # suggests membership in ANY list may have been meant. The condition is kept
    # unchanged here because altering it changes the labels of the dataset, and
    # it has to be changed for every split at once -- confirm before editing.
    in_every_list = lemma in coca_set and lemma in nawl_set and lemma in academic_set
    has_academic_subst = any(subst[0] in academic_subst_set for subst in token['substitutions'])

    # informal: not academic itself, but at least one substitution is academic
    if not in_every_list and has_academic_subst:
        informal_test_d[token_id] = token
    else:
        non_informal_test_d[token_id] = token

In [None]:
print(len(informal_test_d), len(non_informal_test_d))

compute the features from the resources

In [None]:
# index the general COCA frequencies by word once (first row wins, exactly as
# the former per-token `.iloc[0]` scan did), so each token costs one dict lookup.
# A missing 'word' / 'COCA-All' column now fails here instead of silently
# turning the whole feature into zeros.
coca_general_freq = (
    allwords_df.drop_duplicates(subset='word', keep='first')
    .set_index('word')['COCA-All']
    .to_dict()
)

features = list()
# one row per token of the test split: informal tokens first (y = 1), then the rest (y = 0)
for token_d, label in ((informal_test_d, 1), (non_informal_test_d, 0)):
    for token_id in token_d:
        token = token_d[token_id]
        lemma = token['lemma']
        t = list()

        # freq_beautiful_data: the Counter yields 0 for unseen lemmas; int()
        # normalises counts that may have been stored as text
        t.append(int(beatiful_data_freq[lemma]))

        # freq_coca_general: 0 when the lemma is not in the COCA table
        t.append(coca_general_freq.get(lemma, 0))

        # freq_acl: the table is keyed by a tuple of the lemma's parts
        try:
            t.append(acl_freq[tuple(lemma.split())])
        except KeyError:
            t.append(0)

        # cos_target: cosine similarity between the sentence and the lemma
        sentence_embed = get_sentence_embedding(token['targetsentence'])
        word_embed = get_word_embedding(lemma)
        t.append(np.dot(sentence_embed, word_embed)/(np.linalg.norm(sentence_embed)*np.linalg.norm(word_embed)))

        # euclidean_distance
        t.append(np.linalg.norm(sentence_embed-word_embed))

        # posMASC
        t.append(token['posMASC'])

        # is_problematic
        t.append(1 if token['problematic']=='yes' else 0)

        # word_length
        t.append(len(lemma))

        # count_vowel
        t.append(sum(map(lemma.lower().count, 'aeiou')))

        # y
        t.append(label)

        features.append(t)

In [None]:
df_test = pd.DataFrame(features, columns=train_features_cols+y_feature)

# shuffle the dataframe
df_test = df_test.sample(frac=1).reset_index(drop=True)

# label encode the POS tag with the vocabulary of the training data. Fitting an
# encoder on the test rows alone can assign different integers to the same tag
# than the ones the classifier was trained on.
train_pos_encoder = LabelEncoder().fit(df_train.posMASC)
pos_codes = {tag: code for code, tag in enumerate(train_pos_encoder.classes_)}
# a tag that never occurs in the training data has no code; it is mapped to -1
df_test['posMASC_le'] = df_test.posMASC.map(pos_codes).fillna(-1).astype(int)

In [None]:
df_test

In [None]:
df_test.shape

In [None]:
clf

In [None]:
train_features

In [None]:
# hard 0/1 predictions of the current `clf` on the test rows
y_pred = clf.predict(df_test[train_features])

In [None]:
# test scores of the hard predictions; the positive class is y == 1 (informal)
print('Accuracy : ', metrics.accuracy_score(df_test[y_feature], y_pred))
print('Precision : ', metrics.precision_score(df_test[y_feature], y_pred))
print('Recall : ', metrics.recall_score(df_test[y_feature], y_pred))
print('F-Measure : ', metrics.f1_score(df_test[y_feature], y_pred))

compute the features from the resources

In [None]:
# index the general COCA frequencies by word once (first row wins, exactly as
# the former per-token `.iloc[0]` scan did), so each token costs one dict lookup.
# A missing 'word' / 'COCA-All' column now fails here instead of silently
# turning the whole feature into zeros.
coca_general_freq = (
    allwords_df.drop_duplicates(subset='word', keep='first')
    .set_index('word')['COCA-All']
    .to_dict()
)

features = list()
# "gold" test rows: only the informal tokens of the test split, all labelled 1
for token_id in informal_test_d:
    token = informal_test_d[token_id]
    lemma = token['lemma']
    t = list()

    # freq_beautiful_data: the Counter yields 0 for unseen lemmas; int()
    # normalises counts that may have been stored as text
    t.append(int(beatiful_data_freq[lemma]))

    # freq_coca_general: 0 when the lemma is not in the COCA table
    t.append(coca_general_freq.get(lemma, 0))

    # freq_acl: the table is keyed by a tuple of the lemma's parts
    try:
        t.append(acl_freq[tuple(lemma.split())])
    except KeyError:
        t.append(0)

    # cos_target: cosine similarity between the sentence and the lemma
    sentence_embed = get_sentence_embedding(token['targetsentence'])
    word_embed = get_word_embedding(lemma)
    t.append(np.dot(sentence_embed, word_embed)/(np.linalg.norm(sentence_embed)*np.linalg.norm(word_embed)))

    # euclidean_distance
    t.append(np.linalg.norm(sentence_embed-word_embed))

    # posMASC
    t.append(token['posMASC'])

    # is_problematic
    t.append(1 if token['problematic']=='yes' else 0)

    # word_length
    t.append(len(lemma))

    # count_vowel
    t.append(sum(map(lemma.lower().count, 'aeiou')))

    # y
    t.append(1)

    features.append(t)

In [None]:
df_gold_test = pd.DataFrame(features, columns=train_features_cols+y_feature)

# shuffle the dataframe
df_gold_test = df_gold_test.sample(frac=1).reset_index(drop=True)

# label encode the POS tag with the vocabulary of the training data. These rows
# cover only the informal test tokens, so an encoder fitted on them alone is
# especially likely to number the tags differently from the training data.
train_pos_encoder = LabelEncoder().fit(df_train.posMASC)
pos_codes = {tag: code for code, tag in enumerate(train_pos_encoder.classes_)}
# a tag that never occurs in the training data has no code; it is mapped to -1
df_gold_test['posMASC_le'] = df_gold_test.posMASC.map(pos_codes).fillna(-1).astype(int)

In [None]:
df_gold_test

In [None]:
df_gold_test.shape

In [None]:
clf

In [None]:
train_features

In [None]:
# hard 0/1 predictions of the current `clf` on the informal-only test rows
y_pred = clf.predict(df_gold_test[train_features])

In [None]:
# every row of df_gold_test has y == 1 (only informal tokens were added above),
# so accuracy and recall both equal the share of rows predicted as informal
print('Accuracy : ', metrics.accuracy_score(df_gold_test[y_feature], y_pred))
print('Precision : ', metrics.precision_score(df_gold_test[y_feature], y_pred))
print('Recall : ', metrics.recall_score(df_gold_test[y_feature], y_pred))
print('F-Measure : ', metrics.f1_score(df_gold_test[y_feature], y_pred))