In [1]:
import torch
import spacy
import gensim
import numpy as np
import pandas as pd
from sklearn import model_selection
from mlxtend.evaluate import permutation_test
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score

PAD = 100
DIM = 300

TEST_PATH = "dreaddit/dreaddit-test.csv"
TRAIN_PATH = "dreaddit/dreaddit-train.csv"
W2V_PATH = "GoogleNews-vectors-negative300.bin"

In [2]:
nlp = spacy.load("en_core_web_lg")
w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

def vectorize(token):
    if token.text in w2v:
        return w2v[token.text]
    return np.random.rand(DIM)

In [19]:
def process(PATH, features_to_test=[], include_subreddit=False, include_position=False, 
            include_length=False):
    df = pd.read_csv(PATH)
    
    if include_subreddit == 'discrete':
        one_hot_subreddit = pd.get_dummies(df['subreddit'])
        df = df.merge(one_hot_subreddit, left_index=True, right_index=True)
        features_to_test += list(one_hot_subreddit)
        
    if include_subreddit == 'meaning':
        df['text'] = df['subreddit'] + ' ' + df['text']
    
    if include_position:
        df['sentence_range'] = df['sentence_range'].apply(lambda sent_range:
                                                          int(sent_range.split()[0][1:-1]))
        features_to_test.append('sentence_range')
        
    if include_length:
        df['text_length'] = df['text'].apply(lambda text: len(text))
        features_to_test.append('text_length')

    df['tokenized'] = df['text'].apply(nlp)

    vectorizer = lambda sent: np.mean(np.array([vectorize(token) for token in sent]), 0)
    df['vectorized'] = df['tokenized'].apply(vectorizer)
        
    X = np.empty((0, DIM))
    for x in df['vectorized']:
        X = np.concatenate((X, np.expand_dims(x, axis=0)))
    y = df['label'].to_numpy()
    
    base_features = ['lex_liwc_Clout', 'lex_liwc_i', 'lex_liwc_Tone']
    features = base_features + features_to_test
    feature_cols = np.empty((len(df), 0))
    for feature in features:
        feature_col = np.transpose(np.expand_dims(df[feature].to_numpy(), axis=0))
        feature_cols = np.concatenate((feature_cols, feature_col), axis=1)
    X = np.concatenate((X, feature_cols), axis=1)
    
    return X, y

In [20]:
def evaluate(clf, X, y):
    scoring = {'pre' : make_scorer(precision_score), 
               'rec' : make_scorer(recall_score), 
               'f1' : make_scorer(f1_score)}

    kfold = model_selection.KFold(n_splits=10, shuffle=True)

    results = model_selection.cross_validate(estimator=clf, X=X, y=y, cv=kfold, scoring=scoring)

    print('{} Pre.: {}, Rec.: {}, F1: {}'.format(clf,
                                              np.mean(results['test_pre']), 
                                              np.mean(results['test_rec']), 
                                              np.mean(results['test_f1'])))

    return np.mean(results['test_f1'])

In [21]:
def grid(X, y):
    best = (0, 0)
    for c in [0.0001, 0.001, 0.01, 0.1, 1, 2, 4, 8, 16, 32, 64]:
        best = max(best, (evaluate(LogisticRegression(C=c), X, y), c))
    return best

In [23]:
def train_and_run(features_to_test=[], include_subreddit=False, include_position=False, include_length=False):
    
    X_train, y_train = process(TRAIN_PATH, features_to_test.copy(), include_subreddit, include_position, 
                               include_length)
    clf = LogisticRegression(C=0.01).fit(X_train, y_train)
    X_test, y_test = process(TEST_PATH, features_to_test.copy(), include_subreddit, include_position, 
                             include_length)
    
    if len(features_to_test) > 0:
        print("Additional features:", features_to_test)
    if include_subreddit:
        print("Include subreddit:", include_subreddit)
    if include_position:
        print("Include position:", include_position)
    if include_length:
        print("Include length:", include_length)
          
    y_pred = clf.predict(X_test)
    print('F1:', f1_score(y_test, y_pred))
    
    return y_pred

In [24]:
baseline_pred = train_and_run()
discrete_subreddit_pred = train_and_run(include_subreddit='discrete')
meaning_subreddit_pred = train_and_run(include_subreddit='meaning')

p_value_discrete = permutation_test(baseline_pred, discrete_subreddit_pred, method='approximate')
p_value_meaning = permutation_test(baseline_pred, meaning_subreddit_pred, method='approximate')

print("Discrete subreddit addition p-value:", p_value_discrete)
print("Subreddit meaning addition p-value:", p_value_meaning)

F1: 0.770012706480305


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Include subreddit: discrete
F1: 0.7694267515923566
Include subreddit: meaning
F1: 0.7668789808917197
Discrete subreddit addition p-value: 0.9470529470529471
Subreddit meaning addition p-value: 0.955044955044955


In [25]:
sentiment_pred = train_and_run(['sentiment'])
p_value_sentiment = permutation_test(baseline_pred, sentiment_pred, method='approximate')
print("Sentiment addition p-value:", p_value_sentiment)

Additional features: ['sentiment']
F1: 0.7694267515923566
Sentiment addition p-value: 0.961038961038961


In [26]:
position_pred = train_and_run(include_position=True)
p_value_position = permutation_test(baseline_pred, position_pred, method='approximate')
print("Position addition p-value:", p_value_position)

Include position: True
F1: 0.770012706480305
Position addition p-value: 1.0


In [27]:
length_pred = train_and_run(include_length=True)
p_value_length = permutation_test(baseline_pred, length_pred, method='approximate')
print("Length addition p-value:", p_value_length)

Include length: True
F1: 0.7668789808917197
Length addition p-value: 0.9590409590409591


In [28]:
combination_pred = train_and_run(['sentiment'], include_subreddit=True, include_position=True, 
                                 include_length=True)
p_value_combination = permutation_test(baseline_pred, combination_pred, method='approximate')
print("Combination addition p-value:", p_value_length)

Additional features: ['sentiment']
Include subreddit: True
Include position: True
Include length: True
F1: 0.7719745222929936
Combination addition p-value: 0.9590409590409591
