This notebook explores the use of the permutation test to assess the significance of coefficients learned in logistic regression (testing against the null hypothesis that each $\beta = 0$).

In [1]:
import sys
from sklearn import preprocessing
from sklearn import linear_model
from random import choices
from sklearn.feature_extraction.text import CountVectorizer
from random import shuffle
import numpy as np
import copy

In [2]:
def read_data(filename):
    """Load a tab-separated file of labeled texts.

    Each line is expected to hold the label in the first column and the
    (already tokenized) text in the second column. Trailing whitespace is
    stripped from every line before it is split on tabs.

    Returns:
        A pair (texts, labels) of parallel lists, in file order.
    """
    texts, labels = [], []
    with open(filename, encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.rstrip().split("\t")
            # column 0 is the label, column 1 is the text
            label, text = fields[0], fields[1]
            texts.append(text)
            labels.append(label)
    return texts, labels

In [3]:
# Change this to the directory with your data (from the CheckData_TODO.ipynb exercise).
# The directory should contain train.tsv, dev.tsv and test.tsv
# (the cells below read train.tsv and dev.tsv from it).
directory="../data/lmrd"

In [4]:
# Read the training and development splits (texts and labels) from `directory`.
trainX, trainY = read_data(f"{directory}/train.tsv")
devX, devY = read_data(f"{directory}/dev.tsv")

In [5]:
def featurize(trainX, devX):
    """Turn raw texts into count-vectorizer feature matrices.

    The vectorizer is fit on the training texts only and then applied to the
    dev texts, so both matrices share one vocabulary. Texts are tokenized
    with str.split (i.e. on whitespace).

    Returns:
        (train features, dev features, fitted vectorizer)
    """
    vectorizer = CountVectorizer(
        analyzer=str.split,
        binary=True,
        lowercase=False,
        max_features=10000,
        strip_accents=None,
    )

    # fit on train first, then reuse the learned vocabulary for dev
    train_features = vectorizer.fit_transform(trainX)
    dev_features = vectorizer.transform(devX)
    return train_features, dev_features, vectorizer

In [6]:
def train(X_train, trainY, le):
    """Fit an L2-regularized logistic regression model.

    Args:
        X_train: feature matrix handed to LogisticRegression.fit.
        trainY: raw labels; they are encoded with le.transform before fitting.
        le: label encoder exposing transform().

    Returns:
        The fitted LogisticRegression model. Callers that need the learned
        weights read them from the model's coef_ attribute.
    """
    Y_train=le.transform(trainY)
    logreg = linear_model.LogisticRegression(C=100, solver='lbfgs', penalty='l2', max_iter=10000)
    logreg.fit(X_train, Y_train)
    return logreg

In [7]:
def test(logreg, devX_feat, devY, le):
    Y_dev=le.transform(devY)
    print("Accuracy: %.3f" % logreg.score(devX_feat, Y_dev))

In [8]:
def analyze_weights(coefs, label_encoder, vocab, p_values, top_n=25):
    """Print the most extreme coefficients per class, with their p-values.

    The label encoded as 0 is printed first, followed by the top_n smallest
    (most negative) coefficients in ascending order. Then the label encoded
    as 1 is printed, followed by the top_n largest coefficients in descending
    order. Each row is "coefficient<TAB>feature<TAB>p-value".

    Args:
        coefs: sequence of coefficients, indexed by feature id.
        label_encoder: object exposing inverse_transform() for class ids.
        vocab: mapping from feature string to feature id.
        p_values: sequence of p-values, indexed by feature id.
        top_n: how many features to show per class (default 25).

    Raises:
        ValueError: if top_n is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    reverse_vocab = {v: k for k, v in vocab.items()}
    sort_index = np.argsort(coefs)

    def print_rows(indices):
        # one tab-separated row per feature id
        for k in indices:
            print("%.5f\t%s\t%.4f" % (coefs[k], reverse_vocab[k], p_values[k]))

    print(label_encoder.inverse_transform([0])[0])
    print_rows(sort_index[:top_n])

    print(label_encoder.inverse_transform([1])[0])
    # Reverse first, then slice: sort_index[-top_n:] would return the whole
    # array when top_n == 0.
    print_rows(sort_index[::-1][:top_n])

In [None]:
# Featurize the data and fit the label encoder on the training labels.
X_train, X_dev, vectorizer=featurize(trainX, devX)
le = preprocessing.LabelEncoder()
le.fit(trainY)

# Fit the reference model on the true labels and report its dev accuracy.
logreg=train(X_train, trainY, le)
test(logreg, X_dev, devY, le)

true_coefficients=logreg.coef_[0]
abs_true_coefficients=np.abs(true_coefficients)

# We'll set P=100 here to finish running in class, but set higher (e.g., 10000) for real applications
P=100

# exceed_counts[j] = number of permutations in which |coefficient j| learned from
# the permuted labels is at least as large as |coefficient j| from the true labels.
exceed_counts=np.zeros(len(true_coefficients), dtype=int)
# shuffle() works in place, so permute a copy and leave trainY untouched
permutedY=list(trainY)

for i in range(P):
    if i % 10 == 0:
        print(i)

    # permute the values of Y so that they're now attached to random data points in X
    shuffle(permutedY)

    # train logistic regression on that permuted dataset
    permuted_logreg=train(X_train, permutedY, le)
    coefficients=permuted_logreg.coef_[0]

    # count, for every feature at once, whether the permuted coefficient is at least
    # as extreme as the true one (>= so that ties count against significance)
    exceed_counts+=np.abs(coefficients) >= abs_true_coefficients

# two-sided p-value estimate per coefficient: fraction of permutations at least as extreme
# NOTE(review): (exceed_counts + 1) / (P + 1) is a common correction that avoids p == 0 -- consider it
p_values=exceed_counts/P

Accuracy: 0.863
0
10
20
30
40
50
60
70
80
90


In [None]:
# Dump every feature's coefficient and p-value to weights.txt, one
# tab-separated row per feature: coefficient, feature string, p-value.
inverse_vocab = {v: k for k, v in vectorizer.vocabulary_.items()}
# utf-8 matches the encoding read_data uses for the input; the with-block
# closes the file even if a write fails part-way through.
with open("weights.txt", "w", encoding="utf-8") as out:
    for idx, coef in enumerate(true_coefficients):
        out.write("%.3f\t%s\t%.5f\n" % (coef, inverse_vocab[idx], p_values[idx]))

In [11]:
# Show the most extreme coefficients for each class alongside their p-values.
analyze_weights(
    coefs=true_coefficients,
    label_encoder=le,
    vocab=vectorizer.vocabulary_,
    p_values=p_values,
)

D
-6.76251	remainder	0.0300
-6.55355	request	0.0000
-5.96585	quorum	0.0400
-5.48654	constitutional	0.1000
-5.27949	please	0.0300
-5.24354	rohrabacher	0.0800
-5.14597	objection	0.0100
-5.01229	briefly	0.0400
-4.92054	wish	0.0100
-4.81034	present	0.0000
-4.76935	entitled	0.0900
-4.62166	covers	0.1600
-4.59619	two	0.0500
-4.39108	leader	0.0200
-4.38605	michigan	0.1300
-4.27747	answering	0.2200
-4.22751	permission	0.0200
-4.09117	reserved	0.2800
-4.07308	although	0.2100
-4.06503	--	0.1100
-4.00915	offer	0.0400
-3.94305	sir	0.2400
-3.86242	looks	0.2200
-3.84841	occurred	0.1400
-3.80051	clerk	0.1700
R
8.55687	refuse	0.0000
6.96132	five	0.0000
6.17567	science	0.0200
6.13398	respond	0.0100
5.67848	left	0.0100
5.65184	confident	0.0700
5.61088	rhetorical	0.0500
5.58951	table	0.0900
5.08841	helping	0.0100
5.07008	judiciary	0.1100
4.97629	detained	0.0300
4.75859	whose	0.0600
4.61480	intent	0.0500
4.35216	prepared	0.1100
4.28200	response	0.0500
4.27023	..	0.2000
4.22933	asked	0.0900
4.11005	rose	0.