In [1]:
!pip install -U torchtext==0.9.0
!pip install -U captum



In [2]:
import captum

import spacy

import torch
import torchtext
from torchtext import data
from torchtext import datasets


import torch.nn as nn
import torch.nn.functional as F

from torchtext.vocab import Vocab

from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization

nlp = spacy.load('en')

In [3]:
for package in (captum, spacy, torch, torchtext):
    print(package.__name__, package.__version__)

captum 0.4.0
spacy 2.2.4
torch 1.8.0
torchtext 0.9.0


In [4]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [5]:
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets
import random
import numpy as np

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy', 
                  tokenizer_language = 'en_core_web_sm',
                  batch_first = True)
LABEL = data.LabelField(dtype = torch.float)

In [6]:
from torchtext.legacy.data import Field, Dataset,TabularDataset, BucketIterator, Iterator
train, validation, test = TabularDataset.splits(fields=[('text', TEXT), ('label', LABEL)],
                                      train='train_inc.csv',
                                      validation='val_inc.csv',
                                      test='test_inc.csv',
                                      format='CSV',
                                      # skip_header = True,
                                      path='/content/drive/MyDrive/Colab Notebooks/education project/data')

In [7]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train)

In [8]:
BATCH_SIZE = 64
train_iterator = BucketIterator(train, batch_size=BATCH_SIZE, sort_key=lambda x: len(x.text),
                            device=device, train=True, sort=True, sort_within_batch=True)
valid_iterator = BucketIterator(validation, batch_size=BATCH_SIZE, sort_key=lambda x: len(x.text),
                            device=device, train=True, sort=True, sort_within_batch=True)
test_iterator = Iterator(test, batch_size=BATCH_SIZE, device=device, train=False, shuffle=False, sort=False)

In [9]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.conv_0 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[0], embedding_dim))
        
        self.conv_1 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[1], embedding_dim))
        
        self.conv_2 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[2], embedding_dim))
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3))
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3))
        conved_2 = F.relu(self.conv_2(embedded).squeeze(3))
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [10]:

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] ## if length < max_len

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [11]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,620,801 trainable parameters


In [12]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [ 0.7358, -0.3762,  0.7767,  ...,  0.3431,  0.2294,  0.8752],
        [-0.1153, -0.1165,  1.0749,  ...,  0.8496, -0.6487,  0.1407],
        [-0.5031, -1.1681, -0.2637,  ...,  1.2091, -1.6303, -0.8331]])

In [13]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)


In [14]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [15]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [16]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [17]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [18]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


In [19]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/education project/models/tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 1s
	Train Loss: 0.714 | Train Acc: 54.59%
	 Val. Loss: 0.594 |  Val. Acc: 72.96%
Epoch: 02 | Epoch Time: 0m 1s
	Train Loss: 0.569 | Train Acc: 70.69%
	 Val. Loss: 0.548 |  Val. Acc: 75.70%
Epoch: 03 | Epoch Time: 0m 1s
	Train Loss: 0.485 | Train Acc: 77.00%
	 Val. Loss: 0.520 |  Val. Acc: 76.69%
Epoch: 04 | Epoch Time: 0m 1s
	Train Loss: 0.428 | Train Acc: 80.38%
	 Val. Loss: 0.499 |  Val. Acc: 77.66%
Epoch: 05 | Epoch Time: 0m 1s
	Train Loss: 0.369 | Train Acc: 84.56%
	 Val. Loss: 0.486 |  Val. Acc: 78.51%
Epoch: 06 | Epoch Time: 0m 1s
	Train Loss: 0.310 | Train Acc: 87.22%
	 Val. Loss: 0.479 |  Val. Acc: 78.37%
Epoch: 07 | Epoch Time: 0m 1s
	Train Loss: 0.256 | Train Acc: 90.91%
	 Val. Loss: 0.478 |  Val. Acc: 77.94%
Epoch: 08 | Epoch Time: 0m 1s
	Train Loss: 0.203 | Train Acc: 93.53%
	 Val. Loss: 0.511 |  Val. Acc: 76.10%
Epoch: 09 | Epoch Time: 0m 1s
	Train Loss: 0.165 | Train Acc: 95.22%
	 Val. Loss: 0.521 |  Val. Acc: 76.23%
Epoch: 10 | Epoch Time: 0m 1

In [20]:
model.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/education project/models/tut4-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.468 | Test Acc: 78.18%


In [21]:
import spacy
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence, min_len = 5):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [22]:
print('Vocabulary Size: ', len(TEXT.vocab))

Vocabulary Size:  25002


In [23]:
PAD_IND = TEXT.vocab.stoi[TEXT.pad_token]
token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)
lig = LayerIntegratedGradients(model, model.embedding)

In [24]:
def forward_with_sigmoid(input):
    return torch.sigmoid(model(input))

In [25]:
# accumalate couple samples in this array for visualization purposes
vis_data_records_ig = []

def interpret_sentence(model, sentence, min_len = 500, label = 0):
    text = [tok.text for tok in nlp.tokenizer(sentence.lower())]
    if len(text) < min_len:
        text += [TEXT.pad_token] * (min_len - len(text))
    indexed = [TEXT.vocab.stoi[t] for t in text]

    model.zero_grad()

    input_indices = torch.tensor(indexed, device=device)
    input_indices = input_indices.unsqueeze(0)
    
    # input_indices dim: [sequence_length]
    seq_length = min_len

    # predict
    pred = forward_with_sigmoid(input_indices).item()
    pred_ind = round(pred)

    # generate reference indices for each sample
    reference_indices = token_reference.generate_reference(seq_length, device=device).unsqueeze(0)

    # compute attributions and approximation delta using layer integrated gradients
    attributions_ig, delta = lig.attribute(input_indices, reference_indices, \
                                           n_steps=min_len, return_convergence_delta=True)

    # print('pred: ', Label.vocab.itos[pred_ind], '(', '%.2f'%pred, ')', ', delta: ', abs(delta))

    add_attributions_to_visualizer(attributions_ig, text, pred, pred_ind, label, delta, vis_data_records_ig)
    return(pred_ind,label)
    
def add_attributions_to_visualizer(attributions, text, pred, pred_ind, label, delta, vis_data_records):
    attributions = attributions.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.cpu().detach().numpy()

    # storing couple samples in an array for visualization purposes

    vis_data_records.append(visualization.VisualizationDataRecord(
                            attributions,
                            pred,
                            LABEL.vocab.itos[pred_ind],
                            LABEL.vocab.itos[label],
                            LABEL.vocab.itos[1],
                            attributions.sum(),       
                            text,
                            delta))

## Return significant words indicating Poor/Rich Schools that appear in the test data set

In [62]:
embeding_dic = {}
for index,row in test_data.iterrows():
    Index_test = []
    sentence = test_data.loc[index,"text"]
    text = [tok.text for tok in nlp.tokenizer(sentence.lower())]
    if len(text) < 500:
        text += [TEXT.pad_token] * (500 - len(text))
    indexed = [TEXT.vocab.stoi[t] for t in text]
    model.zero_grad()
    input_indices = torch.tensor(indexed, device=device)
    input_indices = input_indices.unsqueeze(0)
    reference_indices = token_reference.generate_reference(len(indexed), device=device).unsqueeze(0)
    attributions_ig, delta = lig.attribute(input_indices, reference_indices, \
                                              n_steps=500, return_convergence_delta=True)
    attributions = attributions_ig.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.cpu().detach().numpy()
    for i in range(500):
        try:
            embeding_dic[text[i]].append(attributions[i])
        except KeyError:
            print(text[i])
            embeding_dic[text[i]] = [attributions[i]]


[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
spans
arroyo
hs
revitalizing
chart
borders
reinvesting
graders-
failed
slipped
cracks
direct
path-
relentlessly-
racial
minorities
learning-
demonstrating
gains
scoring
fcat
stickers
trackers
lined
unlined
galleries
to/
read/
setup
hardest
slip
secured
drums
classesplease
drum
covering
scales
improv
blues
assortment
woodwind
reeds
xylophone
mallets
recorder
pianos
percussion
guitars
ensemble
visits
artists
percussionist
vocalist
guitar
concerts
afro
hiphop
chorus
talented
performed
bcc
talent
concert
may
influenced
latin
choral
vandross
ensembles
rhythmic
improvisation
legs
expanding
section
antiquated
quit
accompaniment
tunes
drummers
pianists
guitarists
vulnerable
sharpeners
sharpens
walls
functional
consuming
365
assessments
recommended
supplying
versus
folders
2008
purchasing
drastically
caused
reduce
09
alls
saturday
..
bribes
cheers
rush
excitedly
claim
breath
footballs
jackets
assorted
maybe
traffic
lying
driving
cafeteria
bribe
baskets
1

In [64]:
avgDict = {}
for k,v in embeding_dic.items():
    # v is the list of grades for student k
    avgDict[k] = sum(v)/ float(len(v))
len(avgDict)

10489

In [68]:
words_imp = [(k, v) for k, v in avgDict.items()]
words_imp.sort(key=lambda x: x[1])
words_imp

[('all-', -0.7862592333608618),
 ('.your', -0.6858396910465747),
 ('9x12', -0.6068367049341133),
 ('.our', -0.5489920267365866),
 ('disabilites', -0.5410623376985121),
 ('   ', -0.47771939693790233),
 ('expeditionary', -0.44160510406448034),
 ('life-', -0.4289546258995356),
 ('fullsize', -0.42025771451755206),
 ('imovies', -0.4148094018552676),
 ('tinikling', -0.39818955434026315),
 ('.do', -0.36537111591294263),
 ('indefinitely', -0.35264673274637054),
 ('adapter', -0.3447658280048442),
 ('  ', -0.3442406548626209),
 ('-5th', -0.33366859323407294),
 ('7/8th', -0.3242465591839269),
 ('-the', -0.3165988721113854),
 ('robot', -0.30875641390468167),
 ('rainsticks', -0.2867498054719537),
 ('owns', -0.2843483149682936),
 ('bubbly', -0.27961047260907157),
 ('.an', -0.27256694179727337),
 ('firsties', -0.27119263031503854),
 ('.everyday', -0.2701659704062091),
 ('unifix', -0.2443211425278393),
 ('.we', -0.24423546637505184),
 ('.please', -0.2438538307688157),
 ('farms', -0.23728411842486336),

In [71]:
print([word[0] for word in words_imp][:100])
print([word[0] for word in words_imp][-100:])

['all-', '.your', '9x12', '.our', 'disabilites', '   ', 'expeditionary', 'life-', 'fullsize', 'imovies', 'tinikling', '.do', 'indefinitely', 'adapter', '  ', '-5th', '7/8th', '-the', 'robot', 'rainsticks', 'owns', 'bubbly', '.an', 'firsties', '.everyday', 'unifix', '.we', '.please', 'farms', 'orally', 'eleventh', 'hydrogen', 'cooperatively', 'substances', 'mics', 'analog', 'continuously', '250-student', '�', 'sensor', 'departmentalizing', 'conflict', 'dyslexic', 'asthma', 'overage', 'helix', 'encountering', 'smartboard', '4th/5th', 'anything', 'tripod', 'pulls', '.if', 'biological', 'rectangular', 'links', 'cords', 'molecular', 'digitally', 'djembes', 'electronically', 'developmentally', 'modules', 'ac', 'compression', 'tunnel', 'handheld', 'cup', 'chips', 'federal', 'consecutive', 'mounting', 'artistically', 'district', 'annotating', 'simulates', 'globes', 'regulate', 'spanning', 'reproducible', 'host', 'hotplates', 'horizons', 'photosynthesis', 'mallets', 'effectiveness', 'downloaded

In [27]:
import pandas as pd
test_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/education project/data/test_inc.csv")
test_data.head()

Unnamed: 0,text,label
0,A Spot Sit Upon !Do remember gathering rug lis...,0
1,I Can Thanks To iPadBe Change '' theme school ...,0
2,"More Personal Finance Third GradersLast year ,...",0
3,Wow ! What Is That ?With document camera added...,1
4,Developing Language Through PlayPlaying dinosa...,0


In [72]:
result = []
for i in range(test_data.shape[0]):
  x = interpret_sentence(model, test_data.loc[i,"text"], label=test_data.loc[i,"label"])
  result.append(x)

Classify rich and poor 
what explains - linear probability model
interpretability - top embedings

In [73]:
# print('Visualize attributions based on Integrated Gradients')
# _ = visualization.visualize_text(vis_data_records_ig[60:80])

In [74]:
from sklearn.datasets import make_circles
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

predict = []
actual = []
for i in result:
  predict.append(i[0])
  actual.append(i[1])

accuracy = accuracy_score(actual, predict)
print('Accuracy: %f' % accuracy)
precision = precision_score(actual, predict)
print('Precision: %f' % precision)
recall = recall_score(actual, predict)
print('Recall: %f' % recall)
f1 = f1_score(actual, predict)
print('F1 score: %f' % f1)
matrix = confusion_matrix(actual, predict)
print(matrix)

Accuracy: 0.738095
Precision: 0.785942
Recall: 0.692958
F1 score: 0.736527
[[250  67]
 [109 246]]
