In [1]:
# Fetch the project repository into ./Sad_Depression_Classification (the dataset CSV is read from this checkout).
!git clone https://github.com/tiasa2/Sad_Depression_Classification.git

Cloning into 'Sad_Depression_Classification'...
remote: Enumerating objects: 38, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 38 (delta 19), reused 25 (delta 11), pack-reused 0[K
Unpacking objects: 100% (38/38), done.


In [2]:
!pip install transformers==3

Collecting transformers==3
  Downloading transformers-3.0.0-py3-none-any.whl (754 kB)
[K     |████████████████████████████████| 754 kB 4.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 37.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 52.2 MB/s 
Collecting tokenizers==0.8.0-rc4
  Downloading tokenizers-0.8.0rc4-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 43.7 MB/s 
Installing collected packages: tokenizers, sentencepiece, sacremoses, transformers
Successfully installed sacremoses-0.0.47 sentencepiece-0.1.96 tokenizers-0.8.0rc4 transformers-3.0.0


In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, AutoTokenizer

# Use the GPU when one is available; otherwise fall back to the CPU so the notebook still runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
class BERT_Arch(nn.Module):
    """Two-class classifier: a pretrained 'albert-base-v2' backbone plus a small MLP head.

    The head is Linear(768 -> 512) -> ReLU -> Dropout(0.1) -> Linear(512 -> 2) -> LogSoftmax,
    so `forward` returns log-probabilities over the two classes.
    """

    def __init__(self):
        super().__init__()
        # Backbone weights are fetched by name through transformers' AutoModel.
        self.bert = AutoModel.from_pretrained('albert-base-v2')
        # Classification head (attribute names are part of the saved state_dict keys).
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 2)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Score a batch of token ids.

        Args:
            sent_id: token-id tensor handed to the backbone.
            mask: attention mask passed to the backbone as `attention_mask`.

        Returns:
            Output of LogSoftmax(dim=1) applied to the 2-unit final layer.
        """
        # The backbone is expected to return exactly two values; only the second one feeds
        # the head (presumably the pooled sentence representation - confirm against the
        # installed transformers version).
        _sequence_out, pooled_out = self.bert(sent_id, attention_mask=mask)
        hidden = self.dropout(self.relu(self.fc1(pooled_out)))
        return self.softmax(self.fc2(hidden))

In [5]:
# Mount Google Drive at /content/drive so files stored there can be read and written by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Build the classifier, move it to the target device and restore the fine-tuned weights.
model = BERT_Arch()
model.to(device)
model.load_state_dict(torch.load('/content/drive/MyDrive/Sad_Depression_Classification/ALBERT_Depression_weights.pt', map_location=device))
# Switch to inference mode. A freshly constructed nn.Module is in training mode, which keeps
# the head's Dropout layer active and makes predictions random; torch.no_grad() alone does
# not turn dropout off.
model.eval()
# Tokenizer matching the pretrained backbone used inside BERT_Arch.
tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

In [7]:
# Read the dataset from the cloned repository, rename the text/label columns to the names
# used by the rest of the notebook, and store the labels as integers.
df = (
    pd.read_csv("./Sad_Depression_Classification/ACL_FINAL_Data.csv")
    .rename(columns={'Text': 'tweet', 'Label': 'target'})
    .assign(target=lambda frame: frame.target.astype(int))
)
df.head(20)

Unnamed: 0.1,Unnamed: 0,tweet,target
0,251,i feel guilty i wont be able to give this litt...,0
1,2295,There’s no point in even trying anymore. It’s ...,1
2,918,i was taught to complain and feel unhappy but ...,0
3,1990,There are probably serial killers who sleep ea...,1
4,1241,"His brown , nearly auburn hair clung to his sc...",0
5,1724,im feeling like the lunches are dull,0
6,726,i feel rotten but no amount of suggesting that...,0
7,1242,I am sad because some relations to friends are...,0
8,1132,The loss of a person I loved very much is some...,0
9,706,i forgive myself that i have accepted and allo...,0


In [8]:
def test_model(texts, MAX_LEN=25):
  """Run the global `model` on a batch of raw strings and return its outputs.

  Args:
    texts: strings to score; handed directly to `tokenizer.batch_encode_plus`.
    MAX_LEN: length every encoded sequence is padded or truncated to.

  Returns:
    NumPy array with the model's output for the batch, copied to the CPU.
  """
  encoded = tokenizer.batch_encode_plus(
      texts,
      add_special_tokens=True,
      max_length=MAX_LEN,
      pad_to_max_length=True,
      truncation=True,
      return_token_type_ids=False,
  )
  input_ids = torch.tensor(encoded['input_ids']).to(device)
  attention_mask = torch.tensor(encoded['attention_mask']).to(device)
  # Gradients are not needed for scoring.
  with torch.no_grad():
    outputs = model(input_ids, attention_mask)
  return outputs.detach().cpu().numpy()

In [9]:
def analyze(texts, labels):
  """Print a classification report comparing model predictions with `labels`.

  Args:
    texts: object exposing `.tolist()` that yields the input strings.
    labels: object exposing `.tolist()` that yields the ground-truth labels.
  """
  sentences, gold = texts.tolist(), labels.tolist()
  # The predicted class is the index of the largest score in each row.
  predicted = np.argmax(test_model(sentences), axis=1)
  print(classification_report(gold, predicted))

In [10]:
# First split: keep 80% for training and hold out 20%, stratified by label.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df['tweet'],
    df['target'],
    test_size=0.2,
    stratify=df['target'],
    random_state=2018,
)

# Second split of the held-out part: 90% becomes the validation set and 10% the test set,
# again stratified by label.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.1,
    stratify=temp_labels,
    random_state=2018,
)

In [11]:
# Print the classification report for the held-out test split.
print("Test")
analyze(texts=test_text, labels=test_labels)

Test
              precision    recall  f1-score   support

           0       0.87      0.85      0.86        39
           1       0.79      0.81      0.80        27

    accuracy                           0.83        66
   macro avg       0.83      0.83      0.83        66
weighted avg       0.83      0.83      0.83        66



In [12]:
!pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l[K     |█▏                              | 10 kB 18.9 MB/s eta 0:00:01[K     |██▍                             | 20 kB 9.7 MB/s eta 0:00:01[K     |███▋                            | 30 kB 7.9 MB/s eta 0:00:01[K     |████▊                           | 40 kB 3.7 MB/s eta 0:00:01[K     |██████                          | 51 kB 3.7 MB/s eta 0:00:01[K     |███████▏                        | 61 kB 4.4 MB/s eta 0:00:01[K     |████████▎                       | 71 kB 4.6 MB/s eta 0:00:01[K     |█████████▌                      | 81 kB 4.8 MB/s eta 0:00:01[K     |██████████▊                     | 92 kB 5.3 MB/s eta 0:00:01[K     |███████████▉                    | 102 kB 4.3 MB/s eta 0:00:01[K     |█████████████                   | 112 kB 4.3 MB/s eta 0:00:01[K     |██████████████▎                 | 122 kB 4.3 MB/s eta 0:00:01[K     |███████████████▌                | 133 kB 4.3 MB/s eta 0:00:01[K     |██████████

In [13]:
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
import time
from tqdm import trange
import torch

In [14]:
# LIME explainer for the two-class problem. A fixed random_state makes the perturbation
# sampling repeatable, so re-running the notebook draws the same perturbed samples.
explainer = LimeTextExplainer(class_names=["label:0", "label:1"], random_state=2018)

In [15]:
# Plain Python lists of the test sentences and their labels, for index-based access.
texts, labels = test_text.tolist(), test_labels.tolist()

In [16]:
def _lime_predict_proba(samples):
  """Adapter giving LIME class probabilities instead of log-probabilities.

  `test_model` returns the network's LogSoftmax output, while LIME's `classifier_fn`
  must return a (n_samples, n_classes) array of probabilities. Exponentiating the
  log-probabilities recovers the softmax probabilities.

  Args:
    samples: list of perturbed strings generated by LIME.

  Returns:
    NumPy array of per-class probabilities, one row per sample.
  """
  return np.exp(test_model(samples))


# Explain every test sentence and collect, per class, the (token, weight) pairs LIME reports.
d_array = []
for i in trange(len(texts)):
  exp = explainer.explain_instance(texts[i], _lime_predict_proba, num_features=6, labels=[0, 1], num_samples=500)
  d_array.append({"sentence":texts[i], "label:0": exp.as_list(label=0), "label:1": exp.as_list(label=1)})

100%|██████████| 66/66 [44:26<00:00, 40.40s/it]


In [17]:
# Persist the collected explanations (the list of per-sentence dicts) to Google Drive.
torch.save(d_array, '/content/drive/MyDrive/Sad_Depression_Classification/ALBERT_Depression_XAI.pt')