In [1]:
!git clone https://github.com/tiasa2/Sad_Depression_Classification.git

fatal: destination path 'Sad_Depression_Classification' already exists and is not an empty directory.


In [2]:
!pip install transformers==3



In [3]:
# Importing libraries
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np

# Use GPU
device = torch.device("cpu")

In [4]:
class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 4)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistillBERTClass()
model.to(device)
model.load_state_dict(torch.load('/content/drive/MyDrive/Sad_Depression_Classification/DistilBert_Baseline_weights.pt', map_location=device))

<All keys matched successfully>

In [7]:
df = pd.read_csv("./Sad_Depression_Classification/ACL_FINAL_Data.csv")
df.rename(columns = {'Text':'tweet', 'Label':'target'}, inplace = True)
df.target = df.target.astype(int)
df.head(20)

Unnamed: 0.1,Unnamed: 0,tweet,target
0,251,i feel guilty i wont be able to give this litt...,0
1,2295,There’s no point in even trying anymore. It’s ...,1
2,918,i was taught to complain and feel unhappy but ...,0
3,1990,There are probably serial killers who sleep ea...,1
4,1241,"His brown , nearly auburn hair clung to his sc...",0
5,1724,im feeling like the lunches are dull,0
6,726,i feel rotten but no amount of suggesting that...,0
7,1242,I am sad because some relations to friends are...,0
8,1132,The loss of a person I loved very much is some...,0
9,706,i forgive myself that i have accepted and allo...,0


In [8]:
def test_model(texts, MAX_LEN=512):
  tokens_p = tokenizer.batch_encode_plus(
      texts,
      add_special_tokens=True,
      max_length = MAX_LEN,
      pad_to_max_length=True,
      truncation=True,
      return_token_type_ids=False
  )
  p_seq = torch.tensor(tokens_p['input_ids'])
  p_mask = torch.tensor(tokens_p['attention_mask'])
  with torch.no_grad():
    preds = model(p_seq.to(device), p_mask.to(device))
    preds = preds.detach().cpu().numpy()
    return preds

In [9]:
def analyze(texts, labels):
  texts = texts.tolist()
  labels = labels.tolist()
  preds = test_model(texts)
  preds = np.argmax(preds, axis = 1)
  print(classification_report(labels, preds))

In [10]:
train_text, temp_text, train_labels, temp_labels = train_test_split(df['tweet'], df['target'], 
                                                                    random_state=2018, 
                                                                    test_size=0.2, 
                                                                    stratify=df['target'])

# Using temp_text and temp_labels to create validation and test set
val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, 
                                                                random_state=2018, 
                                                                test_size=0.1, 
                                                                stratify=temp_labels)

In [11]:
print("Test")
analyze(test_text, test_labels)

Test
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        39
           1       1.00      0.85      0.92        27

    accuracy                           0.94        66
   macro avg       0.95      0.93      0.94        66
weighted avg       0.95      0.94      0.94        66



In [12]:
!pip install lime



In [13]:
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
import time
from tqdm import trange
import torch

In [14]:
explainer = LimeTextExplainer(class_names=["label:0", "label:1"])

In [15]:
texts = test_text.tolist()
labels = test_labels.tolist()

In [16]:
d_array = []
for i in trange(len(texts)):
  exp = explainer.explain_instance(texts[i], test_model, num_features=6, labels=[0, 1], num_samples=100)
  d_array.append({"sentence":texts[i], "label:0": exp.as_list(label=0), "label:1": exp.as_list(label=1)})

100%|██████████| 66/66 [1:37:12<00:00, 88.37s/it]


In [17]:
torch.save(d_array, '/content/drive/MyDrive/Sad_Depression_Classification/DistilBert_Baseline_XAI.pt')