# XLNET_TWEETS

In [1]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [2]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


## 1.2. Installing the Hugging Face Library


In [3]:
!pip install transformers



# 2. Loading Dataset

In [4]:
# from google.colab import files
# uploaded = files.upload()

In [5]:
import pandas as pd

# Load the dataset into a pandas dataframe.
df = pd.read_csv("Final_data.csv", )

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# Display 10 random rows from the data.
df.sample(10)

Number of training sentences: 141,071



Unnamed: 0.1,Unnamed: 0,Tweets,Subjectivity,Polarity,Analysis
70888,70888,Bengal was corona free. Till an Oxford brat c...,0.4,0.2,1
25789,25789,Anyone who would like to make a donation to N...,0.0,0.0,0
21438,21438,Sigh,0.0,0.0,0
60707,60707,Can everyone just stay at home please so that...,0.5,0.366667,1
55259,55259,As invites citizen-participation in the figh...,0.0,0.0,0
42574,42574,This is for CORONA you big fat white nasty s...,0.366667,-0.333333,-1
103179,103179,ciara Me once I touch down on my first trip po...,0.311111,0.047222,1
128246,128246,"Dlaminii Not Pulp Fiction, so toilet paper is ...",0.25,-0.266667,-1
132877,132877,funny thing is im watching on vrv,1.0,0.25,1
90461,90461,Corona Dancing Instructions,0.0,0.0,0


In [6]:
df.drop(columns=['Unnamed: 0'],axis=1,inplace=True)

In [7]:
sentences  = []
for sentence in df['Tweets']:
  sentence = sentence+"[SEP] [CLS]"
  sentences.append(sentence)

In [8]:
sentences[0]

'Wuhan has been in complete quarantine for over 8 weeks. People here are still going about their daily lives as normal. This is pure fantasy. [SEP] [CLS]'

In [9]:
labels=df['Analysis'].values

In [11]:
# # Get the lists of sentences and their labels.
# sentences = df.sentence.values
# labels = df.label.values

##IMPORTING DEPENDENCIES

In [12]:
import transformers
from transformers import XLNetTokenizer, XLNetModel, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from collections import defaultdict
from textwrap import wrap
from pylab import rcParams

from torch import nn, optim
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset,RandomSampler,SequentialSampler
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

  import pandas.util.testing as tm
Using TensorFlow backend.


# 3. Tokenization & Input Formatting



## XLNET Tokenizer

In [13]:
from transformers import XLNetTokenizer

# Load the XLNet tokenizer.
print('Loading XLNet tokenizer...')
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',
                                           do_lower_case=True)


Loading XLNet tokenizer...


In [14]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
tokenized_text = [tokenizer.tokenize(sent) for sent in sentences]
ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]

***maximum length of our sentences***

In [15]:
print('Max sentence length: ', max([len(sen) for sen in ids]))
MAX_LEN=155

Max sentence length:  155


In [16]:
# We'll borrow the `pad_sequences` utility function to do this.
from keras.preprocessing.sequence import pad_sequences
MAX_LEN = 170

print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

input_ids2 = pad_sequences(ids, maxlen=MAX_LEN, dtype="long", 
                          value=0, truncating="post", padding="post")

print('\nDone.')


Padding/truncating all sentences to 170 values...

Padding token: "<pad>", ID: 5

Done.


In [17]:
# Use train_test_split to split our data into train and validation sets for
# training
from sklearn.model_selection import train_test_split

# Use 90% for training and 10% for validation.
xtrain,xtest,ytrain,ytest= train_test_split(input_ids2, labels, 
                                                    random_state=2018, 
                                                   test_size=0.15)

In [18]:
# Convert all inputs and labels into torch tensors, the required datatype 
# for our model.
Xtrain = torch.tensor(xtrain)
Ytrain = torch.tensor(ytrain)
Xtest = torch.tensor(xtest)
Ytest = torch.tensor(ytest)

In [19]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here.
# For fine-tuning XLNET on a specific task, the authors recommend a batch size of
# 48.

batch_size = 4

# Create the DataLoader for our training set.
train_data = TensorDataset(Xtrain,Ytrain)
loader = DataLoader(train_data,batch_size=batch_size)

# Create the DataLoader for our test set.
test_data = TensorDataset(Xtest,Ytest)
test_loader = DataLoader(test_data,batch_size=batch_size)

In [20]:
from transformers import XLNetForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = XLNetForSequenceClassification.from_pretrained(
    "xlnet-base-cased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   

)

# Tell pytorch to run this model on the GPU.
model.cuda()

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e

In [21]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )


## Training the model

In [22]:
import torch.nn as nn
criterion = nn.CrossEntropyLoss()

In [23]:
import numpy as np
def flat_accuracy(preds,labels):  # A function to predict Accuracy
  correct=0
  for i in range(0,len(labels)):
    if(preds[i]==labels[i]):
      correct+=1
  return (correct/len(labels))*100

In [24]:
no_train=0
epochs = 1
for epoch in range(epochs):
  print("TRAINING EPOCH ",epoch)
  model.train()
  loss1 = []
  steps = 0
  train_loss = []
  l = []
  for inputs,labels1 in loader :
    inputs.to(device)
    labels1.to(device)
    optimizer.zero_grad()
    outputs = model(inputs.to(device))
    loss = criterion(outputs[0],labels1.to(device)).to(device)
    # logits = outputs[1]
    #ll=outp(loss)
    [train_loss.append(p.item()) for p in torch.argmax(outputs[0],axis=1).flatten() ]#our predicted 
    [l.append(z.item()) for z in labels1]# real labels
    loss.backward()
    optimizer.step()
    loss1.append(loss.item())
    no_train += inputs.size(0)
    steps += 1
  print("Current Loss is : {} Step is : {} number of Example : {} Accuracy : {}".format(loss.item(),epoch,no_train,flat_accuracy(train_loss,l)))

TRAINING EPOCH  0
Current Loss is : 0.0257415771484375 Step is : 0 number of Example : 119910 Accuracy : 94.25652572762905
