In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset and checkpoints stored there can be reached by path.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
%cd drive/My\ Drive/DL/assgn2
!ls

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve,auc
import matplotlib.pyplot as plt
from string import punctuation
from collections import Counter
import torch
from torch.utils.data import DataLoader,TensorDataset
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
import pickle
import re
import random
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# Seed every random number generator the notebook draws from: Python's
# `random`, NumPy (random embedding rows) and PyTorch (weight init, shuffling).
SEED=2020
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [0]:
def pad_tweets(tweets_int,seq_length):
  """Right-pad (or truncate) integer token sequences to a fixed length.

  Args:
    tweets_int: sequence of token-id sequences, one per tweet.
    seq_length: number of columns in the returned matrix.

  Returns:
    Integer array of shape (len(tweets_int), seq_length). Each row holds the
    tweet's first `seq_length` token ids followed by zeros.
  """
  features=np.zeros((len(tweets_int),seq_length),dtype=int)
  for i,tweet in enumerate(tweets_int):
    # Truncate sequences longer than seq_length; previously such a row hit an
    # undefined (or stale) `padded_tweet` and raised.
    clipped=list(tweet)[:seq_length]
    if clipped:
      features[i,:len(clipped)]=clipped
  return features

def train(model,iterator,optimiser,criterion):
  """Run one training epoch.

  Args:
    model: module called as model(x); its output is indexed as
      (batch, n_classes) here.
    iterator: sized iterable of batches where batch[0] is the input and
      batch[1] the integer class labels.
    optimiser: optimiser stepped once per batch.
    criterion: loss called as criterion(pred, one_hot_targets).

  Returns:
    (mean batch loss, mean batch accuracy) over the iterator.
  """
  epoch_loss=0
  epoch_accuracy=0
  model.train()
  for batch in iterator:
    x=batch[0]
    y=batch[1]
    optimiser.zero_grad()
    pred=model(x)
    # One-hot encode on the prediction's device; the class count comes from the
    # model output instead of a hard-coded 13 and no CPU round-trip is needed.
    y=F.one_hot(y.long().to(pred.device),num_classes=pred.size(1)).float()
    loss=criterion(pred,y)
    acc=accuracy(pred,y)
    loss.backward()
    optimiser.step()
    epoch_loss+=loss.item()
    epoch_accuracy+=acc
  return epoch_loss/len(iterator),epoch_accuracy/len(iterator)

def evaluate(model,iterator,criterion):
  """Evaluate the model for one pass over `iterator` without gradient tracking.

  Args:
    model: module called as model(x); its output is indexed as
      (batch, n_classes) here.
    iterator: sized iterable of batches where batch[0] is the input and
      batch[1] the integer class labels.
    criterion: loss called as criterion(pred, one_hot_targets).

  Returns:
    (mean batch loss, mean batch accuracy) over the iterator.
  """
  model.eval()
  epoch_loss=0
  epoch_accuracy=0
  with torch.no_grad():
    for batch in iterator:
      x=batch[0]
      y=batch[1]
      pred=model(x)
      # One-hot encode on the prediction's device; the class count comes from
      # the model output instead of a hard-coded 13.
      y=F.one_hot(y.long().to(pred.device),num_classes=pred.size(1)).float()
      loss=criterion(pred,y)
      acc=accuracy(pred,y)
      epoch_loss+=loss.item()
      epoch_accuracy+=acc
  return epoch_loss/len(iterator),epoch_accuracy/len(iterator)

def accuracy(pred,y):
  """Fraction of rows whose arg-max in `pred` equals the arg-max in `y`.

  Args:
    pred: 2-D tensor of per-class scores, one row per sample.
    y: 2-D tensor of one-hot (or score) targets with the same layout.

  Returns:
    Python float in [0, 1]; 0.0 when `y` is empty.
  """
  n_samples=len(y)
  if n_samples==0:
    # Avoid ZeroDivisionError on an empty batch.
    return 0.0
  # One vectorised comparison and a single .item() instead of one per sample.
  hits=(torch.argmax(pred,dim=1)==torch.argmax(y,dim=1)).sum().item()
  return hits/n_samples

def roc(gt,pred):
  """Plot one-vs-rest ROC curves per class plus their mean with a +/-1 std band.

  Args:
    gt: 1-D array-like of ground-truth class labels.
    pred: 1-D array-like of predicted class labels (the callers in this
      notebook pass arg-max class indices, not scores).

  The set of classes is taken from `gt`. Each class column is indexed as
  gt[:,i], so this presumably needs more than two classes (label_binarize
  returns a single column for binary problems) - TODO confirm if reused.
  """
  # Sorted unique labels give a deterministic class order; set() ordering is arbitrary.
  classes=np.unique(gt)
  # `classes` must be passed by keyword: newer scikit-learn rejects it positionally.
  gt=label_binarize(gt,classes=classes)
  pred=label_binarize(pred,classes=classes)
  plt.figure()
  fpr=dict()
  tpr=dict()
  roc_auc=dict()
  n_classes=len(classes)
  # Common FPR grid onto which every per-class curve is interpolated for averaging.
  mean_fpr=np.linspace(0,1,100)
  tprs=[]
  aucs=[]
  for i in range(n_classes):
    fpr[i],tpr[i],_=roc_curve(gt[:,i],pred[:,i])
    roc_auc[i]=auc(fpr[i],tpr[i])
    aucs.append(roc_auc[i])
    tprs.append(np.interp(mean_fpr,fpr[i],tpr[i]))
    # Force each interpolated curve to start at the origin.
    tprs[-1][0]=0.0
    plt.plot(fpr[i],tpr[i],lw=1,alpha=0.3)
  mean_tpr=np.mean(tprs,axis=0)
  # Force the mean curve to end at (1, 1).
  mean_tpr[-1]=1.0
  mean_auc=auc(mean_fpr,mean_tpr)
  std_auc=np.std(aucs)
  plt.plot(mean_fpr,mean_tpr,color='b',label=r'Mean ROC (AUC=%0.2f $\pm$ %0.2f)'%(mean_auc,std_auc),lw=2,alpha=0.8)
  std_tpr=np.std(tprs,axis=0)
  tprs_upper=np.minimum(mean_tpr+std_tpr,1)
  tprs_lower=np.maximum(mean_tpr-std_tpr,0)
  plt.fill_between(mean_fpr,tprs_lower,tprs_upper,color='grey',alpha=0.2,label=r'$\pm$ 1 std. dev.')

  # Chance-level diagonal for reference.
  plt.plot([0,1],[0,1],color='navy',alpha=0.8,lw=2,linestyle='--')
  plt.xlim([0.0,1.0])
  plt.ylim([0.0,1.05])
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.title('ROC curve')
  plt.legend(loc='lower right')
  plt.show()

In [0]:
# Read the training CSV from the current working directory and preview it.
data=pd.read_csv('train_data.csv')
data.head()

In [0]:
# The second column is used as the tweet text and the first column as the label.
# NOTE(review): `.values` may return a view of the DataFrame, and later cells
# modify `tweets` element-wise in place - confirm `data` is not reused raw.
tweets=data.iloc[:,1].values
labels=data.iloc[:,0].values
print(tweets.shape,labels.shape)
print(tweets)

In [0]:
# Remove every "@handle " mention (an @, optional word characters, then a space).
# A single substitution per tweet removes all mentions; the previous loop
# restarted from the original text for each match, so only the last distinct
# mention was actually dropped. A mention at the very end of a tweet has no
# trailing space and is left in place, as before.
mention_pattern=re.compile(r'@\w* ')
for i in range(len(tweets)):
  tweets[i]=mention_pattern.sub('',tweets[i])
print(tweets)

In [0]:
# Lower-case each tweet, then drop every character found in string.punctuation.
for idx,raw_tweet in enumerate(tweets):
  lowered=raw_tweet.lower()
  tweets[idx]=''.join(filter(lambda ch: ch not in punctuation,lowered))
print(tweets)

In [0]:
# Token frequency table over all tweets. most_common(n_words) asks for at least
# as many entries as there are distinct tokens, so every token is returned,
# ordered from most to least frequent.
text=' '.join(tweets)
words=text.split()
counts_words=Counter(words)
n_words=len(words)
x_sorted_words=counts_words.most_common(n_words)
print(counts_words)

# Same for the labels. `text`, `words`, `counts_words` and `n_words` are
# overwritten here. Labels are split on whitespace, so a multi-word label
# would be counted as separate entries.
text=' '.join(labels)
words=text.split()
counts_words=Counter(words)
n_words=len(words)
y_sorted_words=counts_words.most_common(n_words)
print(counts_words)

In [0]:
# Token -> index, most frequent token first. Indices start at 1 so that 0 stays
# free for the padding value written by pad_tweets.
tweets_vocab={w:i+1 for i,(w,c) in enumerate(x_sorted_words)}
# Give '<unk>' the next free index. Using len(tweets_vocab) here reused the
# index of the least frequent token; +1 puts it one past the last real token.
tweets_vocab['<unk>']=len(tweets_vocab)+1
print(tweets_vocab)
# Label -> class index, most frequent label first, starting at 0.
labels_vocab={w:i for i,(w,c) in enumerate(y_sorted_words)}
print(labels_vocab)

In [0]:
# Map every whitespace-separated token of every tweet to its vocabulary index.
tweets_tokenise=[
  [tweets_vocab[token] for token in text_row.split()]
  for text_row in tweets
]
print(tweets_tokenise[0:3])

In [0]:
# Translate each label string into its integer class index.
labels_tokenise=np.array([labels_vocab[label] for label in labels])
print(labels_tokenise)
print(labels_tokenise.shape)

In [0]:
# Token count per tweet; the longest one fixes the padded sequence length.
tweets_len=list(map(len,tweets_tokenise))
max_len=max(tweets_len)

In [0]:
# Replace the ragged token lists with a fixed-width array of max_len columns.
# NOTE: `tweets_tokenise` changes type here (list of lists -> array), so the
# previous cell cannot be meaningfully re-run after this one.
tweets_tokenise=pad_tweets(tweets_tokenise,max_len)
print(tweets_tokenise)
print(tweets_tokenise.shape)

In [0]:
# Fraction of rows used for training. Named `train_fraction` so it no longer
# shadows sklearn's `train_test_split`, which is imported at the top of the notebook.
train_fraction=0.8
assert tweets_tokenise.shape[0]==labels_tokenise.shape[0],'features and labels differ in length'
# Single split index shared by features and labels (rows are split in file order, no shuffling).
n_train=int(train_fraction*tweets_tokenise.shape[0])
x_train=tweets_tokenise[:n_train]
y_train=labels_tokenise[:n_train]

x_val=tweets_tokenise[n_train:]
y_val=labels_tokenise[n_train:]

print(x_train.shape,y_train.shape,x_val.shape,y_val.shape)

In [0]:
# Wrap the arrays as (input, label) tensor datasets placed on `device`
# (the device selected at the top of the notebook) rather than calling .cuda() unconditionally.
train_df=TensorDataset(torch.from_numpy(x_train).long().to(device),torch.from_numpy(y_train).long().to(device))
val_df=TensorDataset(torch.from_numpy(x_val).long().to(device),torch.from_numpy(y_val).long().to(device))

batch_size=50
train_loader=DataLoader(train_df,shuffle=True,batch_size=batch_size)
# Validation data is not shuffled: its batch composition stays the same across epochs.
val_loader=DataLoader(val_df,shuffle=False,batch_size=batch_size)

In [0]:
class Attention(nn.Module):
  """Additive attention over bidirectional encoder outputs.

  Scores each time step with v(tanh(W[hidden; encoder_output])) and normalises
  the scores with a softmax over the sequence dimension.
  """
  def __init__(self,encoder_hidden_dim,decoder_hidden_dim):
    super(Attention,self).__init__()
    self.encoder_hidden_dim=encoder_hidden_dim
    self.decoder_hidden_dim=decoder_hidden_dim
    # Input is the decoder state concatenated with a 2*encoder_hidden_dim encoder output.
    self.attn=nn.Linear((encoder_hidden_dim*2)+decoder_hidden_dim,decoder_hidden_dim)
    self.v=nn.Linear(decoder_hidden_dim,1,bias=False)
  
  def forward(self,encoder_outputs,hidden):
    """Return attention weights of shape (batch, seq_len).

    Args:
      encoder_outputs: tensor (batch, seq_len, 2*encoder_hidden_dim).
      hidden: tensor (batch, decoder_hidden_dim).
    """
    # Take the sequence length from the input itself rather than the notebook
    # global `max_len`, so the module works for any sequence length.
    seq_len=encoder_outputs.size(1)
    hidden=hidden.unsqueeze(1).expand(-1,seq_len,-1)
    x=torch.cat((hidden,encoder_outputs),dim=2)
    x=self.attn(x)
    x=torch.tanh(x)
    attention=self.v(x).squeeze(2)
    attention=F.softmax(attention,dim=1)
    return attention

class RNNSentimentAnalysis(nn.Module):
  """GRU encoder / attention / single-step GRU decoder classifier.

  forward() takes a batch of token-id sequences and returns a softmax
  distribution over `output_dim` classes for each sequence.
  """
  def __init__(self,batch_size,output_dim,encoder_hidden_dim,decoder_hidden_dim,vocab_size,embedding_dim,dropout_p):
    super().__init__()
    # Hyper-parameters are kept on the instance for reference.
    self.batch_size=batch_size
    self.output_dim=output_dim
    self.encoder_hidden_dim=encoder_hidden_dim
    self.decoder_hidden_dim=decoder_hidden_dim
    self.vocab_size=vocab_size
    self.embedding_dim=embedding_dim
    self.dropout_p=dropout_p

    # The encoder is bidirectional, so its per-step output is twice as wide.
    enc_out_dim=encoder_hidden_dim*2
    self.embedding=nn.Embedding(vocab_size,embedding_dim)
    self.enc_rnn=nn.GRU(embedding_dim,encoder_hidden_dim,bidirectional=True)
    self.enc_fc=nn.Linear(enc_out_dim,decoder_hidden_dim)
    self.attention=Attention(encoder_hidden_dim,decoder_hidden_dim)
    self.dec_rnn=nn.GRU(enc_out_dim,decoder_hidden_dim)
    self.dropout=nn.Dropout(self.dropout_p)
    self.dec_fc=nn.Linear(enc_out_dim+decoder_hidden_dim,output_dim)

  def forward(self,x):
    # Encoder: embed, move the sequence axis first (GRU default layout), encode.
    embedded=self.embedding(x).transpose(0,1)
    enc_output,final_states=self.enc_rnn(embedded)
    # Join the final states of both directions and project to the decoder width.
    summary=torch.cat((final_states[0],final_states[1]),dim=1)
    dec_init=torch.tanh(self.enc_fc(summary))

    # Attention: weights over the time steps of the batch-first encoder outputs.
    enc_output=enc_output.transpose(0,1)
    attn=self.attention(enc_output,dec_init).unsqueeze(1)

    # Decoder: one GRU step on the attention-weighted context vector.
    context=torch.bmm(attn,enc_output).transpose(0,1)
    dec_output,_=self.dec_rnn(context,dec_init.unsqueeze(0))
    features=torch.cat((dec_output.squeeze(0),context.squeeze(0)),dim=1)
    logits=self.dec_fc(self.dropout(features))
    return F.softmax(logits,dim=1)

class LSTMSentimentAnalysis(nn.Module):
  """LSTM encoder / attention / single-step LSTM decoder classifier.

  forward() takes a batch of token-id sequences and returns a softmax
  distribution over `output_dim` classes for each sequence.
  """
  def __init__(self,batch_size,output_dim,encoder_hidden_dim,decoder_hidden_dim,vocab_size,embedding_dim,dropout_p):
    super(LSTMSentimentAnalysis,self).__init__()
    self.batch_size=batch_size
    self.output_dim=output_dim
    self.encoder_hidden_dim=encoder_hidden_dim
    self.decoder_hidden_dim=decoder_hidden_dim
    self.vocab_size=vocab_size
    self.embedding_dim=embedding_dim
    self.dropout_p=dropout_p
    
    self.embedding=nn.Embedding(vocab_size,embedding_dim)
    self.enc_lstm=nn.LSTM(embedding_dim,encoder_hidden_dim,bidirectional=True)
    self.enc_dropout=nn.Dropout(self.dropout_p)
    # Projects the concatenated forward/backward final states to the decoder width.
    self.enc_fc=nn.Linear(encoder_hidden_dim*2,decoder_hidden_dim)
    self.attention=Attention(encoder_hidden_dim,decoder_hidden_dim)
    self.dec_lstm=nn.LSTM(encoder_hidden_dim*2,decoder_hidden_dim)
    self.dec_dropout=nn.Dropout(self.dropout_p)
    self.dec_fc=nn.Linear((encoder_hidden_dim*2)+decoder_hidden_dim,output_dim)
  
  def forward(self,x):
    #encoder
    embed=self.embedding(x)
    # LSTM default layout is sequence-first.
    embed=embed.permute(1,0,2)
    enc_output,(enc_hidden,enc_cell_state)=self.enc_lstm(embed)
    enc_hidden=torch.cat((enc_hidden[0,:,:],enc_hidden[1,:,:]),dim=1)
    enc_hidden=self.enc_dropout(enc_hidden)
    enc_hidden=self.enc_fc(enc_hidden)
    enc_hidden=torch.tanh(enc_hidden)

    #attention
    enc_output=enc_output.permute(1,0,2)
    attn=self.attention(enc_output,enc_hidden)

    #decoder
    attn=attn.unsqueeze(1)
    weighted_scores=torch.bmm(attn,enc_output)
    weighted_scores=weighted_scores.permute(1,0,2)
    dec_h0=enc_hidden.unsqueeze(0)
    # The zero initial cell state must match the decoder hidden state's shape.
    # Shaping it from the encoder cell state only worked when
    # encoder_hidden_dim == decoder_hidden_dim.
    dec_c0=torch.zeros_like(dec_h0)
    dec_output,(dec_hidden,dec_cell_state)=self.dec_lstm(weighted_scores,(dec_h0,dec_c0))
    dec_output=dec_output.squeeze(0)
    weighted_scores=weighted_scores.squeeze(0)
    x=torch.cat((dec_output,weighted_scores),dim=1)
    x=self.dec_dropout(x)
    x=self.dec_fc(x)
    x=F.softmax(x,dim=1)
    return x

In [0]:
# Embedding table size: one row per vocabulary entry plus one extra row
# (vocabulary indices start at 1, so row 0 corresponds to the padding value).
input_dim=len(tweets_vocab)+1
# One output unit per distinct label.
output_dim=len(labels_vocab)
# Must match the width of the vectors in the GloVe pickle loaded later.
embedding_dim=50

In [0]:
# SECURITY: pickle.load can execute arbitrary code; only load a GloVe pickle
# that you created yourself or obtained from a trusted source.
with open('glove.twitter.27B.50d.pkl','rb') as f:
  embedding_dict=pickle.load(f)

# Start every row from a random vector, keep row 0 (the padding value) at zero,
# then overwrite the rows of tokens that have a pretrained vector.
embedding_weights=np.random.normal(scale=0.6,size=(input_dim,embedding_dim))
embedding_weights[0]=0.0
words_found=0
for word,idx in tweets_vocab.items():
  if word=='<unk>':
    # The unknown-token row keeps its random initialisation.
    continue
  try:
    # Write at the token's vocabulary index. The enumerate() position used
    # before was one lower than the index, shifting every vector by one row.
    embedding_weights[idx]=embedding_dict[word]
    words_found+=1
  except KeyError:
    pass
embedding_weights=torch.from_numpy(embedding_weights).to(device)

# model=RNNSentimentAnalysis(batch_size,output_dim,64,64,input_dim,embedding_dim,dropout_p=0.8)
model=LSTMSentimentAnalysis(batch_size,output_dim,128,128,input_dim,embedding_dim,dropout_p=0.8)
model.embedding.load_state_dict({'weight':embedding_weights})
print(model)
model.to(device)
total_params=sum(p.numel() for p in model.parameters() if p.requires_grad)
print('total_params:',total_params)

In [0]:
# Adam with its default hyper-parameters over all model parameters.
optimiser=torch.optim.Adam(model.parameters())
# Binary cross-entropy, applied by train()/evaluate() between the model's
# softmax outputs and one-hot targets.
criterion=nn.BCELoss()
epochs=4

In [0]:
# Per-epoch history, plotted in the next cell.
train_loss_list=[]
train_acc_list=[]
val_loss_list=[]
val_acc_list=[]
for epoch in range(epochs):
  # Fresh iterators each epoch; both are created before training starts.
  train_iterator=iter(train_loader)
  val_iterator=iter(val_loader)
  train_loss,train_acc=train(model,train_iterator,optimiser,criterion)
  val_loss,val_acc=evaluate(model,val_iterator,criterion)

  train_loss_list.append(train_loss)
  train_acc_list.append(train_acc)
  val_loss_list.append(val_loss)
  val_acc_list.append(val_acc)
  print('Epoch ',epoch+1,'/',epochs,' loss:',train_loss,' acc:',train_acc,' val_loss:',val_loss,' val_acc:',val_acc)

In [0]:
# Training vs. validation loss per epoch (x axis is the 0-based epoch index).
plt.figure()
plt.title('loss vs epochs')
plt.plot(train_loss_list,label='train')
plt.plot(val_loss_list,label='val')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend(loc='upper right')

# Training vs. validation accuracy per epoch.
plt.figure()
plt.title('acc vs epochs')
plt.plot(train_acc_list,label='train')
plt.plot(val_acc_list,label='val')
plt.xlabel('epochs')
plt.ylabel('acc')
plt.legend(loc='upper left')

In [0]:
# Pick the checkpoint file that matches the model class, then save the weights once.
checkpoint_path=(
  '/content/drive/My Drive/DL/assgn2/rnn.pt'
  if isinstance(model,RNNSentimentAnalysis)
  else '/content/drive/My Drive/DL/assgn2/lstm.pt'
)
torch.save(model.state_dict(),checkpoint_path)

In [0]:
# Rebuild the model and restore the saved weights onto `device`.
# model=RNNSentimentAnalysis(batch_size,output_dim,64,64,input_dim,embedding_dim,dropout_p=0.8)
model=LSTMSentimentAnalysis(batch_size,output_dim,128,128,input_dim,embedding_dim,dropout_p=0.8)
model.to(device)
# map_location lets a checkpoint saved on GPU be loaded on whichever device is in use.
if isinstance(model,RNNSentimentAnalysis):
  model.load_state_dict(torch.load('/content/drive/My Drive/DL/assgn2/rnn.pt',map_location=device))
else:
  model.load_state_dict(torch.load('/content/drive/My Drive/DL/assgn2/lstm.pt',map_location=device))

def _predict_labels(model,loader):
  """Return (predicted class indices, true class indices) for every sample in `loader`."""
  model.eval()
  pred_batches=[]
  gt_batches=[]
  with torch.no_grad():
    for batch in loader:
      # Arg-max over the class dimension, one whole batch at a time.
      pred_batches.append(torch.argmax(model(batch[0]),dim=1).cpu().numpy())
      gt_batches.append(batch[1].cpu().numpy())
  return np.concatenate(pred_batches),np.concatenate(gt_batches)

def _report_split(name,model,loader):
  """Print accuracy and confusion matrix for one split and draw its ROC plot."""
  preds,gt=_predict_labels(model,loader)
  print(int((preds==gt).sum())/len(preds))
  cm=confusion_matrix(gt,preds)
  print(name+' Confusion Matrix:')
  print(cm)
  roc(gt,preds)
  return preds,gt,cm

train_preds,train_gt,train_cm=_report_split('Train',model,train_loader)
print()

val_preds,val_gt,val_cm=_report_split('Validation',model,val_loader)

Test Code

In [0]:
# Read the test CSV from the current working directory and preview it.
test_data=pd.read_csv('test_data.csv')
test_data.head()

In [0]:
# Same column layout as the training file: second column is the text, first the label.
# NOTE: `tweets` and `labels` now refer to the test data; the training arrays
# are no longer reachable under these names.
tweets=test_data.iloc[:,1].values
labels=test_data.iloc[:,0].values
print(tweets.shape,labels.shape)
print(tweets)

In [0]:
# Remove every "@handle " mention (an @, optional word characters, then a space).
# A single substitution per tweet removes all mentions; the previous loop
# restarted from the original text for each match, so only the last distinct
# mention was actually dropped. A mention at the very end of a tweet has no
# trailing space and is left in place, as before.
mention_pattern=re.compile(r'@\w* ')
for i in range(len(tweets)):
  tweets[i]=mention_pattern.sub('',tweets[i])
print(tweets)

In [0]:
# Lower-case each tweet, then drop every character found in string.punctuation.
for idx,raw_tweet in enumerate(tweets):
  lowered=raw_tweet.lower()
  tweets[idx]=''.join(filter(lambda ch: ch not in punctuation,lowered))
print(tweets)

In [0]:
# Tokens missing from the training vocabulary all map to index len(tweets_vocab).
unknown_index=len(tweets_vocab)
tweets_tokenise=[
  [tweets_vocab.get(token,unknown_index) for token in text_row.split()]
  for text_row in tweets
]
print(tweets_tokenise[0:3])

In [0]:
# Translate each test label string into its integer class index
# (a label absent from labels_vocab raises KeyError).
labels_tokenise=np.array([labels_vocab[label] for label in labels])
print(labels_tokenise)
print(labels_tokenise.shape)

In [0]:
# Pad the test sequences to the same width (max_len) that was computed from the training tweets.
tweets_tokenise=pad_tweets(tweets_tokenise,max_len)
print(tweets_tokenise)
print(tweets_tokenise.shape)

In [0]:
# The whole test file is used for evaluation; no split is applied.
x_test=tweets_tokenise[:]
y_test=labels_tokenise[:]

print(x_test.shape,y_test.shape)

In [0]:
# Wrap the test arrays as an (input, label) dataset on `device`.
test_df=TensorDataset(torch.from_numpy(x_test).long().to(device),torch.from_numpy(y_test).long().to(device))
batch_size=50
# No shuffling: predictions must come out in file order, because the exported
# results assign IDs by position (1, 2, 3, ...).
test_loader=DataLoader(test_df,shuffle=False,batch_size=batch_size)

In [0]:
# Number of classes comes from the label vocabulary instead of a hard-coded 13.
n_classes=len(labels_vocab)
# model=RNNSentimentAnalysis(batch_size,n_classes,64,64,input_dim,embedding_dim,0.8)
model=LSTMSentimentAnalysis(batch_size,n_classes,128,128,input_dim,embedding_dim,0.8)
model.to(device)
# map_location lets a checkpoint saved on GPU be loaded on whichever device is in use.
if isinstance(model,RNNSentimentAnalysis):
  model.load_state_dict(torch.load('/content/drive/My Drive/DL/assgn2/rnn.pt',map_location=device))
else:
  model.load_state_dict(torch.load('/content/drive/My Drive/DL/assgn2/lstm.pt',map_location=device))

# Collect class probabilities and integer labels batch by batch.
prob_batches=[]
label_batches=[]
model.eval()
with torch.no_grad():
  for batch in test_loader:
    prob_batches.append(model(batch[0]).cpu().numpy())
    label_batches.append(batch[1].cpu().numpy())
test_preds=np.concatenate(prob_batches)
# One-hot ground truth, same layout as test_preds.
test_gt=np.eye(n_classes,dtype=np.float32)[np.concatenate(label_batches)]
test_acc=accuracy(torch.from_numpy(test_preds).float(),torch.from_numpy(test_gt).float())
print('Accuracy:',test_acc*100,'%')

# Hard class indices for the confusion matrix and ROC plot.
preds=np.argmax(test_preds,axis=1)
gt=np.argmax(test_gt,axis=1)
test_cm=confusion_matrix(gt,preds)
print('Test Confusion Matrix:')
print(test_cm)
roc(gt,preds)

# Invert label -> index once instead of scanning the vocabulary for every prediction.
index_to_label={v:k for k,v in labels_vocab.items()}
test_pred_labels=np.array([index_to_label[int(p)] for p in preds])
# IDs are 1-based positions in the order the loader produced the samples.
# NOTE: they only line up with the rows of test_data if test_loader iterates in dataset order.
id_arr=np.arange(1,preds.shape[0]+1,dtype=np.int64)

out_arr=np.column_stack((id_arr,test_pred_labels))
out_df=pd.DataFrame(data=out_arr,columns=['ID','Class'])
if isinstance(model,RNNSentimentAnalysis):
  out_df.to_csv('test_results_rnn.csv',index=False)
else:
  out_df.to_csv('test_results_lstm.csv',index=False)