## Import Dependencies

In [1]:
import torch
import torchvision # provide access to datasets, models, transforms, utils, etc
import torchvision.transforms as transforms

In [2]:
import numpy as np
import pandas as pd 
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
import nltk

In [3]:
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

## Data Preprocessing

### Load dataset

In [4]:
data=pd.read_csv('dataset/tweets.csv')

In [5]:
data.isnull().sum()

tweets    0
labels    0
dtype: int64

### Removing words from stopword list to consider certain words in final model

In [6]:
#get stopword list
sw_list=stopwords.words('english')

In [7]:
print(f"Not----->{'not' in sw_list}\nNo------>{'no' in sw_list}\nAgainst->{'against' in sw_list}")

Not----->True
No------>True
Against->True


In [8]:
sw_list.remove('not')
sw_list.remove('no')
sw_list.remove('against')

In [9]:
print(f"Not----->{'not' in sw_list}\nNo------>{'no' in sw_list}\nAgainst->{'against' in sw_list}")

Not----->False
No------>False
Against->False


### Functions to clean data

In [10]:
#Initialize lemmatizer
lemmatizer=WordNetLemmatizer()

In [11]:
#Get nltk tag for lemmatizing a given token
def nltk_wn_tag(nltk_tag):
  if nltk_tag.startswith('J'):
    return wordnet.ADJ
  elif nltk_tag.startswith('V'):
    return wordnet.VERB
  elif nltk_tag.startswith('N'):
    return wordnet.NOUN
  elif nltk_tag.startswith('R'):
    return wordnet.ADV
  else:                    
    return None

In [12]:
# Function to lemmatize a sentence
def lemmatize_sentence(sentence):
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    wordnet_tagged = map(lambda x: (x[0], nltk_wn_tag(x[1])), tagged)
    lemmatized_words=[]
    for word, tag in wordnet_tagged:
        if word not in sw_list:
            if tag is None:                        
                lemmatized_words.append(word)
            else:
                lemmatized_words.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_words)

In [13]:
def clean_tweets(tweet):
    #remove RT(retweet tag)
    tweet=re.sub(r"RT @","@",tweet)
    #remove pings
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    #remove URLs
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    #remove special characters
    tweet = re.sub(r"[^a-zA-Z]", ' ', tweet)
    #convert to lower case
    tweet=tweet.lower()
    #remove extra white spaces
    tweet = re.sub(r" +",' ', tweet)
    #lemmatization of tweet and removing stopwords
    lemmatized_tweet=lemmatize_sentence(tweet)
    #remove extra white spaces
    tweet = re.sub(r" +", ' ', tweet)
    return lemmatized_tweet.strip()

### Clean text data

In [14]:
data['tweets']=[clean_tweets(tweet) for tweet in data['tweets']]

### Removing strings left empty after cleaning

In [15]:
data.eq('').sum()

tweets    7
labels    0
dtype: int64

In [16]:
data.replace("",np.nan,inplace=True)
data.dropna(subset=['tweets'],inplace=True)
data.eq('').sum()

tweets    0
labels    0
dtype: int64

In [17]:
len(data)

1357

### Train test split

In [18]:
from sklearn.model_selection import train_test_split
X=data['tweets']
y=data['labels']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=7)

In [19]:
lengths=pd.DataFrame(len(tweet.split(' ')) for tweet in X_train)

In [20]:
test_length=pd.DataFrame(len(tweet.split(' ')) for tweet in X_test)

In [21]:
max_len=max(len(tweet.split(' ')) for tweet in X_train)

### TF-IDF Vectorization

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect=TfidfVectorizer()
X_train_tfidf=tfidf_vect.fit_transform(X_train).toarray()
X_test_tfidf=tfidf_vect.transform(X_test).toarray()
X_train_tfidf=pd.DataFrame(X_train_tfidf)
X_test_tfidf=pd.DataFrame(X_test_tfidf)

### Combined training data

In [23]:
train_data=pd.concat((X_train,y_train),axis=1,ignore_index=True)
train_data.columns=['tweets','labels']
train_data.reset_index(inplace=True,drop=True)
train_data=pd.concat((train_data,X_train_tfidf),axis=1)
train_data

Unnamed: 0,tweets,labels,0,1,2,3,4,5,6,7,...,2458,2459,2460,2461,2462,2463,2464,2465,2466,2467
0,obama admin cry tax increase applaud china low...,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,barack obama longboard package core truck mm b...,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,edshow whenever obama tell truth gop boo hoo h...,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,many foreign leader obama promise post electio...,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,obama signal us would accept iranian civilian ...,1,0.0,0.0,0.0,0.0,0.0,0.000000,0.254869,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1080,mean saving scotus tell world obama wrong aca ...,0,0.0,0.0,0.0,0.0,0.0,0.335548,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1081,obama sharpen kansas vision,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1082,genius man sing else really obama,1,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1083,mitt romney obama spend much time harvard also...,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Combine test data

In [24]:
test_data=pd.concat((X_test,y_test),axis=1,ignore_index=True)
test_data.columns=['tweets','labels']
test_data.reset_index(inplace=True,drop=True)
test_data=pd.concat((test_data,X_test_tfidf),axis=1)
test_data

Unnamed: 0,tweets,labels,0,1,2,3,4,5,6,7,...,2458,2459,2460,2461,2462,2463,2464,2465,2466,2467
0,harry style describe michelle obama como una m...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,amp si khady president obama get elect nyrell ...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,obama black american really need time whitey f...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,whatsromneyhiding throw baseball like man not ...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,realclearpolitics obama organizational advanta...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,romney attack obama barnstorm pennsylvania,0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
268,obama thug bully not sure call rush limbaugh bill,0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
269,michelle barack obama become like celebrity ta...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270,whatsromneyhiding nothing compare mt proof oba...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.310191,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Convert dataframes 

In [25]:
X_train_tensor=torch.tensor(X_train_tfidf.values).float()
X_test_tensor=torch.tensor(X_test_tfidf.values).float()

In [26]:
y_train_tensor=torch.tensor(y_train.values).float()
y_test_tensor=torch.tensor(y_test.values).float()

# Feed Forward Network

## Define and create model

### Get input shape for Feed forward network

In [27]:
INPUT_SHAPE=X_train_tensor.shape[1]

In [28]:
INPUT_SHAPE

2468

In [29]:
import torch.nn as nn
import torch.nn.functional as F

In [30]:
### Define FFN class

In [31]:
class FFN(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1=nn.Linear(in_features=INPUT_SHAPE,out_features=64)
        self.dropout=nn.Dropout(p=0.2)
        self.fc2=nn.Linear(in_features=64,out_features=32)
        self.out=nn.Linear(in_features=32,out_features=3)
    
    def forward(self,t):
        # Input layer
        t=self.fc1(t)
        t=F.relu(t)

        # Dropout layer
        t=self.dropout(t)

        # Hidden layer
        t=self.fc2(t)
        t=F.relu(t)

        # Output layer
        t=self.out(t)

        return t

In [32]:
import torch.optim as optim

In [33]:
def get_num_correct(preds, labels):
  return preds.argmax(dim=1).eq(labels).sum().item()

### Create FFN object and define RMSprop optimizer

In [34]:
# Create FFN object
network=FFN()
# Define optimizer
optimizer=optim.RMSprop(network.parameters(),lr=0.02)

## Train model for 20 epochs

In [35]:
for epoch in range(20):
    preds=network(X_train_tensor)
    loss=F.cross_entropy(preds,y_train_tensor.long())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    total_loss=loss.item()
    total_correct=get_num_correct(preds,y_train_tensor)
    print(
        "Epoch = ",epoch,
        "Loss = ",total_loss,
        "Accuracy = ",total_correct/len(X_train)
    )

Epoch =  0 Loss =  1.1387605667114258 Accuracy =  0.25253456221198156
Epoch =  1 Loss =  1.5284608602523804 Accuracy =  0.6838709677419355
Epoch =  2 Loss =  5.097789764404297 Accuracy =  0.25253456221198156
Epoch =  3 Loss =  0.9535832405090332 Accuracy =  0.6838709677419355
Epoch =  4 Loss =  0.9050535559654236 Accuracy =  0.6866359447004609
Epoch =  5 Loss =  0.8634213805198669 Accuracy =  0.687557603686636
Epoch =  6 Loss =  0.820000410079956 Accuracy =  0.6930875576036867
Epoch =  7 Loss =  0.7771167159080505 Accuracy =  0.6940092165898617
Epoch =  8 Loss =  0.7115213871002197 Accuracy =  0.6967741935483871
Epoch =  9 Loss =  0.6041943430900574 Accuracy =  0.7907834101382488
Epoch =  10 Loss =  0.5318186283111572 Accuracy =  0.8267281105990784
Epoch =  11 Loss =  0.4325166344642639 Accuracy =  0.8654377880184332
Epoch =  12 Loss =  0.39344322681427 Accuracy =  0.8866359447004608
Epoch =  13 Loss =  0.33991849422454834 Accuracy =  0.8903225806451613
Epoch =  14 Loss =  0.2951522171

## Evaluate model

### Set model to eval

In [36]:
network.eval()

FFN(
  (fc1): Linear(in_features=2468, out_features=64, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (out): Linear(in_features=32, out_features=3, bias=True)
)

### Get predictions on test data

In [37]:
with torch.no_grad():
    final_preds=network(X_test_tensor)

In [38]:
get_num_correct(final_preds,y_test_tensor)/len(final_preds)

0.8088235294117647

In [39]:
len(y_test_tensor)

272

In [40]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [41]:
final_pred_labels=final_preds.argmax(dim=1)

In [42]:
print(accuracy_score(y_test_tensor,final_pred_labels))

0.8088235294117647


In [43]:
confusion_matrix(y_test_tensor,final_pred_labels)

array([[150,  32,   0],
       [ 12,  66,   0],
       [  3,   5,   4]], dtype=int64)

# LSTM 

In [44]:
INPUT_SHAPE

2468

In [45]:
INPUT_SIZE=1
SEQUENCE_LENGTH=INPUT_SHAPE
NUM_LAYERS=2

In [46]:
INPUT_SHAPE

2468

## Define LSTM NN class

In [47]:
class LSTM(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm1=nn.LSTM(INPUT_SHAPE,50,NUM_LAYERS,dropout=0.2,batch_first=True)
        
        self.dropout=nn.Dropout(p=0.2)
        self.output=nn.Linear(in_features=50,out_features=3)
    
    def forward(self,t):
        # Input layer
        # Setting initial cell state to 0
        h0=torch.zeros(NUM_LAYERS,t.size(0),50)
        c0=torch.zeros(NUM_LAYERS,t.size(0),50)
        # Getting LSTM output
        out, _ = self.lstm1(t,(h0,c0))

        #Only considering final sequence
        out=out[:,-1,:]

        # Output layer
        out=self.output(out)

        return out

## Reshape input to LSTM

In [48]:
X_train_tfidf=X_train_tfidf.values.reshape(X_train_tfidf.shape[0],1,X_train_tfidf.shape[1])

In [49]:
X_train_tensor=torch.tensor(X_train_tfidf).float()

## Define optimizer (RMSprop) and train for 20 epochs

In [50]:
# Create FFN object
LSTM_net=LSTM()
# Define optimizer
optimizer2=optim.RMSprop(LSTM_net.parameters(),lr=0.02)
for epoch in range(20):
    preds=LSTM_net(X_train_tensor)
    loss=F.cross_entropy(preds,y_train_tensor.long())
    optimizer2.zero_grad()
    loss.backward()
    optimizer2.step()
    total_loss=loss.item()
    total_correct=get_num_correct(preds,y_train_tensor)
    print(
        "Epoch = ",epoch+1,
        "Loss = ",total_loss,
        "Accuracy = ",total_correct/len(X_train)
    )

Epoch =  1 Loss =  1.1335217952728271 Accuracy =  0.06359447004608294
Epoch =  2 Loss =  1.485162615776062 Accuracy =  0.6838709677419355
Epoch =  3 Loss =  6.094876766204834 Accuracy =  0.059907834101382486
Epoch =  4 Loss =  0.9940123558044434 Accuracy =  0.7889400921658987
Epoch =  5 Loss =  0.6427159905433655 Accuracy =  0.7677419354838709
Epoch =  6 Loss =  0.45841357111930847 Accuracy =  0.8525345622119815
Epoch =  7 Loss =  0.34178319573402405 Accuracy =  0.919815668202765
Epoch =  8 Loss =  0.2759753465652466 Accuracy =  0.9235023041474655
Epoch =  9 Loss =  0.2301580160856247 Accuracy =  0.9308755760368663
Epoch =  10 Loss =  0.19633929431438446 Accuracy =  0.9345622119815669
Epoch =  11 Loss =  0.17607669532299042 Accuracy =  0.9345622119815669
Epoch =  12 Loss =  0.1581566035747528 Accuracy =  0.9354838709677419
Epoch =  13 Loss =  0.1462983638048172 Accuracy =  0.9354838709677419
Epoch =  14 Loss =  0.13609574735164642 Accuracy =  0.9345622119815669
Epoch =  15 Loss =  0.12

In [51]:
X_test_tfidf=X_test_tfidf.values.reshape(X_test_tfidf.shape[0],1,X_test_tfidf.shape[1])
X_test_tensor=torch.tensor(X_test_tfidf).float()

## Evaluate LSTM model

In [52]:
LSTM_net.eval()

LSTM(
  (lstm1): LSTM(2468, 50, num_layers=2, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (output): Linear(in_features=50, out_features=3, bias=True)
)

In [53]:
with torch.no_grad():
    LSTM_preds=LSTM_net(X_test_tensor)

In [54]:
LSTM_preds=LSTM_preds.argmax(dim=1)

In [55]:
print(accuracy_score(y_test_tensor,LSTM_preds))

0.8419117647058824


In [56]:
confusion_matrix(y_test_tensor,final_pred_labels)

array([[150,  32,   0],
       [ 12,  66,   0],
       [  3,   5,   4]], dtype=int64)