# Creating a Neural Network (MLP) to Classify Tweets

In [12]:
import torch
from torch import nn, optim
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from pandas import DataFrame

In [13]:
class TweetClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(TweetClassifier, self).__init__()
        self.hidden = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.hidden2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.hidden(x))
        out = self.sigmoid(self.hidden2(x))
        return out

In [14]:
args = {
    'input_size': 300,
    'hidden_size': 600,
    'output_size': 1,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'epochs': 100,
    'batch_size': 64,
    'learning_rate': 0.001
}

In [15]:
args['device']

'cuda'

## Transformations To Pass To MLP

In [16]:
from pandas import DataFrame
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import SnowballStemmer

def drop_na(df: DataFrame) -> None:
    """
    Drop rows that have NA values.
    Args:
        df: DataFrame containing data to be dropped.

    Returns:

    """
    df.dropna(inplace=True)


def to_lower(df: DataFrame, column_text: str) -> None:
    """
    Convert text to lowercase.
    Args:
        df: DataFrame containing text to lowercase.
        column_text: Column containing text to lowercase.

    Returns:

    """
    words_processed = []
    for tweet in df[column_text]:
        words_processed.append(tweet.lower())
    df[column_text] = words_processed


def filter_label(df: DataFrame, column_label: str) -> None:
    """
    Filter out labels from a dataframe column.
    Args:
        df: DataFrame containing column to filter.
        column_label: Column containing label to filter.

    Returns:

    """
    query = (df[column_label] == 'Positive') | (df[column_label] == 'Negative')
    df.drop(df[~query].index, inplace=True)


def map_label(df: DataFrame, column_label: str) -> None:
    """
    Map labels from a dataframe column.
    Args:
        df: DataFrame containing column to map.
        column_label: Column containing label to map.

    Returns:

    """
    df[column_label] = df[column_label].map({'Positive': 1, 'Negative': 0})


def tokenize_column(df: DataFrame, tokenizer: WordPunctTokenizer, column_text: str, column_tokens: str) -> None:
    """
    Tokenize a column using a tokenizer.
    Args:
        df: DataFrame containing column to tokenize.
        tokenizer: Tokenizer object.
        column_text: Column containing text to tokenize.
        column_tokens: Column to insert tokens in.

    Returns:

    """
    df[column_tokens] = df[column_text].apply(tokenizer.tokenize)


def remove_stopwords(df: DataFrame, column_tokens: str, column_clean_text: str, stop_words: list) -> None:
    """
    Remove stopwords from a dataframe column.
    Args:
        df: DataFrame containing column to remove stopwords from.
        column_tokens: Column containing tokens to remove stopwords from.
        column_clean_text: Column containing text cleaned.
        stop_words: List of stopwords to remove.

    Returns:

    """
    words_processed = []
    for tweet in df[column_tokens]:
        clean_tokens = [word for word in tweet if word not in stop_words]
        words_processed.append(clean_tokens)
    df[column_tokens] = words_processed
    df[column_clean_text] = [' '.join(tokens) for tokens in df[column_tokens]]


def remove_punctuation_tokens(df: DataFrame, column_tokens: str, column_clean_text: str) -> None:
    """
    Remove punctuation tokens from a dataframe column.
    Args:
        df: DataFrame containing column to remove punctuation tokens from.
        column_tokens: Column containing tokens to remove punctuation from.
        column_clean_text: Column containing text cleaned.

    Returns:

    """
    words_processed = []
    for tweet in df[column_tokens]:
        clean_tokens = [word for word in tweet if word.isalpha()]
        words_processed.append(clean_tokens)
    df[column_tokens] = words_processed
    df[column_clean_text] = [' '.join(tokens) for tokens in df[column_tokens]]


def stemming_tokens(df: DataFrame, stemmer: SnowballStemmer, column_tokens: str, column_clean_text: str) -> None:
    """
    Stemming tokens from a dataframe column.
    Args:
        df: DataFrame containing column to apply stemming.
        stemmer: Stemmer object.
        column_tokens: Column containing tokens to apply stemming.
        column_clean_text: Column to insert text cleaned.

    Returns:

    """
    tokens_stemmed = []
    for tokens in df[column_tokens]:
        tokens = [stemmer.stem(token) for token in tokens]
        tokens_stemmed.append(tokens)

    df[column_tokens] = tokens_stemmed
    df[column_clean_text] = [' '.join(tokens) for tokens in df[column_tokens]]


def process_tweets(df: DataFrame, tokenizer: WordPunctTokenizer, stemmer: SnowballStemmer, stop_words: list, column_text: str, column_clean_text: str, column_tokens: str, column_label: str) -> None:
    """
    Apply transformations to a dataframe.
    Args:
        df: DataFrame containing column to transform.
        tokenizer: Tokenizer object.
        stemmer: Stemmer object.
        stop_words: Stop words to remove.
        column_text: Column containing text to tokenize.
        column_clean_text: Column containing text cleaned.
        column_tokens: Column to insert tokens in.
        column_label: Column containing label.

    Returns:

    """
    drop_na(df)
    to_lower(df, column_text)
    filter_label(df, column_label)
    map_label(df, column_label)
    tokenize_column(df, tokenizer, column_text,column_tokens)
    remove_stopwords(df, column_tokens, column_clean_text, stop_words)
    remove_punctuation_tokens(df, column_tokens, column_clean_text)
    stemming_tokens(df, stemmer, column_tokens, column_clean_text)


def vectorize_data(df: DataFrame, vectorizer: TfidfVectorizer, column_text, is_train: bool = False) -> list:
    """
    Vectorize a dataframe column.
    Args:
        df: DataFrame containing column to vectorize.
        vectorizer: Vectorizer object.
        column_text: Column containing text to vectorize.
        is_train: Variable to control how vectorizing is fited (if data is to train or test).

    Returns:

    """
    return vectorizer.fit_transform(df[column_text]).toarray().tolist() if is_train else vectorizer.transform(df[column_text]).toarray().tolist()

def drop_not_text_or_label(df: DataFrame, column_text: str, column_label: str) -> DataFrame:
    return df.loc[:, [column_text, column_label]]

def rename_columns(df: DataFrame, column_text: str, column_label: str, column_text_new: str, column_label_new: str) -> DataFrame:
    df.rename(columns={column_text: column_text_new, column_label: column_label_new}, inplace=True)

def to_Tensor(df: DataFrame, column_text: str, column_label: str):
    text_tensor = []
    label_tensor = []
    for text_vec, label in zip(df[column_text].values, df[column_label].values):
        text_vec = torch.tensor(text_vec).float()
        label = torch.tensor(label).float()
        text_tensor.append(text_vec)
        label_tensor.append([label])
    df[column_text] = text_tensor
    df[column_label] = label_tensor
    x = torch.stack(df[column_text].tolist())
    label = torch.tensor(df[column_label].tolist())
    return x, label

## Tranformer Function

In [17]:
def transformer(df: DataFrame, tokenizer: WordPunctTokenizer,stemmer: SnowballStemmer, vectorizer: TfidfVectorizer, stop_words: list, column_text: str, column_clean_text: str, column_tokens: str, column_label: str, is_train: bool):
    process_tweets(df, tokenizer, stemmer, stop_words, column_text, column_clean_text, column_tokens, column_label)
    drop_not_text_or_label(df, column_clean_text, column_label)
    rename_columns(df, column_clean_text, column_label, 'text', 'label')
    df['text'] = vectorize_data(df, vectorizer, 'text', is_train=is_train)
    x, y = to_Tensor(df, 'text', 'label')
    return x, y

## Init Tokenizer, Stemmer e StopWords

In [18]:
tokenizer = WordPunctTokenizer()
stemmer = SnowballStemmer('english')
stop_words_en = stopwords.words('english')

In [19]:
data = pd.read_csv('data/twitter_training.csv', names=['id', 'entity', 'sentiment', 'tweet'])
process_tweets(data, tokenizer, stemmer, stop_words_en, 'tweet', 'tweet_clean', 'tweet_tokens', 'sentiment')
data

Unnamed: 0,id,entity,sentiment,tweet,tweet_tokens,tweet_clean
0,2401,Borderlands,1,im getting on borderlands and i will murder yo...,"[im, get, borderland, murder]",im get borderland murder
1,2401,Borderlands,1,i am coming to the borders and i will kill you...,"[come, border, kill]",come border kill
2,2401,Borderlands,1,im getting on borderlands and i will kill you ...,"[im, get, borderland, kill]",im get borderland kill
3,2401,Borderlands,1,im coming on borderlands and i will murder you...,"[im, come, borderland, murder]",im come borderland murder
4,2401,Borderlands,1,im getting on borderlands 2 and i will murder ...,"[im, get, borderland, murder]",im get borderland murder
...,...,...,...,...,...,...
74677,9200,Nvidia,1,just realized that the windows partition of my...,"[realiz, window, partit, mac, like, year, behi...",realiz window partit mac like year behind nvid...
74678,9200,Nvidia,1,just realized that my mac window partition is ...,"[realiz, mac, window, partit, year, behind, nv...",realiz mac window partit year behind nvidia dr...
74679,9200,Nvidia,1,just realized the windows partition of my mac ...,"[realiz, window, partit, mac, year, behind, nv...",realiz window partit mac year behind nvidia dr...
74680,9200,Nvidia,1,just realized between the windows partition of...,"[realiz, window, partit, mac, like, year, behi...",realiz window partit mac like year behind nvid...


In [20]:
data = drop_not_text_or_label(data, 'tweet_clean', 'sentiment')
data

Unnamed: 0,tweet_clean,sentiment
0,im get borderland murder,1
1,come border kill,1
2,im get borderland kill,1
3,im come borderland murder,1
4,im get borderland murder,1
...,...,...
74677,realiz window partit mac like year behind nvid...,1
74678,realiz mac window partit year behind nvidia dr...,1
74679,realiz window partit mac year behind nvidia dr...,1
74680,realiz window partit mac like year behind nvid...,1


In [21]:
rename_columns(data, 'tweet_clean', 'sentiment', 'text', 'label')
data

Unnamed: 0,text,label
0,im get borderland murder,1
1,come border kill,1
2,im get borderland kill,1
3,im come borderland murder,1
4,im get borderland murder,1
...,...,...
74677,realiz window partit mac like year behind nvid...,1
74678,realiz mac window partit year behind nvidia dr...,1
74679,realiz window partit mac year behind nvidia dr...,1
74680,realiz window partit mac like year behind nvid...,1


In [22]:
vectorizer = TfidfVectorizer(lowercase=False, max_features=args['input_size'], ngram_range=(1, 2))
data['text'] = vectorize_data(data, vectorizer, 'text', is_train=True)
data

Unnamed: 0,text,label
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
...,...,...
74677,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
74678,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
74679,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
74680,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1


In [23]:
x, label = to_Tensor(data, 'text', 'label')

## Creating TweetDataset

In [24]:
from torch.utils.data import Dataset


class TweetDataset(Dataset):
    def __init__(self, csv_file, transform, tokenizer: WordPunctTokenizer, stemmer: SnowballStemmer,
                 vectorizer: TfidfVectorizer, stop_words: list, column_text: str, column_clean_text: str,
                 column_tokens: str, column_label: str, is_train=True):
        self.data = pd.read_csv(csv_file, names=['id', 'entity', 'sentiment', 'tweet'])
        self.labels = self.data[column_label].values
        self.transform = transform

        self.data, self.labels = transform(self.data, tokenizer, stemmer, vectorizer, stop_words, column_text,
                                           column_clean_text, column_tokens, column_label, is_train)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [25]:
csv_file = 'data/twitter_training.csv'
tokenizer = WordPunctTokenizer()
stemmer = SnowballStemmer('english')
vectorizer = TfidfVectorizer(lowercase=False, max_features=args['input_size'], ngram_range=(1, 2))
stop_words_en = stopwords.words('english')
column_text, column_clean_text, column_tokens, column_label = 'tweet', 'tweet_clean', 'tweet_tokens', 'sentiment'

In [26]:
dataset_train = TweetDataset(csv_file, transformer, tokenizer, stemmer, vectorizer, stop_words_en, column_text,
                             column_clean_text, column_tokens, column_label, is_train=True)

In [27]:
csv_file = 'data/twitter_validation.csv'
dataset_test = TweetDataset(csv_file, transformer, tokenizer, stemmer, vectorizer, stop_words_en, column_text,
                            column_clean_text, column_tokens, column_label, is_train=False)

In [28]:
dataset_test[0]

(tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0

## Testing if model is working

In [29]:
model = TweetClassifier(args['input_size'], args['hidden_size'], args['output_size']).to(args['device'])
pred = model(dataset_train[:10][0].to(args['device']))
print(pred)

tensor([[0.5014],
        [0.5068],
        [0.5041],
        [0.5015],
        [0.5014],
        [0.5014],
        [0.5098],
        [0.5079],
        [0.5048],
        [0.5115]], device='cuda:0', grad_fn=<SigmoidBackward0>)


## Creating DataLoader to use batch

In [30]:
from torch.utils.data import DataLoader

dataloader_train = DataLoader(dataset_train, batch_size=args['batch_size'], shuffle=True)
dataloader_test = DataLoader(dataset_test, batch_size=args['batch_size'], shuffle=True)

## Training Workflow

In [31]:
## Training Worklflow
def train(model, dataloader_train, optimizer, loss_fn, epochs):
    model.train()
    for epoch in range(epochs):
        print(f'========== Epoch {epoch + 1} ==========')
        for batch, (X, y) in enumerate(dataloader_train):
            X, y = X.to(args['device']), y.to(args['device'])
            # Forward
            optimizer.zero_grad()
            y_pred = model(X)
            loss = loss_fn(y_pred, y)

            # Backpropagation
            loss.backward()
            optimizer.step()

            print(f'Loss Batch {batch}: {loss.item()}')

## Evaluate Workflow

In [32]:
from sklearn.metrics import accuracy_score

def evaluate(model, dataloader_test):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for features, labels in dataloader_test:
            features, labels = features.to(args['device']), labels.to(args['device'])
            outputs = model(features)
            predicted =  (outputs >= 0.5).int()
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            break
    acc = accuracy_score(all_labels, all_preds)
    print(f"Acurácia no teste: {acc * 100:.2f}%")

## Training Model

In [33]:
optimizer = optim.Adam(model.parameters(), lr=args['learning_rate'])
lss_fn = nn.BCELoss()
train(model, dataloader_train, optimizer, lss_fn, args['epochs'])

Loss Batch 0: 0.6954160928726196
Loss Batch 1: 0.6925884485244751
Loss Batch 2: 0.6906206607818604
Loss Batch 3: 0.6946675181388855
Loss Batch 4: 0.6921106576919556
Loss Batch 5: 0.6900946497917175
Loss Batch 6: 0.6906937956809998
Loss Batch 7: 0.6883269548416138
Loss Batch 8: 0.6878341436386108
Loss Batch 9: 0.6890479922294617
Loss Batch 10: 0.6889387369155884
Loss Batch 11: 0.6874781847000122
Loss Batch 12: 0.6756125688552856
Loss Batch 13: 0.679186224937439
Loss Batch 14: 0.6842572093009949
Loss Batch 15: 0.6765538454055786
Loss Batch 16: 0.6796053051948547
Loss Batch 17: 0.6782077550888062
Loss Batch 18: 0.6857430338859558
Loss Batch 19: 0.6757608652114868
Loss Batch 20: 0.672247052192688
Loss Batch 21: 0.6818002462387085
Loss Batch 22: 0.6696769595146179
Loss Batch 23: 0.6746221780776978
Loss Batch 24: 0.6613405346870422
Loss Batch 25: 0.6726464629173279
Loss Batch 26: 0.6730334758758545
Loss Batch 27: 0.6581214666366577
Loss Batch 28: 0.6642072796821594
Loss Batch 29: 0.645442843

## Evaluate Model

In [34]:
evaluate(model, dataloader_test)

Acurácia no teste: 96.88%
