In [45]:
# Data wrangling
import pandas as pd
from sklearn.model_selection import train_test_split
# Reading the data 
d = pd.read_csv('Tweets.csv', header=None)

# Adding the columns 
d.columns = ['INDEX', 'GAME', "SENTIMENT", 'TEXT']

# Leaving only the positive and the negative sentiments 
d = d[d['SENTIMENT'].isin(['Positive', 'Negative'])]

# Encoding the sentiments that the negative will be 1 and the positive 0
d['SENTIMENT'] = d['SENTIMENT'].apply(lambda x: 0 if x == 'Positive' else 1)

# Dropping missing values
d = d.dropna()

In [46]:
def create_word_index(
    x: str, 
    shift_for_padding: bool = False, 
    char_level = False): 
    """
    Function that scans a given text and creates two dictionaries:
    - word2idx: dictionary mapping words to integers
    - idx2word: dictionary mapping integers to words

    Args:
        x (str): text to scan
        shift_for_padding (bool, optional): If True, the function will add 1 to all the indexes.
            This is done to reserve the 0 index for padding. Defaults to False.
        char_level (bool, optional): If True, the function will create a character level dictionary.
        
    Returns:
        Tuple[dict, dict]: word2idx and idx2word dictionaries
    """
    # Ensuring that the text is a string
    if not isinstance(x, str):
        try: 
            x = str(x)
        except:
            raise Exception('The text must be a string or a string convertible object')
        
    # Spliting the text into words
    words = []
    if char_level:
        # The list() function of a string will return a list of characters
        words = list(x)
    else:
        # Spliting the text into words by spaces
        words = x.split(' ')

    # Creating the word2idx dictionary 
    word2idx = {}
    for word in words: 
        if word not in word2idx: 
            # The len(word2idx) will always ensure that the 
            # new index is 1 + the length of the dictionary so far
            word2idx[word] = len(word2idx)

    # Adding the <UNK> token to the dictionary; This token will be used 
    # on new texts that were not seen during training.
    # It will have the last index. 
    word2idx['<UNK>'] = len(word2idx)

    if shift_for_padding:
        # Adding 1 to all the indexes; 
        # The 0 index will be reserved for padding
        word2idx = {k: v + 1 for k, v in word2idx.items()}

    # Reversing the above dictionary and creating the idx2word dictionary
    idx2word = {idx: word for word, idx in word2idx.items()}

    # Returns the dictionaries
    return word2idx, idx2word

In [47]:
# Spliting to train test 
train, test = train_test_split(d, test_size=0.2, random_state=42)

# Reseting the indexes 
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')


# Joining all the texts into one string
text = ' '.join(train['TEXT'].values)

# Creating the word2idx and idx2word dictionaries
word2idx, idx2word = create_word_index(text, shift_for_padding=True, char_level=True)

# Printing the size of the vocabulary
print(f'The size of the vocabulary is: {len(word2idx)}')

Train shape: (34410, 4)
Test shape: (8603, 4)
The size of the vocabulary is: 274


In [48]:
word2idx

{'I': 1,
 ' ': 2,
 'd': 3,
 'o': 4,
 'w': 5,
 'n': 6,
 'l': 7,
 'a': 8,
 'e': 9,
 'G': 10,
 'T': 11,
 'A': 12,
 '5': 13,
 'm': 14,
 'y': 15,
 'X': 16,
 'b': 17,
 'x': 18,
 'O': 19,
 '.': 20,
 'f': 21,
 'r': 22,
 'g': 23,
 't': 24,
 'h': 25,
 'i': 26,
 's': 27,
 'F': 28,
 '3': 29,
 'k': 30,
 '2': 31,
 'u': 32,
 'c': 33,
 'v': 34,
 'p': 35,
 'j': 36,
 '/': 37,
 '?': 38,
 '=': 39,
 '-': 40,
 'H': 41,
 'U': 42,
 '0': 43,
 '…': 44,
 'P': 45,
 'L': 46,
 'N': 47,
 'M': 48,
 'E': 49,
 ',': 50,
 '!': 51,
 'R': 52,
 'D': 53,
 'S': 54,
 'B': 55,
 '@': 56,
 '’': 57,
 '8': 58,
 '4': 59,
 'J': 60,
 'C': 61,
 '&': 62,
 'Я': 63,
 'с': 64,
 'М': 65,
 'о': 66,
 'я': 67,
 'к': 68,
 'м': 69,
 'п': 70,
 'а': 71,
 'н': 72,
 'и': 73,
 'Y': 74,
 '“': 75,
 '”': 76,
 'z': 77,
 "'": 78,
 ':': 79,
 'K': 80,
 'V': 81,
 '1': 82,
 '9': 83,
 ')': 84,
 '6': 85,
 'q': 86,
 '(': 87,
 'Z': 88,
 '"': 89,
 '_': 90,
 '7': 91,
 'W': 92,
 '–': 93,
 '🥺': 94,
 'Q': 95,
 '<': 96,
 '>': 97,
 '🤣': 98,
 '+': 99,
 '[': 100,
 ']': 10

In [23]:
train.head()

Unnamed: 0,INDEX,GAME,SENTIMENT,TEXT
0,5166,GrandTheftAuto(GTA),0,I downloaded GTA 5 on my new Xbox One. I forgo...
1,2785,Borderlands,1,For more then 3 weeks now 2k uk hasnt drawn a ...
2,1585,Battlefield,0,. PLATINUM NAME. .. running back through and ...
3,10484,RedDeadRedemption(RDR),0,Red Dead or Redemption is just about Super Mar...
4,3385,Facebook,0,Hey @Facebook this showed up on my feed recent...


In [49]:
# For each row in the train and test set, we will create a list of integers
# that will represent the words in the text
train['text_int'] = train['TEXT'].apply(lambda x: [word2idx.get(word, word2idx['<UNK>']) for word in list(x)])
test['text_int'] = test['TEXT'].apply(lambda x: [word2idx.get(word, word2idx['<UNK>']) for word in list(x)])

# Calculating the length of sequences in the train set 
train['seq_len'] = train['text_int'].apply(lambda x: len(x))

# Describing the length of the sequences
train['seq_len'].describe()

count    34410.000000
mean       103.600262
std         79.972798
min          1.000000
25%         41.000000
50%         83.000000
75%        148.000000
max        727.000000
Name: seq_len, dtype: float64

In [61]:
''.join(idx2word.get(character) for character in train['text_int'][0])


'I downloaded GTA 5 on my new Xbox One. I forgot the old it is lol.'

In [31]:
train.head()

Unnamed: 0,INDEX,GAME,SENTIMENT,TEXT,text_int,seq_len
0,5166,GrandTheftAuto(GTA),0,I downloaded GTA 5 on my new Xbox One. I forgo...,"[1, 2, 3, 4, 5, 6, 7, 4, 8, 3, 9, 3, 2, 10, 11...",66
1,2785,Borderlands,1,For more then 3 weeks now 2k uk hasnt drawn a ...,"[28, 4, 22, 2, 14, 4, 22, 9, 2, 24, 25, 9, 6, ...",230
2,1585,Battlefield,0,. PLATINUM NAME. .. running back through and ...,"[20, 2, 45, 46, 12, 11, 1, 47, 42, 48, 2, 47, ...",205
3,10484,RedDeadRedemption(RDR),0,Red Dead or Redemption is just about Super Mar...,"[52, 9, 3, 2, 53, 9, 8, 3, 2, 4, 22, 2, 52, 9,...",62
4,3385,Facebook,0,Hey @Facebook this showed up on my feed recent...,"[41, 9, 15, 2, 56, 28, 8, 33, 9, 17, 4, 4, 30,...",268


In [40]:
train.drop('text_int_size', inplace=True, axis=1)

In [41]:
train.columns

Index(['INDEX', 'GAME', 'SENTIMENT', 'TEXT', 'text_int', 'seq_len'], dtype='object')

In [38]:
def pad_sequences(x: list, pad_length: int) -> list:
    """
    Function that pads a given list of integers to a given length

    Args:
        x (list): list of integers to pad
        pad_length (int): length to pad

    Returns:
        list: padded list of integers
    """
    # Getting the length of the list
    len_x = len(x)

    # Checking if the length of the list is less than the pad_length
    if len_x < pad_length: 
        # Padding the list with 0s
        x = x + [0] * (pad_length - len_x)
    else: 
        # Truncating the list to the desired length
        x = x[:pad_length]

    # Returning the padded list
    return x

# Padding the train and test sequences 
train['text_int'] = train['text_int'].apply(lambda x: pad_sequences(x, 200))
test['text_int'] = test['text_int'].apply(lambda x: pad_sequences(x, 200))

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

In [42]:
class SentimentClassifier(torch.nn.Module):
    """
    Class that defines the sentiment classifier model
    """
    def __init__(self, vocab_size, embedding_dim):
        super(SentimentClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=1, batch_first=True)
        self.fc = nn.Linear(1, 1)  # Output with a single neuron for binary classification
        self.sigmoid = nn.Sigmoid()  # Sigmoid activation

    def forward(self, x):
        x = self.embedding(x)  # Embedding layer
        output, _ = self.lstm(x)  # RNN layer

        # Use the short term memory from the last time step as the representation of the sequence
        x = output[:, -1, :]

        # Fully connected layer with a single neuron
        x = self.fc(x) 
        
        # Converting to probabilities
        x = self.sigmoid(x)

        # Flattening the output
        x = x.squeeze()
        
        return x

# Initiating the model 
model = SentimentClassifier(vocab_size=len(word2idx), embedding_dim=16)

# Initiating the criterion and the optimizer
criterion = nn.BCELoss() # Binary cross entropy loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [43]:
from torch.utils.data import Dataset, DataLoader

class TextClassificationDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # The x is named as text_int and the y as airline_sentiment
        x = self.data.iloc[idx]['text_int']
        y = self.data.iloc[idx]['SENTIMENT']
        
        # Converting the x and y to torch tensors
        x = torch.tensor(x)
        y = torch.tensor(y)

        # Converting the y variable to float 
        y = y.float()

        # Returning the x and y
        return x, y
    
# Creating the train and test loaders
train_loader = DataLoader(TextClassificationDataset(train), batch_size=32, shuffle=True)
test_loader = DataLoader(TextClassificationDataset(test), batch_size=32, shuffle=True)

In [44]:
# Defining the number of epochs
epochs = 100

# Setting the model to train mode
model.train()

# Saving of the loss values
losses = []

# Iterating through epochs
for epoch in range(epochs):
    # Initiating the total loss 
    total_loss = 0

    for batch_idx, (inputs, labels) in enumerate(train_loader):
        # Zero the gradients
        optimizer.zero_grad()  # Zero the gradients
        outputs = model(inputs)  # Forward pass

        loss = criterion(outputs, labels)  # Compute the loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update the model's parameters

        # Adding the loss to the total loss
        total_loss += loss.item()

    # Calculating the average loss
    avg_loss = total_loss / len(train_loader)

    # Appending the loss to the list containing the losses
    losses.append(avg_loss)

    # Printing the loss every n epochs
    if epoch % 20 == 0:
        print(f'Epoch: {epoch}, Loss: {avg_loss}')

Epoch: 0, Loss: 0.6926363456204921
Epoch: 20, Loss: 0.6528273011007274
Epoch: 40, Loss: 0.6473784191623939
Epoch: 60, Loss: 0.6437770354260299
Epoch: 80, Loss: 0.6422728939625854
