In [26]:
import torch
import torch.nn as nn
from MyTransformer import Classifier
import pandas as pd
import numpy as np
from preprocessing import preprocess
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

In [27]:
# Hyperparameters shared by the cells below.
emb_size = 512    # embedding width; presumably meant to match the model's d_model -- confirm
max_tokens = 128  # tokens per sequence; sets the width of the random test batch below

In [28]:
# Build the classifier from the shared hyperparameters instead of repeating the
# literals 512 / 128, so the config cell stays the single source of truth.
# NOTE(review): vocab_size is hard-coded to the value printed by the vocabulary
# cell further down; it has to be updated by hand if the vocabulary changes.
model = Classifier(
    vocab_size=23174,
    d_model=emb_size,
    num_heads=8,
    num_layers=6,
    d_ff=2048,
    max_seq_length=max_tokens,
)
# Random integer ids in [0, 21000) with shape (16, max_tokens); not used by the
# cells visible here -- presumably kept for a quick forward-pass smoke test.
rand_input = torch.randint(0, 21000, (16, max_tokens))


In [29]:
def build_filter_vocab(data, min_count=5):
    """
    Count whitespace-separated words and keep the frequent ones.

    Parameters:
    - data: An iterable of strings (one sentence per element).
    - min_count: Minimum number of occurrences a word needs to be kept.

    Returns:
    - A dict mapping each kept word to its total count, in first-seen order.
    """
    counts = {}
    for text in data:
        for token in text.split():
            if token in counts:
                counts[token] += 1
            else:
                counts[token] = 1

    kept = {}
    for token, freq in counts.items():
        if freq >= min_count:
            kept[token] = freq
    return kept

def build_tokenized_vocab(vocab:dict):
    """
    Assign a contiguous integer id to every word, then to the special tokens.

    Words are numbered 0..n-1 in the iteration order of ``vocab``; '<UNK>' and
    '<PAD>' always receive the last two ids (n and n+1), so ``len(result)`` is
    a valid embedding-table size.

    If the corpus itself contains the literal words '<UNK>' or '<PAD>', they
    are not numbered as ordinary words. Numbering them first and then
    overwriting them would produce duplicate ids or leave a gap that pushes
    the largest id past ``len(result) - 1``.

    Parameters:
    - vocab: A dict whose keys are the words to index (values are ignored).

    Returns:
    - A dict mapping word -> id, including '<UNK>' and '<PAD>'.
    """
    special_tokens = ('<UNK>', '<PAD>')
    words = [word for word in vocab if word not in special_tokens]
    voc = {word: idx for idx, word in enumerate(words)}
    for token in special_tokens:
        voc[token] = len(voc)
    return voc

def pad_lists_in_df_column(df, column_name, desired_length, padding_value):
    """
    Right-pad the lists stored in one DataFrame column up to a minimum length.

    Lists that are already at least ``desired_length`` long are left as they
    are (they are not truncated).

    Parameters:
    - df: The DataFrame to process; it is not modified.
    - column_name: Name of the column whose cells are lists.
    - desired_length: Length that shorter lists are padded to.
    - padding_value: Value appended to shorter lists.

    Returns:
    - A copy of ``df`` in which the given column holds the padded lists.
    """
    def _pad(seq):
        missing = desired_length - len(seq)
        if missing > 0:
            return seq + [padding_value] * missing
        return seq

    # Work on a copy so the caller's frame keeps its original column.
    padded = df.copy()
    padded[column_name] = padded[column_name].apply(_pad)
    return padded

class DataFrameDataset(Dataset):
    """
    Dataset over a DataFrame whose last column is the label.

    All columns except the last are the features of a row. The frame is
    converted to tensors once, at construction time, so ``__getitem__`` is a
    plain tensor index rather than a pandas ``.iloc`` row lookup per sample.
    Changes made to the DataFrame after construction are therefore not
    reflected in the items returned.

    Parameters:
    - dataframe: The DataFrame to wrap; its values must be numeric.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe
        # Features are cast to integers and labels to floats, matching the
        # dtypes each item is returned with.
        self._features = torch.tensor(dataframe.iloc[:, :-1].to_numpy(), dtype=torch.int)
        self._labels = torch.tensor(dataframe.iloc[:, -1].to_numpy(), dtype=torch.float)

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx`` as (int tensor, float scalar tensor)."""
        return self._features[idx], self._labels[idx]

# Helper that wraps a DataFrame in a Dataset and returns a DataLoader over it
def create_dataloader_from_df(dataframe, batch_size=32, shuffle=True):
    """
    Build a DataLoader that iterates over the rows of a DataFrame.

    Parameters:
    - dataframe: The DataFrame to wrap in a DataFrameDataset.
    - batch_size: Number of rows per batch.
    - shuffle: Passed through to the DataLoader's ``shuffle`` argument.

    Returns:
    - A DataLoader over ``DataFrameDataset(dataframe)``.
    """
    return DataLoader(
        DataFrameDataset(dataframe),
        batch_size=batch_size,
        shuffle=shuffle,
    )

def train_model(model, dataloader, n_epochs, loss_fn=None, optimizer=None):
    """
    Train ``model`` for ``n_epochs`` passes over ``dataloader``.

    The model is moved to CUDA when available (otherwise CPU) and put in
    training mode. After every epoch the mean batch loss is printed.

    Parameters:
    - model: The module to train; called as ``model(inputs)``.
    - dataloader: Iterable yielding ``(inputs, labels)`` tensor pairs.
    - n_epochs: Number of passes over ``dataloader``.
    - loss_fn: Loss callable ``loss_fn(outputs, labels)``. Defaults to a new
      ``nn.BCEWithLogitsLoss()``.
    - optimizer: Optimizer to step. Defaults to a new Adam optimizer
      (lr=0.0001) over the parameters of the ``model`` passed in.

    Returns:
    - None.
    """
    # The defaults are created per call. As default-argument expressions they
    # were evaluated once at definition time, which bound the optimizer to
    # whatever global ``model`` existed then, not to the ``model`` argument.
    if loss_fn is None:
        loss_fn = nn.BCEWithLogitsLoss()

    # Check if CUDA is available and move the model to GPU if it is
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    model.train()  # Set the model to training mode
    for epoch in tqdm(range(n_epochs)):
        total_loss = 0.0
        num_batches = 0
        for inputs, labels in dataloader:
            # Move data to the same device as the model
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()  # Clear the gradients
            outputs = model(inputs)  # Forward pass
            # Reshape to the labels' shape instead of a bare squeeze(): a bare
            # squeeze() also drops the batch dimension when a batch holds a
            # single sample, and the loss then rejects the shape mismatch.
            loss = loss_fn(outputs.reshape(labels.shape), labels)
            loss.backward()  # Backward pass
            optimizer.step()  # Update model parameters
            total_loss += loss.item()
            num_batches += 1
        # Count batches ourselves so an empty or unsized iterable does not
        # raise when averaging.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch + 1}, Loss: {mean_loss}")

In [30]:
# Load the two preprocessed tweet files and tag each frame with its class.
df_neg = pd.read_csv('preprocessed_neg.csv')
df_pos = pd.read_csv('preprocessed_pos.csv')
df_neg['label'] = 0
df_pos['label'] = 1

# Pull out the text column of each class, then stack negatives before positives.
neg_tweets = df_neg['hashtags_written_out'].values
pos_tweets = df_pos['hashtags_written_out'].values
all_tweets = np.concatenate([neg_tweets, pos_tweets])

In [31]:
def _encode_tweets(tweets, token_ids):
    """Replace every whitespace-separated word of each tweet by its id ('<UNK>' id if unknown)."""
    unk_id = token_ids['<UNK>']
    return [[token_ids.get(word, unk_id) for word in tweet.split()] for tweet in tweets]


# Keep only words that occur at least 4 times across both classes, then give
# every kept word (plus the special tokens) an integer id.
vocab = build_filter_vocab(all_tweets, 4)
tokenized_vocab = build_tokenized_vocab(vocab)
print('vocab size:', len(tokenized_vocab))

# Both classes are encoded by the same helper so they cannot drift apart.
token_tweets_neg = _encode_tweets(neg_tweets, tokenized_vocab)
token_tweets_pos = _encode_tweets(pos_tweets, tokenized_vocab)


vocab size: 23174


In [32]:
# One row per tweet, one column per token position; rows shorter than the
# longest tweet are filled with NaN by pandas.
df_token_neg = pd.DataFrame(token_tweets_neg)
df_token_neg['label'] = 0
df_token_pos = pd.DataFrame(token_tweets_pos)
df_token_pos['label'] = 1

# Stack both classes and replace the NaN holes with the '<PAD>' id.
df_token = pd.concat([df_token_neg, df_token_pos]).fillna(tokenized_vocab['<PAD>'])

# Move 'label' to the last column, which is where DataFrameDataset reads it.
cols = [col for col in df_token.columns if col != 'label']
cols.append('label')
df_token = df_token[cols]
print(df_token)
dataloader = create_dataloader_from_df(df_token, batch_size=32, shuffle=True)


           0        1        2        3       4       5       6     7  \
0      23172  23172.0      0.0      1.0     2.0     0.0     3.0   0.0   
1         17     18.0     19.0     20.0    21.0    22.0    23.0  23.0   
2          0     27.0      0.0     28.0    29.0    30.0     9.0  31.0   
3         41     18.0     42.0     43.0    44.0    45.0    46.0  18.0   
4         41     41.0     41.0     18.0    50.0    45.0    51.0  52.0   
...      ...      ...      ...      ...     ...     ...     ...   ...   
99995     41    851.0  20693.0    143.0   128.0   129.0   202.0  85.0   
99996     41  11251.0      0.0     15.0     0.0     7.0     0.0  96.0   
99997     41     41.0   7964.0  23172.0    15.0    15.0    15.0  18.0   
99998     41     18.0     47.0     48.0  2207.0   215.0    22.0  23.0   
99999     18    338.0  11439.0    143.0     9.0  1870.0  5694.0  68.0   

             8        9  ...       58       59       60       61       62  \
0          4.0      5.0  ...  23173.0  23173.0

In [33]:
# Train for 10 epochs with the default loss and optimizer.
train_model(model, dataloader, n_epochs=10)

  0%|          | 0/10 [00:00<?, ?it/s]