In [18]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset from the given path
labeled_data_path = '../semi_supervised_data/labeled_data.csv'
df = pd.read_csv(labeled_data_path)

# Encode labels
label_encoder = LabelEncoder()
df['Stance'] = label_encoder.fit_transform(df['Stance'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df['Tweet'], df['Stance'], test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train).toarray()
X_test_vec = vectorizer.transform(X_test).toarray()

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_vec, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_vec, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Define the neural network
class StanceClassificationNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(StanceClassificationNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

input_dim = X_train_tensor.shape[1]
hidden_dim = 128
output_dim = len(label_encoder.classes_)

model = StanceClassificationNN(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
batch_size = 4

train_data = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        if i % 10 == 9:    # Print every 10 batches
            print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {running_loss / 10:.4f}')
            running_loss = 0.0

print("Training completed.")

# Test on the test set
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)
    accuracy = (predicted == y_test_tensor).sum().item() / y_test_tensor.size(0)
    print(f'Accuracy: {accuracy:.4f}')

# Convert predictions back to textual labels
predicted_labels = label_encoder.inverse_transform(predicted.numpy())

# Output the predictions for the test set
for text, prediction in zip(X_test, predicted_labels):
    print(f'Text: "{text}" is predicted as "{prediction}"')


Epoch [1/10], Step [10/163], Loss: 0.6929
Epoch [1/10], Step [20/163], Loss: 0.6923
Epoch [1/10], Step [30/163], Loss: 0.6909
Epoch [1/10], Step [40/163], Loss: 0.6893
Epoch [1/10], Step [50/163], Loss: 0.6933
Epoch [1/10], Step [60/163], Loss: 0.6859
Epoch [1/10], Step [70/163], Loss: 0.6853
Epoch [1/10], Step [80/163], Loss: 0.6836
Epoch [1/10], Step [90/163], Loss: 0.6860
Epoch [1/10], Step [100/163], Loss: 0.6717
Epoch [1/10], Step [110/163], Loss: 0.6589
Epoch [1/10], Step [120/163], Loss: 0.6896
Epoch [1/10], Step [130/163], Loss: 0.6753
Epoch [1/10], Step [140/163], Loss: 0.6637
Epoch [1/10], Step [150/163], Loss: 0.6643
Epoch [1/10], Step [160/163], Loss: 0.6527
Epoch [2/10], Step [10/163], Loss: 0.5569
Epoch [2/10], Step [20/163], Loss: 0.4958
Epoch [2/10], Step [30/163], Loss: 0.4980
Epoch [2/10], Step [40/163], Loss: 0.4479
Epoch [2/10], Step [50/163], Loss: 0.4392
Epoch [2/10], Step [60/163], Loss: 0.4692
Epoch [2/10], Step [70/163], Loss: 0.3343
Epoch [2/10], Step [80/163]