In [116]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

In [117]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [118]:
# loading train data
data = pd.read_csv("train_data.csv")

# dividing data into review and rating
reviews = data['review'].values
ratings = data['rating'].values

# dividing data into train and validation set
train_reviews, val_reviews, train_ratings, val_ratings = train_test_split(reviews, ratings, test_size=0.2)

In [119]:
# loading test data
test_data = pd.read_csv("test_data.csv")

test_reviews = test_data['review'].values

In [120]:
# prepering Bag of Words reprezentation
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_reviews)
# print(train_features)
val_features = vectorizer.transform(val_reviews)
# print(val_features)
test_features = vectorizer.transform(test_reviews)

# Przygotowanie reprezentacji TF-IDF
# vectorizer = TfidfVectorizer()
# train_features = vectorizer.fit_transform(train_reviews)
# val_features = vectorizer.transform(test_reviews)

In [121]:
# classes conversion to pytorch tensors
train_labels = torch.tensor(train_ratings - 1 + 1, device=device)
val_labels = torch.tensor(val_ratings - 1 + 1, device=device)

In [122]:
# calculating weight for each class
class_counts = data['rating'].value_counts().sort_index().values
print(class_counts)
class_weights = 1.0 / torch.tensor(class_counts, dtype=torch.float, device=device)
print(class_weights)

[1137 1434 1747 4831 7243]
tensor([0.0009, 0.0007, 0.0006, 0.0002, 0.0001])


In [123]:
# class HotelClassifier(nn.Module):
#     def __init__(self, input_size, hidden_size, num_classes):
#         super(HotelClassifier, self).__init__()
#         self.fc1 = nn.Linear(input_size, hidden_size)
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(hidden_size, num_classes)

#     def forward(self, x):
#         out = self.fc1(x)
#         out = self.relu(out)
#         out = self.fc2(out)
#         return out

In [124]:
class HotelClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(HotelClassifier, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        out = self.fc(output[:, -1, :])
        return out

In [125]:
# model parameters
input_size = train_features.shape[1]
hidden_size = 2
num_classes = 5

model = HotelClassifier(input_size, hidden_size, num_classes).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [127]:
# training
num_epochs = 10
batch_size = 16

for epoch in range(num_epochs):
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    for i in range(0, train_features.shape[0], batch_size):
        batch_features = train_features[i:i + batch_size]
        batch_labels = train_labels[i:i + batch_size]

        batch_features = torch.tensor(batch_features.toarray(), device=device).long()
        batch_labels = torch.tensor(batch_labels, device=device).long()

        optimizer.zero_grad()
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        # print(predicted)
        total_correct += (predicted == batch_labels).sum().item()
        total_samples += batch_labels.size(0)

    train_loss = total_loss / (train_features.shape[0] // batch_size)
    train_accuracy = total_correct / total_samples

    print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {train_loss}, Train Accuracy: {train_accuracy}')

save_path = "model_weights.pth"
torch.save(model.state_dict(), save_path)


  batch_labels = torch.tensor(batch_labels, device=device).long()


Epoch: 1/10, Loss: 1.6458550650849302, Train Accuracy: 0.15526576679630902


In [None]:
# model.load_state_dict(torch.load("model_weights.pth"))
# print("Trained model with class weighing:")

batch_size = 16

# Validating
with torch.no_grad():
    val_accuracy = 0.0
    val_total_samples = 0

    for i in range(0, val_features.shape[0], batch_size):
        batch_features = val_features[i:i + batch_size]
        batch_labels = val_labels[i:i + batch_size]

        batch_features_tensor = torch.tensor(batch_features.toarray(), device=device).long()
        batch_labels_tensor = torch.tensor(batch_labels, device=device).long()

        val_outputs = model(batch_features_tensor)
        _, val_predicted = torch.max(val_outputs, 1)

        # Compute batch accuracy
        val_batch_accuracy = (val_predicted == batch_labels_tensor).sum().item()
        val_accuracy += val_batch_accuracy
        val_total_samples += batch_labels_tensor.size(0)

    val_accuracy /= val_total_samples

print(f'Validation Accuracy: {val_accuracy}')


  batch_labels_tensor = torch.tensor(batch_labels, device=device).long()


KeyboardInterrupt: 

In [None]:
model.load_state_dict(torch.load("model_weights.pth"))

# generating predictions on test data
with torch.no_grad():
    test_predicted = []
    for i in range(0, test_features.shape[0], batch_size):
        batch_features = test_features[i:i + batch_size]
        batch_features_tensor = torch.tensor(batch_features.toarray(), device=device).long()
        test_outputs = model(batch_features_tensor)
        _, batch_predicted = torch.max(test_outputs, 1)

        test_predicted.extend(batch_predicted.cpu().numpy())

with open("poniedzialek_grunwald_rozkosz.csv", "a") as results_file:
    for prediction in test_predicted:
        results_file.write(str(prediction.item()) + "\n")
