In [135]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

In [136]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [137]:
# loading train data
data = pd.read_csv("/kaggle/input/hotelsreviews/train_data.csv")

# dividing data into review and rating
reviews = data['review'].values
ratings = data['rating'].values

# dividing data into train and validation set
train_reviews, val_reviews, train_ratings, val_ratings = train_test_split(reviews, ratings, test_size=0.2)

In [138]:
# loading test data
test_data = pd.read_csv("/kaggle/input/hotelsreviews/test_data.csv")

test_reviews = test_data['review'].values

In [139]:
# prepering Bag of Words reprezentation
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_reviews)
# print(train_features)
val_features = vectorizer.transform(val_reviews)
# print(val_features)
test_features = vectorizer.transform(test_reviews)

# Przygotowanie reprezentacji TF-IDF
# vectorizer = TfidfVectorizer()
# train_features = vectorizer.fit_transform(train_reviews)
# val_features = vectorizer.transform(test_reviews)

In [140]:
# classes conversion to pytorch tensors
train_labels = torch.tensor(train_ratings - 1 + 1, device=device)
val_labels = torch.tensor(val_ratings - 1 + 1, device=device)

In [141]:
# calculating weight for each class
class_counts = data['rating'].value_counts().sort_index().values
print(class_counts)
class_weights = 1.0 / torch.tensor(class_counts, dtype=torch.float, device=device)
print(class_weights)

[1137 1434 1747 4831 7243]
tensor([0.0009, 0.0007, 0.0006, 0.0002, 0.0001], device='cuda:0')


In [142]:
class HotelClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(HotelClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 2*hidden_size)
        self.fc3 = nn.Linear(2*hidden_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc4 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        out = self.relu(out)
        out = self.fc4(out)
        return out

In [143]:
# gorsze wyniki, praktycznie ciagle najczestsza klase wybiera

# class HotelClassifierLSTM(nn.Module):
#     def __init__(self, input_size, hidden_size, num_classes):
#         super(HotelClassifier, self).__init__()
#         self.embedding = nn.Embedding(input_size, hidden_size)
#         self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
#         self.fc = nn.Linear(hidden_size, num_classes)

#     def forward(self, x):
#         embedded = self.embedding(x)
#         output, _ = self.lstm(embedded)
#         out = self.fc(output[:, -1, :])
#         return out

In [144]:
# model parameters
input_size = train_features.shape[1]
hidden_size = 128
num_classes = 5

model = HotelClassifier(input_size, hidden_size, num_classes).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [145]:
# training
num_epochs = 20
batch_size = 64

for epoch in range(num_epochs):
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    for i in range(0, train_features.shape[0], batch_size):
        batch_features = train_features[i:i + batch_size]
        batch_labels = train_labels[i:i + batch_size]

        batch_features = torch.tensor(batch_features.toarray(), device=device).float()
        batch_labels = torch.tensor(batch_labels, device=device).long()

        optimizer.zero_grad()
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        # print(predicted)
        total_correct += (predicted == batch_labels).sum().item()
        total_samples += batch_labels.size(0)

    train_loss = total_loss / (train_features.shape[0] // batch_size)
    train_accuracy = total_correct / total_samples

    print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {train_loss}, Train Accuracy: {train_accuracy}')

save_path = "model_weights.pth"
torch.save(model.state_dict(), save_path)


  batch_labels = torch.tensor(batch_labels, device=device).long()


Epoch: 1/20, Loss: 1.1352931605834586, Train Accuracy: 0.5165103332570732
Epoch: 2/20, Loss: 0.6315300993475259, Train Accuracy: 0.723938076717761
Epoch: 3/20, Loss: 0.35589644911826823, Train Accuracy: 0.8398535804163807
Epoch: 4/20, Loss: 0.24450972227050977, Train Accuracy: 0.8945321436742164
Epoch: 5/20, Loss: 0.12269740929717526, Train Accuracy: 0.9400594829558453
Epoch: 6/20, Loss: 0.06593757367455493, Train Accuracy: 0.9656829100892245
Epoch: 7/20, Loss: 0.03082298278562067, Train Accuracy: 0.9826889346450087
Epoch: 8/20, Loss: 0.01169166650580854, Train Accuracy: 0.9930603218180432
Epoch: 9/20, Loss: 0.003007093564020561, Train Accuracy: 0.9986273164035689
Epoch: 10/20, Loss: 0.0007916843515333455, Train Accuracy: 0.9999237398001983
Epoch: 11/20, Loss: 0.00045869814930840364, Train Accuracy: 0.9999237398001983
Epoch: 12/20, Loss: 0.00027578079844519667, Train Accuracy: 0.9999237398001983
Epoch: 13/20, Loss: 0.0001385213725718452, Train Accuracy: 1.0
Epoch: 14/20, Loss: 9.858544

In [146]:
# model.load_state_dict(torch.load("model_weights.pth"))
# print("Trained model with class weighing:")

batch_size = 16

# Validating
with torch.no_grad():
    val_accuracy = 0.0
    val_total_samples = 0

    for i in range(0, val_features.shape[0], batch_size):
        batch_features = val_features[i:i + batch_size]
        batch_labels = val_labels[i:i + batch_size]

        batch_features_tensor = torch.tensor(batch_features.toarray(), device=device).float()
        batch_labels_tensor = torch.tensor(batch_labels, device=device).long()

        val_outputs = model(batch_features_tensor)
        _, val_predicted = torch.max(val_outputs, 1)

        # Compute batch accuracy
        val_batch_accuracy = (val_predicted == batch_labels_tensor).sum().item()
        val_accuracy += val_batch_accuracy
        val_total_samples += batch_labels_tensor.size(0)

    val_accuracy /= val_total_samples

print(f'Validation Accuracy: {val_accuracy}')


  batch_labels_tensor = torch.tensor(batch_labels, device=device).long()


Validation Accuracy: 0.5834095760902714


In [147]:
model.load_state_dict(torch.load("/kaggle/working/model_weights.pth"))

# generating predictions on test data
with torch.no_grad():
    test_predicted = []
    for i in range(0, test_features.shape[0], batch_size):
        batch_features = test_features[i:i + batch_size]
        batch_features_tensor = torch.tensor(batch_features.toarray(), device=device).float()
        test_outputs = model(batch_features_tensor)
        _, batch_predicted = torch.max(test_outputs, 1)

        test_predicted.extend(batch_predicted.cpu().numpy())

with open("poniedzialek_grunwald_rozkosz.csv", "a") as results_file:
    for prediction in test_predicted:
        results_file.write(str(prediction.item()) + "\n")
