In [1]:
import matplotlib.pyplot as plt # for plotting
import numpy as np # for transformation
import pandas as pd
from PIL import Image
from os.path import exists

import torch # PyTorch package
import torchvision # load datasets
import torchvision.transforms as transforms # transform data
import torch.nn as nn # basic building block for neural neteorks
import torch.nn.functional as F # import convolution functions like Relu
import torch.optim as optim # optimzer

In [2]:
# Load the JSON data into a python dictionary
train_data = pd.read_json("train_data.json")

#print(train_data)

# Clean out the games that have no reviews
train_df = train_data.dropna(subset=["sentiment"])

# explode the dataset
train_df_expanded=train_df.explode("screenshots", ignore_index=True)


In [3]:
# Exploration and minor cleaning
print(train_data.columns)

print(train_data["sentiment"].value_counts())

dupe = train_data["appid"].value_counts()
if 2 in dupe.values:
    print("There is at least one value with a count of 2.")
else:
    print("No value has a count of 2.")

Index(['appid', 'release', 'title', 'price', 'sentiment', 'reviews',
       'percentage', 'tags', 'screenshots'],
      dtype='object')
sentiment
Positive                   1650
Very Positive              1148
Mixed                       736
Mostly Positive             664
Overwhelmingly Positive     109
Mostly Negative              89
Negative                      8
Very Negative                 1
Name: count, dtype: int64
No value has a count of 2.


In [4]:
# After this part it gets memory intensive, I have 16GB and it doesn't work so I used the external python script to make the images 256 256

In [5]:
# Initialize the transformation funciton
transform = transforms.Compose(
    [transforms.ToTensor(), # to tensor object
     transforms.Normalize((127.5, 127.5, 127.5), (127.5, 127.5, 127.5))
     ])

train_df_expanded["images"] = None
for i in range(len(train_df_expanded["screenshots"])//2):
    screenshot_filename = train_df_expanded["screenshots"][i]
    image_path = f"C:/Users/Beste/Desktop/AAB/images/{screenshot_filename}"
    if exists(image_path):
        with Image.open(image_path) as image:
            image = image.convert('RGB')
            transformed_image = transform(image)
            train_df_expanded.at[i, "images"] = transformed_image

# Clean out the games that have no image
train_df_expanded = train_df_expanded.dropna(subset=["images"])

In [6]:
# set batch_size
batch_size = 4

# set number of workers
num_workers = 2

# train data
trainloader = torch.utils.data.DataLoader(train_df_expanded, batch_size=batch_size,
                                          shuffle=True, num_workers=num_workers)

# put 10 classes into a set
classes = ('Overwhelmingly Positive', 'Very Positive', 'Positive', 'Mostly Positive',
           'Mixed', 'Mostly Negative', 'Negative', 'Very Negative', 'Ovewhelmingly Negative')

In [7]:
def imshow(img):
  ''' function to show image '''
  img = img / 2 + 0.5 # unnormalize
  npimg = img.numpy() # convert to numpy objects
  plt.imshow(np.transpose(npimg, (1, 2, 0)))
  plt.show()

# get random training images with iter function
dataiter = iter(trainloader)
images, labels = dataiter.next()

# call function on our images
imshow(torchvision.utils.make_grid(images))

# print the class of the image
print(' '.join('%s' % classes[labels[j]] for j in range(batch_size)))

KeyboardInterrupt: 