In [1]:
import gzip, json
def parse(path):
  """Lazily yield one decoded JSON value per line of a gzip-compressed file.

  Args:
    path: location of a gzip file holding one JSON document per line.

  Yields:
    The object produced by ``json.loads`` for each line, in file order.

  Raises:
    OSError: if the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: if a line is not valid JSON.
  """
  # The context manager closes the handle when the generator is exhausted,
  # closed early, or aborted by an exception, so no file descriptor leaks.
  with gzip.open(path, 'r') as g:
    for l in g:
      yield json.loads(l)

# Read every review record from the compressed dump into an in-memory list,
# preserving file order so positional indices can be used later on.
data = list(parse("Software.json.gz"))

In [37]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split row positions (not the records themselves) into train / val / test.
split_seed = 42
indices = np.arange(len(data))

# Step 1: hold out 10% of all rows as the test set.
indices_rest, indices_test = train_test_split(
    indices, test_size=0.1, random_state=split_seed
)

# Step 2: 0.1/0.9 of the remaining ~90% is again ~10% of the full data,
# giving a roughly 80/10/10 train/val/test partition.
indices_train, indices_val = train_test_split(
    indices_rest, test_size=0.1/0.9, random_state=split_seed
)

In [19]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Fit the ID encoders on the full data set (all splits together); presumably
# so that every reviewer/item ID seen in any split has a known code.
# The ID arrays are built as (n_reviews, 1) columns because OneHotEncoder
# expects 2-D input, while LabelEncoder expects a 1-D array.
userID_str = np.array([[d['reviewerID']] for d in data])
userID_encoder = LabelEncoder()
# ravel() always yields a 1-D array. squeeze() would collapse a data set with
# a single review to a 0-d array, which LabelEncoder.fit rejects.
userID_encoder.fit(userID_str.ravel())
userID_one_hot_encoder = OneHotEncoder()
userID_one_hot_encoder.fit(userID_str)

itemID_str = np.array([[d['asin']] for d in data])
itemID_encoder = LabelEncoder()
itemID_encoder.fit(itemID_str.ravel())
itemID_one_hot_encoder = OneHotEncoder()
itemID_one_hot_encoder.fit(itemID_str)

In [None]:
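# Optional, currently disabled: persist the fitted encoders to disk and read
# them back, presumably so a later session can reuse them without re-fitting.
# import pickle
# encoders = {
#     "userID_encoder": userID_encoder,
#     "userID_one_hot_encoder": userID_one_hot_encoder,
#     "itemID_encoder": itemID_encoder,
#     "itemID_one_hot_encoder": itemID_one_hot_encoder
# }
# --- save (run after the encoders have been fitted) ---
# with open('encoder.pkl', 'wb') as file:
#     pickle.dump(encoders, file)
# --- load (alternative to fitting) ---
# NOTE: pickle.load can execute arbitrary code; only load a file you created yourself.
# with open('encoder.pkl', 'rb') as file:
#     encoders = pickle.load(file)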

In [66]:
# Extract the raw ID strings once and reuse them for both encodings.
reviewer_ids = [d["reviewerID"] for d in data]
product_ids = [d["asin"] for d in data]

# Integer codes from the label encoders (1-D input) ...
userID = userID_encoder.transform(reviewer_ids)
itemID = itemID_encoder.transform(product_ids)

# ... and one-hot encodings, which need one single-element row per review.
userID_onehot = userID_one_hot_encoder.transform([[rid] for rid in reviewer_ids])
itemID_onehot = itemID_one_hot_encoder.transform([[pid] for pid in product_ids])

In [67]:
# Per-review scalar features, aligned with the order of `data`.
verified = np.array([review['verified'] for review in data])

# Length of the review text in characters; reviews without a 'reviewText'
# field count as length 0.
review_length = np.array([len(review.get('reviewText', '')) for review in data])

# Ratings are stored as int8; values in 'overall' are cast to integers.
rating = np.array([review['overall'] for review in data], dtype=np.int8)

In [68]:
import datetime
# Convert each review's Unix timestamp to a timezone-aware UTC datetime.
# Without an explicit tz, fromtimestamp() converts to the local time zone of
# the machine running the notebook, so the derived day/weekday/month features
# could differ between machines; pinning UTC makes them reproducible.
date = [
    datetime.datetime.fromtimestamp(d['unixReviewTime'], tz=datetime.timezone.utc)
    for d in data
]
# Earliest review time; used as the reference point for day offsets.
min_date = min(date)

In [69]:
# Calendar features derived from the review datetimes.
# Whole days elapsed since the earliest review in the data set.
elapsed = [stamp - min_date for stamp in date]
days = np.array([delta.days for delta in elapsed])

# Day of week as returned by datetime.weekday() (Monday == 0).
weekdays = np.array([stamp.weekday() for stamp in date])

# Zero-based month index (January == 0).
month = np.array([stamp.month-1 for stamp in date])

In [70]:
# Bundle the split indices and all non-text features into one compressed
# archive; each dict key becomes the array's name inside the .npz file.
# NOTE(review): userID_onehot / itemID_onehot come from OneHotEncoder.transform.
# If those are scipy sparse matrices (the encoder's default output), NumPy
# stores them as pickled object arrays and np.load will need
# allow_pickle=True to read them back — confirm against the consumer.
feature_arrays = {
    'indices_train': indices_train,
    'indices_val': indices_val,
    'indices_test': indices_test,
    'userID': userID,
    'userID_onehot': userID_onehot,
    'itemID': itemID,
    'itemID_onehot': itemID_onehot,
    'verified': verified,
    'review_length': review_length,
    'rating': rating,
    'days': days,
    'weekdays': weekdays,
    'month': month,
}
np.savez_compressed('non-text-feature.npz', **feature_arrays)