In [1]:
import pandas as pd
from scipy.sparse import csr_matrix, save_npz

# Import Dataset

In [2]:
# Folder that holds the raw review CSV. The file name is appended by plain
# string concatenation, so the trailing slash is required.
path = "data/ToysGames/"

In [3]:
# Load the raw review table from Data.csv inside `path`.
# NOTE(review): the preview shows two "Unnamed" columns that look like leftover
# saved indexes; they are discarded later when only user_id/item_id/Binary are kept.
df = pd.read_csv(path + 'Data.csv')

In [4]:
# Preview the first rows to see which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerID,asin,reviewText,overall,Binary
0,8,AFGGC7SIV964O,439893577,This is good product to have with unlimited po...,4.0,1
1,11,A5I03TZD4J96V,439893577,Bought this board along with Melissa and Doug ...,5.0,1
2,15,A137TYEMD68CAQ,439893577,We purchased this to go with the magnetic lett...,5.0,1
3,34,A17TMITCPJET4,615444172,This is a good sized book. Great quality. I b...,5.0,1
4,35,A2VJPY7A1DTF6Z,615444172,We haven't found a sticker that won't peel off...,5.0,1


# Label Users and Items

In [5]:
# Count distinct reviewers and products. These counts size the integer ID
# ranges and the sparse matrices built further down.
num_user = len(df["reviewerID"].unique())
num_item = len(df["asin"].unique())
print("The number of users is {}".format(num_user))
print("The number of items is {}".format(num_item))

The number of users is 4212
The number of items is 5158


In [6]:
# Lookup table giving every distinct reviewerID a dense integer user_id in
# 0 .. num_user - 1, one row per distinct reviewer.
user_df = pd.DataFrame(
    {
        "reviewerID": df["reviewerID"].unique(),
        "user_id": range(num_user),
    }
)
user_df.head()

Unnamed: 0,reviewerID,user_id
0,AFGGC7SIV964O,0
1,A5I03TZD4J96V,1
2,A137TYEMD68CAQ,2
3,A17TMITCPJET4,3
4,A2VJPY7A1DTF6Z,4


In [7]:
# Lookup table giving every distinct asin a dense integer item_id in
# 0 .. num_item - 1, one row per distinct product.
item_df = pd.DataFrame(
    {
        "asin": df["asin"].unique(),
        "item_id": range(num_item),
    }
)
item_df.head()

Unnamed: 0,asin,item_id
0,0439893577,0
1,0615444172,1
2,0735321396,2
3,0735333483,3
4,073533417X,4


In [8]:
# Attach the integer user_id and item_id to every review row. Left joins keep
# every row of the review table.
df = (
    df.merge(user_df, on='reviewerID', how='left')
      .merge(item_df, on='asin', how='left')
)
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerID,asin,reviewText,overall,Binary,user_id,item_id
0,8,AFGGC7SIV964O,439893577,This is good product to have with unlimited po...,4.0,1,0,0
1,11,A5I03TZD4J96V,439893577,Bought this board along with Melissa and Doug ...,5.0,1,1,0
2,15,A137TYEMD68CAQ,439893577,We purchased this to go with the magnetic lett...,5.0,1,2,0
3,34,A17TMITCPJET4,615444172,This is a good sized book. Great quality. I b...,5.0,1,3,1
4,35,A2VJPY7A1DTF6Z,615444172,We haven't found a sticker that won't peel off...,5.0,1,4,1


In [9]:
# Keep only the integer IDs and the binary label; the raw IDs, review text and
# star rating are not used from here on.
df = df.loc[:, ["user_id", "item_id", "Binary"]]

# Keep Only Positive Feedback

In [10]:
# Drop every row whose Binary label is not 1, then report how many remain.
df = df.loc[df['Binary'].eq(1)]
print("The number of positive ratings is {}".format(len(df)))

The number of positive ratings is 16948


In [11]:
# Sanity check: only user_id, item_id and Binary remain, and Binary is 1
# in every row after the filter.
df.head()

Unnamed: 0,user_id,item_id,Binary
0,0,0,1
1,1,0,1
2,2,0,1
3,3,1,1
4,4,1,1


# Split Dataset

In [12]:
def leave_one_out_split(df, user_col, ratio, random_state=None):
    """Split an interaction table into train/validation parts, user by user.

    Despite the name this is not a strict leave-one-out split: for every group
    of rows sharing the same ``user_col`` value, a fraction ``ratio`` of the
    rows is drawn at random (without replacement) for validation and all
    remaining rows form the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction table containing ``user_col``. Row labels are used to tell
        validation rows from training rows, so they should be unique.
    user_col : str
        Name of the column identifying the user.
    ratio : float
        Fraction of each user's rows to hold out. The per-user count is
        ``ratio * group size`` rounded to an integer, so users with only a few
        rows may contribute no validation rows at all.
    random_state : int, numpy random generator or None, optional
        Seed forwarded to the sampler. Pass a fixed value to get the same
        split on every run; the default ``None`` keeps the split random.

    Returns
    -------
    (train, valid) : tuple of pandas.DataFrame
        Both frames keep the columns and the original row labels of ``df``.
    """
    # GroupBy.sample returns the chosen rows with their original labels, so no
    # MultiIndex has to be unpacked to find out which rows were held out.
    valid = df.groupby(user_col).sample(frac=ratio, random_state=random_state)
    train = df.loc[~df.index.isin(valid.index)]
    return train, valid

In [13]:
# Hold out roughly 10% of each user's positive interactions for validation;
# the rest becomes the training set.
df_train, df_valid = leave_one_out_split(df, 'user_id', 0.1)

In [14]:
# Number of rows (and columns) left for training after the validation rows
# were removed.
df_train.shape

(16113, 3)

# Convert Dataset from Pandas to CSR

In [15]:
def pandas_to_csr(df, row_name='user_id', col_name='item_id',
                  value_name='rating', shape=(138494, 131263)):
    """Build a SciPy CSR matrix from a long-format interaction table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per interaction.
    row_name, col_name : str
        Columns holding the integer row and column index of each interaction.
    value_name : str or None
        Column holding the stored value. With ``None`` every listed
        (row, col) pair is stored with the value 1.
    shape : tuple of int
        Shape of the resulting matrix.
        NOTE(review): the default looks specific to one dataset; the calls in
        this notebook always pass ``shape`` explicitly.

    Returns
    -------
    scipy.sparse.csr_matrix

    Notes
    -----
    Per SciPy's documented constructor behaviour, repeated (row, col) pairs
    are summed.
    """
    row_idx = df[row_name]
    col_idx = df[col_name]
    # Implicit feedback: fall back to a constant 1 when no value column is given.
    data = [1] * len(row_idx) if value_name is None else df[value_name]
    return csr_matrix((data, (row_idx, col_idx)), shape=shape)

In [16]:
# Turn both splits into binary user x item matrices of identical shape
# (num_user rows, num_item columns); value_name=None stores a 1 per interaction.
train_set, valid_set = (
    pandas_to_csr(split, value_name=None, shape=(num_user, num_item))
    for split in (df_train, df_valid)
)

In [17]:
# Persist both sparse matrices as .npz files. The targets are relative paths
# that point outside this notebook's folder.
# NOTE(review): presumably the target directory must already exist — confirm before running.
save_npz('../NCE_Projected_LRec/datax/Amazon_Review/Rtrain.npz', train_set)
save_npz('../NCE_Projected_LRec/datax/Amazon_Review/Rvalid.npz', valid_set)