In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

# Hook tqdm into pandas (this is what makes the `progress_apply` family available).
# `tqdm` here is the name bound by the last import above, i.e. `tqdm.notebook.tqdm`.
tqdm.pandas()

In [2]:
# Read the whole metadata dump into memory as a list of lines; each element keeps
# its trailing newline, which the parsing cell relies on.
with open('amazon-meta.txt', encoding='utf') as meta_file:
    contents = list(meta_file)

In [41]:
# Parse the raw lines in `contents` into three flat record lists:
#   products     - one dict per product entry (appended even when only partly filled)
#   reviews      - one dict per review line
#   co_purchased - one dict per (product, "similar" product) pair
products = []
reviews = []
co_purchased = []

EPOCH = datetime.date(1970, 1, 1)


def _review_day(review_line):
    """Return the leading Y-M-D date of a review line as whole days since 1970-01-01."""
    date_token = review_line.strip().split(':')[0].strip().split(' ')[0]
    return (datetime.date(*map(int, date_token.split('-'))) - EPOCH).days


# Parsing starts at line index 3; presumably the first three lines are a file
# header rather than product data - TODO confirm against the input file.
i = 3

while i < len(contents):
    p_info = {}
    # Consume lines until the blank line that terminates the current product entry.
    while True:
        if i >= len(contents):
            break
        if contents[i] == '\n':
            i += 1
            break
        if contents[i] == '  discontinued product\n':
            # Nothing more is read for a discontinued product: step past this
            # line and the one after it, then close the entry.
            i += 2
            break
        if contents[i][:4] == 'ASIN':
            p_info['product_id'] = contents[i][6:-1]
        elif contents[i][:7] == '  title':
            p_info['title'] = contents[i][9:-1]
        elif contents[i][:7] == '  group':
            p_info['group'] = contents[i][9:-1]
        elif contents[i][:11] == '  salesrank':
            p_info['salesrank'] = int(contents[i][13:-1])
        elif contents[i][:9] == '  similar':
            # Expected shape: "  similar: <count>  <id>  <id> ...".
            # Tokenising on whitespace reads the full count (not only its first
            # digit) and yields no ids at all when the count is 0, so no pair
            # with an empty 'second' id is recorded.
            similar_tokens = contents[i].split(':', 1)[1].split()
            p_info['co_purchased_num'] = int(similar_tokens[0])
            for co_purchased_item in similar_tokens[1:]:
                co_purchased.append({'first': p_info['product_id'], 'second': co_purchased_item})
        elif contents[i][:12] == '  categories':
            # The header carries the number of category-path lines that follow.
            num = int(contents[i][:-1].split(': ')[-1])
            categories = set()
            for _ in range(num):
                i += 1
                # Each path is '|'-separated; the text before '[' of every
                # segment is kept, de-duplicated across all paths.
                for categ in contents[i][:-1].strip().split('|')[1:]:
                    categories.add(categ.split('[')[0])
            p_info['categories'] = list(categories)
        elif contents[i][:9] == '  reviews':
            # The fourth ': '-separated field starts with the number of review
            # lines that follow this header; the last field is the average rating.
            num = int(contents[i][:-1].split(': ')[3].split(' ')[0])
            p_info['reviews_num'] = num
            p_info['avg_rate'] = float(contents[i][:-1].split(': ')[-1])

            if num != 0:
                # First / last listed review dates, taken from the first and
                # last of the `num` review lines.
                p_info['first_rating_time'] = _review_day(contents[i+1])
                p_info['last_rating_time'] = _review_day(contents[i+num])
                lowest_rate = 5
                highest_rate = 0
                for index in range(num):
                    review_line = contents[i+index+1]
                    # ':'-separated fields: [0] date, [1] customer id,
                    # [2] rating, [3] votes, [4] helpful.
                    info = review_line.strip().split(':')
                    r_info = {}
                    r_info['product_id'] = p_info['product_id']
                    r_info['customer_id'] = info[1].strip().split(' ')[0]
                    r_info['votes_num'] = int(info[3].strip().split(' ')[0])
                    r_info['helpful_num'] = int(info[4].strip().split(' ')[0])
                    r_info['time'] = _review_day(review_line)
                    rating = float(info[2].strip().split(' ')[0])
                    r_info['rating'] = rating
                    if rating > highest_rate:
                        highest_rate = rating
                    if rating < lowest_rate:
                        lowest_rate = rating
                    reviews.append(r_info)
                p_info['highest_rate'] = highest_rate
                p_info['lowest_rate'] = lowest_rate

            # Skip the review lines that were just consumed.
            i += num
        i += 1

    products.append(p_info)

In [42]:
# Turn each parsed record list into a DataFrame, discard every row that has a
# missing field, and renumber the remaining rows from 0.
products_df, reviews_df, co_purchased_df = (
    pd.json_normalize(records).dropna().reset_index(drop=True)
    for records in (products, reviews, co_purchased)
)

In [43]:
# Persist the three cleaned tables as Feather files under ./Data.
for frame, target in (
    (products_df, './Data/products.ftr'),
    (reviews_df, './Data/reviews.ftr'),
    (co_purchased_df, './Data/co_purchased.ftr'),
):
    frame.to_feather(target)

In [49]:
# Build candidate negative pairs: draw a pool of distinct products, split it in
# two halves and pair every product of the first half with every product of the
# second half (a cross product obtained by joining on a constant key).
NEGATIVE_POOL_SIZE = 4000   # products drawn; yields (size/2)**2 candidate pairs
NEGATIVE_POOL_SEED = 42     # fixed so Restart & Run All reproduces the same pairs

sampled_ids = products_df[['product_id']].sample(
    n=NEGATIVE_POOL_SIZE, random_state=NEGATIVE_POOL_SEED
)
half = NEGATIVE_POOL_SIZE // 2
left_ids = sampled_ids.iloc[:half].assign(_key=1)
right_ids = sampled_ids.iloc[half:].assign(_key=1)

gene_data = pd.merge(left_ids, right_ids, on='_key', how='outer').drop(columns=['_key'])
gene_data.columns = ['first', 'second']
gene_data.head()

Unnamed: 0,first,second
0,1563973820,B00004TFK3
1,1563973820,1878878530
2,1563973820,0873377818
3,1563973820,156163350X
4,1563973820,0849915546


In [50]:
def _pair_key(pairs):
    """Return one order-independent key per row of `pairs`.

    The key is the smaller of the 'first'/'second' ids followed by the larger
    one, so (a, b) and (b, a) map to the same string. Computed column-wise
    instead of with a row-by-row `apply`.
    """
    first, second = pairs['first'], pairs['second']
    first_is_smaller = first <= second
    smaller = first.where(first_is_smaller, second)
    larger = second.where(first_is_smaller, first)
    return smaller + larger


# Drop every generated pair that is actually a recorded co-purchase (in either
# direction), leaving only pairs usable as negatives.
known_pairs = _pair_key(co_purchased_df)
gene_data = gene_data[~_pair_key(gene_data).isin(known_pairs)].reset_index(drop=True)

In [51]:
# Product attributes keyed by id, shared by both look-ups below.
products_by_id = products_df.set_index('product_id')


def _pair_features(pairs):
    """Return one row per pair holding both products' attributes side by side.

    Attributes of `pairs['first']` get the suffix '1', those of
    `pairs['second']` the suffix '2'. Pairs for which either product is absent
    from `products_by_id` (all-NaN after the look-up) are dropped, and the
    result is renumbered from 0. Suffixing the looked-up columns directly avoids
    relying on 'product_id' being the first column of `products_df`.
    """
    first = products_by_id.reindex(index=pairs['first']).add_suffix('1').reset_index(drop=True)
    second = products_by_id.reindex(index=pairs['second']).add_suffix('2').reset_index(drop=True)
    return first.join(second).dropna().reset_index(drop=True)


# dataset1: recorded co-purchases; dataset2: generated non-co-purchased pairs.
dataset1 = _pair_features(co_purchased_df)
dataset2 = _pair_features(gene_data)

In [52]:
# Stack positives (label 1) on top of an equal number of negatives (label 0).
# The negatives are the leading rows of dataset2; if dataset2 happens to be
# shorter than dataset1, all of it is used and the label list shrinks to match
# instead of raising a length-mismatch error.
n_positive = len(dataset1)
n_negative = min(n_positive, len(dataset2))

# Truncate before concatenating so the label column is assigned on a fresh
# frame rather than on a slice of one.
all_pair = pd.concat([dataset1, dataset2.iloc[:n_negative]]).reset_index(drop=True)
all_pair['label'] = [1] * n_positive + [0] * n_negative
all_pair.head()

Unnamed: 0,title1,group1,salesrank1,co_purchased_num1,categories1,reviews_num1,avg_rate1,first_rating_time1,last_rating_time1,highest_rate1,...,salesrank2,co_purchased_num2,categories2,reviews_num2,avg_rate2,first_rating_time2,last_rating_time2,highest_rate2,lowest_rate2,label
0,Patterns of Preaching: A Sermon Sampler,Book,396585.0,5.0,"[Books, Subjects, Preaching, Clergy, Religion ...",2.0,5.0,11166.0,12400.0,5.0,...,93405.0,5.0,"[Books, Subjects, Preaching, Clergy, Religion ...",3.0,4.5,10924.0,12565.0,5.0,4.0,1
1,Patterns of Preaching: A Sermon Sampler,Book,396585.0,5.0,"[Books, Subjects, Preaching, Clergy, Religion ...",2.0,5.0,11166.0,12400.0,5.0,...,92111.0,5.0,"[Books, Subjects, Taylor, Barbara Brown, Preac...",14.0,4.5,10564.0,12922.0,5.0,4.0,1
2,Patterns of Preaching: A Sermon Sampler,Book,396585.0,5.0,"[Books, Subjects, Preaching, Clergy, Religion ...",2.0,5.0,11166.0,12400.0,5.0,...,64877.0,5.0,"[Books, Subjects, Preaching, Clergy, Religion ...",4.0,4.5,10963.0,11980.0,5.0,3.0,1
3,Patterns of Preaching: A Sermon Sampler,Book,396585.0,5.0,"[Books, Subjects, Preaching, Clergy, Religion ...",2.0,5.0,11166.0,12400.0,5.0,...,710700.0,5.0,"[Books, Subjects, Preaching, Clergy, Religion ...",3.0,3.0,11243.0,12164.0,5.0,1.0,1
4,Candlemas: Feast of Flames,Book,168596.0,5.0,"[Books, Subjects, Wicca, Witchcraft, Religion ...",12.0,4.5,11672.0,12821.0,5.0,...,311254.0,5.0,"[Books, Subjects, Paganism, New Age, Wicca, Ge...",10.0,4.0,11704.0,12848.0,5.0,2.0,1


In [None]:
# Persist the labelled pair dataset as a Feather file under ./Data.
dataset_path = './Data/dataset.ftr'
all_pair.to_feather(dataset_path)