In [100]:
#Download Data

# !wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/All_Beauty_5.json.gz
# !wget http://deepyeti.ucsd.edu/jianmo/amazon/sample/meta_Computers.json.gz
# !wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/AMAZON_FASHION_5.json.gz
# !wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Appliances_5.json.gz
# !wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Software_5.json.gz
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Appliances_5.json.gz

--2022-06-11 19:33:35--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Appliances_5.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 72998 (71K) [application/octet-stream]
Saving to: ‘Appliances_5.json.gz’


2022-06-11 19:33:36 (1.78 MB/s) - ‘Appliances_5.json.gz’ saved [72998/72998]



In [95]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen
import spacy

In [96]:
class Dataset:
    def __init__(self):
        # self.FILE = 'data/Appliances_5.json.gz'
        self.FILE_NAME = 'Software_5'
        self.MIN_REVIEW_LENGTH = 50
        self.MIN_KEYWORDS = 3
    
    def build_data(self):
        self.get_data()
        self.clean_data()
        self.df_to_csv(self.df, 'data/review_'+self.FILE_NAME+'.csv')
        # self.df_to_csv(self.df, 'data/review_Appliances_5.csv')
    
    def get_data(self):
        self.data = []
        with gzip.open('data/'+self.FILE_NAME+'.json.gz') as f:
        # with gzip.open('data/Appliances_5.json.gz') as f:
            for l in f:
                self.data.append(json.loads(l.strip()))

        print(len(self.data))
        print(self.data[0])
        
    def clean_data(self):
        print(len(self.data))
        reviews = {}
        reviews['reviewText'] = []
        for row in self.data:
            if 'reviewText' in row and len(row['reviewText']) > self.MIN_REVIEW_LENGTH:
                reviews['reviewText'].append(row['reviewText'])

        self.df = pd.DataFrame(reviews)
        self.df.drop_duplicates(keep=False, inplace=False)
        print(self.df)
    
    def df_to_csv(self, df, csv):
        df.to_csv(csv, index=False, encoding='utf-8')
    
    def csv_to_df(self, csv):
        return pd.read_csv(csv, header=None, usecols=[0])
        
    def get_keyword_tokens(self):
        nlp = spacy.load('en_core_web_sm')
        df = self.csv_to_df('data/review_'+self.FILE_NAME+'.csv')
        # df = self.csv_to_df("data/review_Appliances_5.csv")
        v = df.to_numpy()
        v = list(set(v.flatten()))     
        
        reviews = {'input_text':[], 'target_text':[]}
        for i in range(len(v)):
            sentence = v[i]
            doc = nlp(sentence)

            keywords = []
            for token in doc:
                if (not token.is_stop and token.is_alpha) and (token.tag_ == 'NN' or token.tag_ == 'VBG'):
                    keywords.append(token.lemma_.lower())

            if len(keywords) > self.MIN_KEYWORDS:
                keywords = pd.Series(keywords).drop_duplicates().tolist()
                keywords = ' | '.join(keywords)
                reviews['input_text'].append(keywords.strip())
                reviews['target_text'].append(sentence.strip())

        reviews_list = pd.DataFrame(reviews)
        print(reviews_list)
        
        df = reviews_list.copy()
        print(df)
        
        df.to_csv('data/review_keywords_'+self.FILE_NAME+'.csv', columns=['input_text', 'target_text'], index=False)
        # df.to_csv('data/review_keywords_Appliances_5.csv', columns=['input_text', 'target_text'], index=False)

In [97]:
dataset = Dataset()

In [98]:
dataset.build_data()

12805
{'overall': 4.0, 'verified': False, 'reviewTime': '10 20, 2010', 'reviewerID': 'A38NELQT98S4H8', 'asin': '0321719816', 'style': {'Format:': ' DVD-ROM'}, 'reviewerName': 'WB Halper', 'reviewText': "I've been using Dreamweaver (and it's predecessor Macromedia's UltraDev) for many years.  For someone who is an experienced web designer, this course is a high-level review of the CS5 version of Dreamweaver, but it doesn't go into a great enough level of detail to find it very useful.\n\nOn the other hand, this is a great tool for someone who is a relative novice at web design.  It starts off with a basic overview of HTML and continues through the concepts necessary to build a modern web site.  Someone who goes through this course should exit with enough knowledge to create something that does what you want it do do...within reason.  Don't expect to go off and build an entire e-commerce system with only this class under your belt.\n\nIt's important to note that there's a long gap from s

In [99]:
dataset.get_keyword_tokens()

                                             input_text  \
0     version | software | pay | middle | update | f...   
1     tech | support | fact | method | product | res...   
2     look | hood | information | engine | user | in...   
3     desktop | sound | video | editing | software |...   
4     experience | accounting | product | version | ...   
...                                                 ...   
8706  packaging | gps | detail | interest | software...   
8707             os | copy | run | non | reboot | money   
8708  compare | site | advisory | user | web | fear ...   
8709                     state | dvd | player | program   
8710  purchase | blu | ray | burner | software | cha...   

                                            target_text  
0     I just recently converted to this version from...  
1     If you have any problems you will not be able ...  
2     Because, while I'm not about to go looking und...  
3     Corel VideoStudio Ultimate X8 installed on my ...  
4