In [1]:
import pandas as pd
import gzip
import json
import numpy as np
import os
import re
import csv

# NUM_SAMPLES = 1000
ds_path = "data/raw/Electronics_5.json"
nb_files = 20

In [2]:
# Load Data from json-file to list
raw_data = []
with open(ds_path) as f:
    for line in f:
        raw_data.append(json.loads(line))

print(len(raw_data))

1689188


In [3]:
# convert data from list to pandas dataframe
df = pd.DataFrame(raw_data)

# filter all review texts that have more then 30 characters
df = df[df["reviewText"].str.len() >= 30]

# convert overall rating to sentiment
df.insert(3, "sentiment", df["overall"].replace({5.0: 1, 4.0: 1, 3.0: 0, 2.0: -1, 1.0: -1}), allow_duplicates=True)

# compute minimum number of occurences of all sentiments
sent_count_min = df["sentiment"].value_counts().min()
df = df.groupby("sentiment").head(sent_count_min)

# shuffle data (random_state for reproducibility)
df = df.sample(frac=1, random_state=1).reset_index(drop=True)

print("Total reviews: {}".format(len(df)))
print(df["overall"].value_counts())

df.head()

Total reviews: 425685
3.0    141895
5.0    106117
1.0     81941
2.0     59954
4.0     35778
Name: overall, dtype: int64


Unnamed: 0,asin,helpful,overall,sentiment,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,B000BNHM0C,"[1, 2]",5.0,1.0,"I couldn't be happier with this piece of art, ...","03 11, 2011",A2JF49W7LTHOL6,Enrique,"Couldn't be happier, excellent product!",1299801600
1,B007KEZMX4,"[0, 0]",3.0,0.0,The quality of the picture I'd good. But what ...,"06 3, 2013",A6J8E6LA310ZE,N Z,freezes too much,1370217600
2,B0006MWPSM,"[0, 0]",5.0,1.0,Purchased for our conference room to connect a...,"04 18, 2012",A1OHOFON2HRKID,"D. Asher ""Dave""","Good product, Great price",1334707200
3,B000JET9EA,"[0, 1]",1.0,-1.0,I don't understand how they could have made su...,"09 21, 2011",A1K4G5YJDJQI6Q,Steve C,Worthless for anything but playing MP3's!,1316563200
4,B0028ACYEK,"[0, 0]",3.0,0.0,if you have a three story home then it would n...,"05 4, 2014",A12N3BWTRHTMAC,"50rocks ""50rocks""",broke after a couple of years,1399161600


In [5]:
print("Creating .txt file that contains {} reviews: {}".format(rev_texts_path, len(df)))
with open("data/processed/final/electronics_sentences.txt", "w") as f:
    for i, row in df.iterrows():
        # f.write("review_{}\n".format(i))
        f.write("###{}\n".format(row["overall"]))
        f.write(row["reviewText"] + "\n\n")
        

Creating .txt file that contains data/processed/electronics/electronics_reviews reviews: 409825


In [6]:
print("Creating {} documents that contains {} reviews each: {}".format(nb_files, int(len(df)/nb_files) ,rev_texts_path))

reviews_per_file = int(len(df)/nb_files)
file_counter = 0
reviews = ""
review_counter = 0

for i, row in df.iterrows():

    reviews += "###{}\n{}\n\n".format(row["overall"], row["reviewText"])
    review_counter += 1

    if review_counter == reviews_per_file:
        with open(rev_texts_path + str(file_counter + 1) + ".txt", "w") as f:
            f.write(reviews)
            
        reviews = ""
        file_counter += 1
        review_counter = 0

with open(rev_texts_path + str(file_counter) + ".txt", "a") as f:
    f.write(reviews)            



Creating 20 documents that contains 20491 reviews each: data/processed/electronics/electronics_reviews


***

## Postprocessing after EDU Segmantation

In [None]:
'''
Create txt-file and csv-file containing the processed reviews of all sub-files
'''

reviews = ""

for i in range(nb_files):
    with open("data/processed/electronics_edus/electronics_reviews{}.txt.edus".format(i+1), "r") as f: 
        reviews += f.read()

reviews = re.sub("(-LRB-|-RRB-|-LSB-|-RSB-)", "", reviews)

for i in range(5):
    reviews = re.sub('### {}.0 (EDU_BREAK )?'.format(i+1), '\n###{}.0\n'.format(i+1), reviews)
reviews = reviews[1:]

# txt
with open("data/processed/final/electronics_edus.txt", "w") as f: 
    f.write(reviews)


reviews = reviews.split("###")[1:]

# csv
with open("data/processed/final/electronics_edus.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(["overall", "reviewText"])
    
    for review in reviews:
        rating = review[0:3]
        review_text = review[3:]     
        writer.writerow([rating, review_text])

In [14]:
print(reviews[:1000])

###5.0
Its a splitter EDU_BREAK that allows me to not be tortured by my children fighting over a toy .
Anything EDU_BREAK that makes my life easier EDU_BREAK is a win-win !
Its cheap and I will never travel without it !

###1.0
I was wanting to use these cables to connect my Sprint EVO phone to the computer USB port , or charge EDU_BREAK using the USB power adapter .
These cables wo n't stay connected .
Physically , they stayed EDU_BREAK plugged in , EDU_BREAK but the computer would n't see the EVO , EDU_BREAK and the EVO 's charge indicator EDU_BREAK  which starts EDU_BREAK charging EDU_BREAK when connected to a USB port  EDU_BREAK kept EDU_BREAK turning off , EDU_BREAK indicating no power was being detected .
Tried multiple different PC 's and USB power adapters .
My other cables work fine , EDU_BREAK but the 2 of these cables EDU_BREAK I ordered failed .
Too expensive shipping to return , for the small amount of net refund I 'd get for the purchase price .
Guess I 'll learn not to b