In [3]:
!pip install simplejson

You should consider upgrading via the '/usr/local/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [4]:
import collections
import csv
import json

def preprocess_line(line):
    """Prepare one raw text line for ``json.loads``.

    The line terminator (``\\n`` or ``\\r\\n``) is removed first. Escaping it
    instead would leave a literal backslash-n after the JSON document, which
    ``json.loads`` rejects as extra data. Any line break that remains is
    replaced by the two-character escape sequence ``\\n``.

    Args:
        line: A line of text, with or without its trailing terminator.

    Returns:
        The line without its terminator and with remaining line breaks escaped.
    """
    # Strip only the terminator; other whitespace is left untouched.
    record_text = line.rstrip("\r\n")
    # Escape line breaks within string values
    return record_text.replace("\n", "\\n")

def read_and_write_file(json_file_path, csv_file_path, column_names):
    """Convert a line-delimited JSON file into a CSV file.

    Writes a header row of ``column_names`` and then one CSV row per line of
    the input that parses as JSON. Lines that fail to parse are reported with
    ``print`` and skipped; blank lines are skipped silently.

    Args:
        json_file_path: Path of the input file, one JSON document per line.
        csv_file_path: Path of the CSV file to create (overwritten if present).
        column_names: Iterable of column names, dotted for nested keys.
    """
    # Freeze the column order once so the header and every row agree, even if
    # the caller passes a set or a one-shot iterable.
    columns = list(column_names)
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as fout:
        csv_file = csv.writer(fout)
        csv_file.writerow(columns)
        with open(json_file_path, encoding='utf-8') as fin:
            for line in fin:
                # Drop the line terminator before preprocessing: escaping it
                # would append a literal "\n" after the JSON document and make
                # json.loads reject an otherwise valid line.
                record_text = line.rstrip('\r\n')
                if not record_text.strip():
                    continue  # Blank lines carry no record
                try:
                    line_contents = json.loads(preprocess_line(record_text))
                except json.JSONDecodeError as e:
                    print(f"Error parsing JSON: {e} - Line: {line}")
                    continue  # Skip lines that cause errors
                csv_file.writerow(get_row(line_contents, columns))


def get_superset_of_column_names_from_file(json_file_path):
    """Collect every flattened column name that occurs in a JSON-lines file.

    Each non-blank line is parsed as a JSON document and flattened with
    ``get_column_names``; the keys of all lines are merged into one set.

    Args:
        json_file_path: Path of the input file, one JSON document per line.

    Returns:
        A set of column names, dotted for nested keys.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    column_names = set()
    with open(json_file_path, encoding='utf-8') as fin:
        for line in fin:
            # A blank line (for example a trailing one) is not a record and
            # would make json.loads fail.
            if not line.strip():
                continue
            # Updating a set from a dict adds the dict's keys.
            column_names.update(get_column_names(json.loads(line)))
    return column_names

def get_column_names(line_contents, parent_key=''):
    """Flatten a nested mapping into a single-level dict with dotted keys.

    Nested mappings are descended recursively and their keys are joined to the
    parent key with ``.``. Every other value, lists included, is kept as is.
    A nested mapping with no entries contributes no key.

    Args:
        line_contents: Mapping to flatten.
        parent_key: Prefix for the keys of ``line_contents``; empty at the top
            level.

    Returns:
        A dict from dotted column name to leaf value, in encounter order.
    """
    # Import the submodule explicitly: ``import collections`` alone does not
    # guarantee that ``collections.abc`` is available as an attribute.
    from collections.abc import MutableMapping

    flattened = {}
    for k, v in line_contents.items():
        column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
        if isinstance(v, MutableMapping):
            flattened.update(get_column_names(v, column_name))
        else:
            flattened[column_name] = v
    return flattened

def get_nested_value(d, key):
    """Look up a dotted key in a nested mapping.

    ``key`` is split on ``.`` and each part is resolved one level deeper. The
    lookup yields ``None`` as soon as a part is absent or the current value is
    not a mapping (``None``, a number, a string, a list, ...). Records whose
    shape differs from the column name therefore produce an empty cell instead
    of raising ``TypeError``.

    Args:
        d: Mapping to search; may be ``None``.
        key: Column name, dotted for nested keys.

    Returns:
        The value stored under ``key``, or ``None`` when it cannot be reached.
    """
    from collections.abc import Mapping

    current = d
    for part in key.split('.'):
        # Only mappings can be descended into; anything else means the path
        # does not exist in this record.
        if not isinstance(current, Mapping) or part not in current:
            return None
        current = current[part]
    return current


def get_row(line_contents, column_names):
    """Build the list of CSV cell strings for one parsed record.

    For each name in ``column_names`` the value is fetched with
    ``get_nested_value``. A value of ``None`` becomes an empty string; any
    other value is rendered with ``'{0}'.format``.
    """
    looked_up = (
        get_nested_value(line_contents, name) for name in column_names
    )
    return ['' if cell is None else '{0}'.format(cell) for cell in looked_up]

# Run the file conversion
# dataset_list = ["business", "checkin", "tip", "user"]
dataset_list = ["review"]
for i in dataset_list:
    json_file = 'data/yelp_academic_dataset_{0}.json'.format(i)
    # Same path with everything from ".json" onwards replaced by ".csv".
    csv_file = json_file.split('.json')[0] + '.csv'
    # First pass collects the column names, second pass writes the rows.
    column_names = get_superset_of_column_names_from_file(json_file)
    read_and_write_file(json_file, csv_file, column_names)
    print("convert json to csv [{0}]".format(i))


TypeError: expected str, bytes or os.PathLike object, not JsonReader

In [5]:
import pandas as pd

# Load the business CSV and preview its first rows.
csv_file_path = 'data/yelp_academic_dataset_business.csv'
business = pd.read_csv(csv_file_path)
business.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,attributes.OutdoorSeating,attributes.AcceptsInsurance,attributes.HairSpecializesIn,attributes.RestaurantsTableService,attributes.HappyHour,attributes.RestaurantsCounterService,attributes.BusinessAcceptsBitcoin,attributes.RestaurantsDelivery,hours.Wednesday,attributes,...,attributes.WheelchairAccessible,city,attributes.BestNights,attributes.Alcohol,name,attributes.RestaurantsReservations,longitude,categories,hours,attributes.WiFi
0,,,,,,,,,,{'ByAppointmentOnly': 'True'},...,,Santa Barbara,,,"Abby Rappoport, LAC, CMQ",,-119.711197,"Doctors, Traditional Chinese Medicine, Naturop...",,
1,,,,,,,,,8:0-18:30,{'BusinessAcceptsCreditCards': 'True'},...,,Affton,,,The UPS Store,,-90.335695,"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ...",
2,False,,,,False,,,False,8:0-22:0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...",...,True,Tucson,,,Target,False,-110.880452,"Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...",u'no'
3,False,,,,,,,False,7:0-20:0,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...",...,,Philadelphia,,u'none',St Honore Pastries,,-75.155564,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",u'free'
4,,,,,,,,,14:0-22:0,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...",...,True,Green Lane,,,Perkiomen Valley Brewery,,-75.471659,"Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2...",


In [6]:
# Columns kept for the trimmed business table; every other column of the
# loaded frame is dropped.
required_columns = [
    'business_id', 'name', 'address', 'city', 'state',
    'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'categories', 'hours'
]

# Rebinds `business` to the reduced frame, so the full frame is no longer
# reachable under this name after the cell has run.
business = business[required_columns]
business.head()

Unnamed: 0,business_id,name,address,city,state,latitude,longitude,stars,review_count,is_open,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,34.426679,-119.711197,5.0,7,0,"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,38.551126,-90.335695,3.0,15,1,"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,32.223236,-110.880452,3.5,22,0,"Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,39.955505,-75.155564,4.0,80,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,40.338183,-75.471659,4.5,13,1,"Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [7]:
# Write the trimmed business table to CSV without the DataFrame index.
output_file_path = "data/business.csv"
business.to_csv(output_file_path, index=False)

In [8]:
# Load the check-in CSV and preview its first rows.
csv_file_path = 'data/yelp_academic_dataset_checkin.csv'
checkin = pd.read_csv(csv_file_path)
checkin.head()

Unnamed: 0,date,business_id
0,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020...",---kPU91CF4Lq2-WlRu9Lw
1,"2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011...",--0iUa4sNDFiZFrAdIWhZQ
2,"2013-06-14 23:29:17, 2014-08-13 23:20:22",--30_8IhuyMHbSOcNWd6DQ
3,"2011-02-15 17:12:00, 2011-07-28 02:46:10, 2012...",--7PUidqRWpRSpXebiyxTg
4,"2014-04-21 20:42:11, 2014-04-28 21:04:46, 2014...",--7jw19RH9JKXgFohspgQw


In [None]:
# Write the check-in table to CSV without the DataFrame index.
output_file_path = "data/checkin.csv"
checkin.to_csv(output_file_path, index=False)

In [None]:
# Load the tip CSV and preview its first rows.
csv_file_path = 'data/yelp_academic_dataset_tip.csv'
tip = pd.read_csv(csv_file_path)
tip.head()

In [None]:
# Write the tip table to CSV without the DataFrame index.
output_file_path = "data/tip.csv"
tip.to_csv(output_file_path, index=False)

In [None]:
# pandas is imported again here, so this cell does not depend on an earlier
# cell having imported it.
import pandas as pd

# Load the user CSV and preview its first rows.
csv_file_path = 'data/yelp_academic_dataset_user.csv'
user = pd.read_csv(csv_file_path)
user.head()

In [None]:
# Write the user table to CSV without the DataFrame index.
output_file_path = "data/user.csv"
user.to_csv(output_file_path, index=False)

In [None]:
import pandas as pd

# The review file is read directly from JSON rather than from a converted CSV.
# NOTE: despite its name, csv_file_path holds the path of a JSON file here.
csv_file_path = 'data/yelp_academic_dataset_review.json'
# review = pd.read_csv(csv_file_path)
# The conversion code in this notebook parses this file one JSON object per
# line, so pandas must be told to do the same with lines=True. Without it the
# whole file is parsed as a single JSON document.
review = pd.read_json(csv_file_path, lines=True)
review.head()

In [None]:
# Write the review table to CSV without the DataFrame index.
output_file_path = "data/review.csv"
review.to_csv(output_file_path, index=False)

In [None]:
import os
import boto3
import sagemaker
import numpy as np
import pandas as pd

In [None]:
# Create a boto3 session and report the region it resolves to.
# NOTE(review): presumably region_name is None when no default region is
# configured, in which case the message prints "None" - confirm.
session = boto3.session.Session()
region = session.region_name
print(f'currently in {region}')