In [None]:
import os
import urllib.request


# Names exported by a star-import of this code: only `download` is listed,
# so `download_files` is not exposed that way.
__all__ = ["download"]


def download_files(bucket_path, file_list, download_path):
    """
    Download every file in ``file_list`` from ``bucket_path`` into ``download_path``.

    Parameters
    ----------
    bucket_path : str
        Base URL of the bucket. Each file name is appended to it verbatim,
        so it is expected to end with "/".
    file_list : iterable of str
        File names to fetch, relative to ``bucket_path``.
    download_path : str
        Local destination directory; created (with parents) if missing.

    Notes
    -----
    A file that already exists in ``download_path`` is skipped. Each download
    is written to "<name>.part" first and only renamed to its final name once
    the transfer has completed, so an interrupted transfer never leaves a
    truncated file behind that a later run would mistake for a finished one.

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` raises (e.g. ``urllib.error.URLError``);
    the partial file is removed before the exception propagates.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(download_path, exist_ok=True)
    for f in file_list:
        file_path = os.path.join(download_path, f)
        if os.path.isfile(file_path):
            print('File "%s" already exists' % f)
            continue

        print('Downloading "%s" ...' % f)
        partial_path = file_path + ".part"
        try:
            urllib.request.urlretrieve(bucket_path + f, partial_path)
            # Atomic rename: the final name only ever refers to a complete file.
            os.replace(partial_path, file_path)
        except BaseException:
            # Also covers KeyboardInterrupt (a manually interrupted cell).
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
        print('Done')


def download(file_list=None, download_path="/Users/UID/BIGDATA/Yelp Analysis"):
    """
    Download Yelp dataset pickles from the AWS S3 repository.

    All datasets available in the repository:
    file_list=["yelp_academic_dataset_business.pickle",
               "yelp_academic_dataset_review.pickle",
               "yelp_academic_dataset_user.pickle",
               "yelp_academic_dataset_checkin.pickle",
               "yelp_academic_dataset_tip.pickle"]

    Parameters
    ----------
    file_list : sequence of str, optional
        Names of the files to fetch. When omitted or empty, nothing is
        downloaded and a message is printed instead.
    download_path : str, optional
        Local directory the files are stored in. Defaults to the directory
        the rest of the notebook reads from.

    Returns
    -------
    None
    """
    # `not file_list` covers None, [] and () alike; a mutable default
    # argument is avoided by defaulting to None.
    if not file_list:
        print ("Providiing empty file_list, no download...")
        return

    bucket_path = "https://s3-us-west-2.amazonaws.com/science-of-science-bucket/yelp_academic_dataset/"
    download_files(bucket_path, file_list, download_path)


In [None]:
# Fetch the five dataset pickles; files already on disk are skipped.
download(
    file_list=[
        "yelp_academic_dataset_business.pickle",
        "yelp_academic_dataset_review.pickle",
        "yelp_academic_dataset_user.pickle",
        "yelp_academic_dataset_checkin.pickle",
        "yelp_academic_dataset_tip.pickle",
    ]
)

In [None]:
import pandas as pd

# NOTE(review): the frame is bound to the name `review`, yet the file read here
# is the *business* pickle -- confirm which table is intended.
# pd.read_pickle can execute arbitrary code while loading; only use it on
# pickles from a trusted source.
review = pd.read_pickle(
    '/Users/UID/BIGDATA/Yelp Analysis/yelp_academic_dataset_business.pickle'
)

# Serialise all rows as a JSON array of objects, strip the first and last
# character (the enclosing brackets), then replace every "},{" with "} {".
out = (
    review.to_json(orient='records')[1:-1]
    .replace('},{', '} {')
)

In [None]:
# One dict per row, keyed by column name.
# DataFrame.to_dict builds these in a single pass; the earlier nested loop did
# a separate review[column][row] label lookup for every cell, which is very
# slow on a large frame and left its loop variables in the notebook namespace.
# NOTE(review): depending on the pandas version, values may come back as
# native Python scalars rather than NumPy scalars.
data = review.to_dict(orient='records')
len(data)

In [None]:
# Display the column labels of the `review` DataFrame.
review.columns

In [None]:
import json
import os

import pandas as pd
from elasticsearch import Elasticsearch
from tqdm import tqdm

# Directory holding the downloaded "yelp_academic_dataset_<table>.pickle" files.
DATA_DIR = '/Users/UID/BIGDATA/Yelp Analysis'
# Number of rows converted to JSON at a time; also the progress-report interval.
BATCH_SIZE = 5000

es = Elasticsearch()


def _iter_json_records(frame, batch_size=BATCH_SIZE):
    """Yield each row of ``frame`` as a JSON-compatible dict, one batch at a time.

    Rows are round-tripped through ``DataFrame.to_json`` so that values which
    the ``json`` module cannot represent directly are converted to plain JSON
    values before they are handed to Elasticsearch. Working in batches keeps
    the intermediate JSON text small even for large tables.
    """
    for start in range(0, len(frame), batch_size):
        batch = frame.iloc[start:start + batch_size]
        for record in json.loads(batch.to_json(orient='records')):
            yield record


tables = ['business','checkin','photo','review','tip','user']
for table in tqdm(tables):
    # ignore=400 keeps the loop going when the server answers HTTP 400,
    # e.g. because the index already exists.
    _ = es.indices.create(index='yelp_'+table , ignore=400)

    # Each table has its own pickle. The previous path literal had no "{}"
    # placeholder, so .format(table) was a no-op and every table read the
    # business file -- which is a binary pickle, not line-delimited JSON text.
    pickle_path = os.path.join(DATA_DIR, 'yelp_academic_dataset_{}.pickle'.format(table))
    if not os.path.isfile(pickle_path):
        print(table, 'pickle not found, skipping:', pickle_path)
        continue

    # pd.read_pickle can execute arbitrary code; only load trusted files.
    # Assumes every pickle holds a pandas DataFrame, as the business one does.
    frame = pd.read_pickle(pickle_path)

    # NOTE(review): es.create is expected to reject an id that already exists,
    # so re-running this cell against a populated index will raise -- confirm
    # whether es.index (overwrite) is the intended behaviour.
    for doc_id, record in enumerate(_iter_json_records(frame)):
        es.create(index='yelp_'+table, doc_type=table, id=doc_id, body=record)
        uploaded = doc_id + 1
        if uploaded % BATCH_SIZE == 0:
            print(uploaded, 'records have been uploaded')
    print(table, 'data finished!!!')