### Import libraries and datasets

In [69]:
import os
import subprocess
import glob
import shutil
import pandas as pd
from google.cloud import bigquery
from pandas_gbq import read_gbq
from datetime import datetime
from pytz import timezone

In [70]:
folder_path = os.getcwd().replace("\\", "/")
print(f"Folder path: {folder_path}")

Folder path: c:/Users/Michael/Documents/GitHub/appStoreAnalytics/dataSources


In [71]:
# Hard-coded variables
project_id = "placeHolder"
dataset = "practice_project"
apple_db_path = f"{project_id}.{dataset}.apple"
google_db_path = f"{project_id}.{dataset}.google"

client = bigquery.Client.from_service_account_json(f"{folder_path}/placeHolder.json")
apple_csv_path = f"{folder_path}/apple.csv"
google_csv_path = f"{folder_path}/google.csv"

In [4]:
# Apple
## Clone the repository
subprocess.run(["git", "clone", "https://github.com/gauthamp10/apple-appstore-apps.git"])
## Change directory to the dataset folder
os.chdir("apple-appstore-apps/dataset")
## Extract the tar.lzma file
subprocess.run(["tar", "-xvf", "appleAppData.json.tar.lzma"])
## Read into DataFrame
apple = pd.read_json("appleAppData.json")

# Google
## Clone the repository
subprocess.run(["git", "clone", "https://github.com/gauthamp10/Google-Playstore-Dataset.git"])
## Change directory to the dataset folder
os.chdir("Google-Playstore-Dataset/dataset")
## Extract all .tar.gz files
for f in os.listdir():
    if f.endswith(".tar.gz"):
        subprocess.run(["tar", "-xvf", f])
combined_csv = "Google-Playstore-Dataset.csv"
with open(combined_csv, "wb") as outfile:
    for csvfile in glob.glob("Part?.csv"):
        with open(csvfile, "rb") as infile:
            outfile.write(infile.read())
## Read into DataFrame
google = pd.read_csv("Google-Playstore-Dataset.csv", header = 0) # low_memory = False

Apple dataset: 1230376 rows, 21 columns
google dataset: 2312944 rows, 24 columns


### Push datasets into Google BigQuery

In [5]:
# Create tables into Google BigQuery

## Create 'apple' table in DB
job = client.query(f"DELETE FROM {apple_db_path} WHERE TRUE").result()
client.create_table(bigquery.Table(apple_db_path), exists_ok = True)

## Create 'google' table in DB
job = client.query(f"DELETE FROM {google_db_path} WHERE TRUE").result()
client.create_table(bigquery.Table(google_db_path), exists_ok = True)

Table(TableReference(DatasetReference('big-data-analytics-412816', 'practice_project'), 'google'))

In [6]:
# Save data as CSV files
apple.columns = [name.replace(" ", "_") for name in apple.columns]
apple.to_csv(apple_csv_path, header = True, index = False)
google.columns = [name.replace(" ", "_") for name in google.columns]
google.to_csv(google_csv_path, header = True, index = False)

In [7]:
%%time

# Push data into DB

apple_job_config = bigquery.LoadJobConfig(
    autodetect=True,
    max_bad_records=5,
    source_format=bigquery.SourceFormat.CSV
)
apple_config = client.dataset(dataset).table('apple')
with open(apple_csv_path, 'rb') as f:
    apple_load_job = client.load_table_from_file(f, apple_config, job_config=apple_job_config)
apple_load_job.result()

google_job_config = bigquery.LoadJobConfig(
    autodetect=False,
    skip_leading_rows=1,
    max_bad_records=5,
    source_format=bigquery.SourceFormat.CSV
)
google_config = client.dataset(dataset).table('google')
with open(google_csv_path, 'rb') as f:
    google_load_job = client.load_table_from_file(f, google_config, job_config=google_job_config)
google_load_job.result()

CPU times: total: 516 ms
Wall time: 6min 13s


LoadJob<project=big-data-analytics-412816, location=US, id=9faf5564-03de-40d0-a616-5ec8cc05ac1b>

### Create 'dateTime' table in DB

In [80]:
dateTime_csv_path = f"{folder_path}/dateTime.csv"

# Create 'dateTime' table in DB
dateTime_db_path = f"{project_id}.{dataset}.dateTime"
job = client.query(f"DELETE FROM {dateTime_db_path} WHERE TRUE").result()
client.create_table(bigquery.Table(dateTime_db_path), exists_ok = True)

Table(TableReference(DatasetReference('big-data-analytics-412816', 'practice_project'), 'dateTime'))

In [81]:
current_time = datetime.now(timezone('Asia/Shanghai'))
timestamp_string = current_time.isoformat()

In [82]:
dt = datetime.strptime(timestamp_string, '%Y-%m-%dT%H:%M:%S.%f%z')
date_time_str = dt.strftime('%d-%m-%Y %H:%M:%S')  # Date and time
time_zone = dt.strftime('%z')  # Time zone
output = f"{date_time_str}; GMT+{time_zone[2]} (SGT)"

In [83]:
dateTime_df = pd.DataFrame(data = [output], columns = ['dateTime'])
dateTime_df.to_csv(f"{folder_path}/dateTime.csv", header = True, index = False)

In [84]:
dateTime_job_config = bigquery.LoadJobConfig(
    autodetect=True,
    skip_leading_rows=1,
    source_format=bigquery.SourceFormat.CSV,
)
dateTime_config = client.dataset(dataset).table('dateTime')
with open(dateTime_csv_path, 'rb') as f:
    dateTime_load_job = client.load_table_from_file(f, dateTime_config, job_config=dateTime_job_config)
dateTime_load_job.result()

LoadJob<project=big-data-analytics-412816, location=US, id=bee32cda-a1bd-4562-b465-91f4d05cc433>

In [85]:
## Remove CSV files and folder
try:
    os.remove(apple_csv_path)
    os.remove(google_csv_path)
    os.remove(dateTime_csv_path)
    shutil.rmtree(f"{folder_path}apple-appstore-apps")
except:
    pass

In [9]:
# %%time

# apple_query = f"""
#     SELECT *
#     FROM {apple_db_path}
# """
# apple_df = read_gbq(apple_query, project_id)
# print(apple_df.shape)

# google_query = f"""
#     SELECT *
#     FROM {google_db_path}
# """
# google_df = read_gbq(google_query, project_id)
# print(google_df.shape)

  record_batch = self.to_arrow(


(1230376, 22)


  record_batch = self.to_arrow(


(2312944, 25)
CPU times: total: 2min 4s
Wall time: 36min 54s
