In [70]:
# Imports
import numpy as np
import pandas as pd
import pyspark
from google.cloud import bigquery

In [3]:
client = bigquery.Client(location="US")
print("Client creating using default project: {}".format(client.project))

client = bigquery.Client(location="US", project="helio-staging")

Client creating using default project: helio-staging


In [4]:
rev_source = ["sephora","ulta"]

job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ArrayQueryParameter("revSource", "STRING", rev_source)
        ]
)
max_date_query = """
    SELECT DISTINCT execution_date
    FROM `helio-staging.online_reviews.online_reviews__deduped_review__1_0`
    where execution_date > DATE_ADD(current_date(), INTERVAL -10 DAY)
    and normalized_url IS NOT NULL
    ORDER BY execution_date DESC
    LIMIT 2
    """
query_job = client.query(
    max_date_query,
    # Location must match that of the dataset(s) referenced in the query.
    location="US",
    job_config=job_config
)  # API request - starts the query

#max_date = query_job.to_dataframe()['f0_'].astype('str')[0]



In [None]:
max_date

In [5]:
qj = query_job.to_dataframe()
max_date, prev_date = qj["execution_date"].astype('str')[0],qj["execution_date"].astype('str')[1]


In [53]:
max_date = '2022-03-20'

In [54]:
prev_date = '2022-03-19'

In [55]:
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("max_date", "STRING", max_date),
        bigquery.ArrayQueryParameter("revSource", "STRING", rev_source)
    ]
)
review_query = """
    SELECT distinct source_name,
    normalized_url,
    FROM `helio-staging.online_reviews.online_reviews__deduped_review__1_0`
    WHERE source_name in UNNEST (@revSource)
    AND normalized_url IS NOT NULL
    AND execution_date = @max_date
    """
query_job = client.query(
    review_query,
    # Location must match that of the dataset(s) referenced in the query.
    location="US",
    job_config=job_config
)  # API request - starts the query

brands = query_job.to_dataframe()
brands = brands.loc[brands.astype(str).drop_duplicates().index]
brands["combined"] = brands["source_name"] +" "+brands["normalized_url"] 
brands = brands["combined"].tolist()

In [56]:
brands

['ulta gobareoutside.com',
 'ulta holsterbrands.com',
 'sephora caliraybeauty.com',
 'ulta oseamalibu.com',
 'ulta cocokind.com',
 'sephora algenist.com',
 'ulta alternahaircare.com',
 'ulta tonymoly.us',
 'ulta volitionbeauty.com',
 'ulta vincecamuto.com',
 'ulta hairology.com',
 'sephora cleanbeauty.com',
 'ulta rebelsrefinery.com',
 'ulta redcarpetmanicure.com',
 'sephora golde.co',
 'ulta nablacosmetics.com',
 'ulta lakeandskye.com',
 'ulta honest.com',
 'ulta mizani.com',
 'ulta carbontheory.com',
 'sephora captainblankenship.com',
 'sephora eadem.co',
 'ulta getjackblack.com',
 'ulta goodalcosmetic.com',
 'ulta unwash.com',
 'ulta thevintagecosmeticcompany.com',
 'sephora deborahlippmann.com',
 'ulta brite.com',
 'ulta virtuelabs.com',
 'sephora balenciaga.com',
 'sephora foursigmatic.com',
 'ulta locksandmane.com',
 'ulta sweetspotlabs.com',
 'ulta sweetspot.com',
 'ulta thefoxtan.com',
 'ulta hyntbeauty.com',
 'sephora urbandecay.com',
 'sephora peterthomasroth.com',
 'ulta lif

In [57]:
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("prev_date", "STRING", prev_date),
        bigquery.ArrayQueryParameter("revSource", "STRING", rev_source)
    ]
)
review_query = """
    SELECT distinct source_name,
    normalized_url,
    FROM `helio-staging.online_reviews.online_reviews__deduped_review__1_0`
    WHERE source_name in UNNEST (@revSource)
    AND normalized_url IS NOT NULL
    AND execution_date = @prev_date
    """
query_job = client.query(
    review_query,
    # Location must match that of the dataset(s) referenced in the query.
    location="US",
    job_config=job_config
)  # API request - starts the query

prev_brands = query_job.to_dataframe()
prev_brands = prev_brands.loc[prev_brands.astype(str).drop_duplicates().index]
prev_brands["combined"] = prev_brands["source_name"] +" "+prev_brands["normalized_url"] 
prev_brands = prev_brands["combined"].tolist()

In [58]:
new_brands = [x for x in brands if x not in prev_brands]

In [59]:
new_brands

['ulta teraxhaircare.com']

In [68]:
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("prev_date", "STRING", prev_date),
        bigquery.ArrayQueryParameter("revSource", "STRING", rev_source)
    ]
)
review_query = """
    SELECT distinct source_name,
    normalized_url,
    FROM `helio-staging.online_reviews.online_reviews__deduped_review__1_0`
    WHERE source_name in UNNEST (@revSource)
    AND normalized_url IS NOT NULL
    AND execution_date = @prev_date
    """
query_job = client.query(
    review_query,
    # Location must match that of the dataset(s) referenced in the query.
    location="US",
    job_config=job_config
)  # API request - starts the query

newbrands = query_job.to_dataframe()
newbrands = newbrands.loc[newbrands.astype(str).drop_duplicates().index]

In [69]:
newbrands

Unnamed: 0,source_name,normalized_url
0,ulta,loccitane.com
1,ulta,mynuface.com
2,sephora,rodinoliolusso.com
3,ulta,babyfoot.com
4,ulta,johnfrieda.com
...,...,...
1199,ulta,goli.com
1200,ulta,niahealthcare.com
1201,ulta,themanechoice.com
1202,sephora,drjart.com


In [22]:
rev_source = ["sephora","ulta"]

job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ArrayQueryParameter("revSource", "STRING", rev_source)
        ]
)
max_date_query = """
    SELECT DISTINCT execution_date
    FROM `helio-staging.online_reviews.online_reviews__deduped_review__1_0`
    where execution_date > DATE_ADD(current_date(), INTERVAL -365 DAY)
    and normalized_url IS NOT NULL
    ORDER BY execution_date DESC
    """
query_job = client.query(
    max_date_query,
    # Location must match that of the dataset(s) referenced in the query.
    location="US",
    job_config=job_config
)  # API request - starts the query

list_of_dates = query_job.to_dataframe()



In [23]:
list_of_dates

Unnamed: 0,execution_date
0,2022-03-23
1,2022-03-22
2,2022-03-21
3,2022-03-20
4,2022-03-19
...,...
278,2021-04-29
279,2021-04-25
280,2021-04-11
281,2021-04-04


In [50]:
for i in list_of_dates:
    current_date = i
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("current_date", "STRING", current_date),
            bigquery.ArrayQueryParameter("revSource", "STRING", rev_source)
        ]
    )
    review_query = """
        SELECT count(distinct source_name || normalized_url)
        FROM `helio-staging.online_reviews.online_reviews__deduped_review__1_0`
        WHERE source_name in UNNEST (@revSource)
        AND execution_date = @current_date
        """
    query_job = client.query(
        review_query,
        # Location must match that of the dataset(s) referenced in the query.
        location="US",
        job_config=job_config
    )  # API request - starts the query

    brands = query_job.to_dataframe()
    brands = reviews.loc[reviews.astype(str).drop_duplicates().index]

BadRequest: 400 Cannot query over table 'helio-staging.online_reviews.online_reviews__deduped_review__1_0' without a filter over column(s) 'execution_date' that can be used for partition elimination

(job ID: 33f2f8c5-c75c-4ce7-b378-bd1f4513fcb9)

                          -----Query Job SQL Follows-----                           

    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
   1:
   2:        SELECT count(distinct source_name || normalized_url)
   3:        FROM `helio-staging.online_reviews.online_reviews__deduped_review__1_0`
   4:        WHERE source_name in UNNEST (@revSource)
   5:        AND execution_date = @current_date
   6:        
    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |