Last update: Feb 26, 2021

In [1]:
# Loading libraries for S3 bucket connection

import io
import sys
import boto3
import numpy as np
import pandas as pd
from datetime import date
# Add the Category_propensity_recs project directory to the module search path.
sys.path.append('/home/ec2-user/SageMaker/Category_propensity_recs/')

# Low-level S3 client; not referenced again in the visible cells.
client = boto3.client('s3') 
# Resource-style S3 handle; save_df_to_s3_gz() uploads through this object.
resource = boto3.resource('s3')

In [2]:
import pyarrow.parquet as pq
import s3fs
# S3 filesystem handle; passed as `filesystem=` when the parquet dataset is read.
s3 = s3fs.S3FileSystem()

In [4]:
# Establishing snowflake and hive connections
# NOTE(review): only a Snowflake connection is opened in this cell; no Hive
# connection is visible here -- confirm whether the heading is stale.

# Put the data-science repo checkout on sys.path (once) before importing from it;
# presumably this is where the nm_data_sci package lives -- TODO confirm.
p = '/home/ec2-user/SageMaker/Repos/data-science'
if p not in sys.path:
    sys.path.append(p)
    
from nm_data_sci.common import ssm_cnx, s3_util, db_util
# Snowflake connection obtained from the project's ssm_cnx helper, plus the
# cursor that upload_production() runs its SQL statements on.
sf_cnx = ssm_cnx.get_snowflake_connection()
scur= sf_cnx.cursor()
# Run date; read by the record-timestamp cell further down.
today = date.today()

In [5]:
def save_df_to_s3_gz(df_to_save,  dir_nameX, file_nameX, bucket_nameX='nmg-analytics-ds-prod'):
    """Upload a DataFrame to S3 as a gzip-compressed CSV.

    The frame is serialised in memory as comma-separated UTF-8 text with a
    header row and no index column, gzip-compressed, and written with a single
    ``put`` through the module-level boto3 ``resource``.

    Args:
        df_to_save: DataFrame to serialise.
        dir_nameX: Key prefix inside the bucket; concatenated as-is, so it
            should end with ``/``.
        file_nameX: File name; ``'.gz'`` is appended to form the object key.
        bucket_nameX: Target bucket name.

    Returns:
        True once the ``put`` call has returned.

    Example:
        dir_nameX  = 'ds/prod/brand_affinity/nm/output/'
        file_nameX = 'designer_division_class_rank.csv'
        save_df_to_s3_gz(df , dir_nameX, file_nameX )
    """
    import gzip
    from io import BytesIO, TextIOWrapper

    gz_buffer = BytesIO()

    with gzip.GzipFile(mode='w', fileobj=gz_buffer) as gz_file:
        # Own the text wrapper explicitly: closing it flushes any buffered CSV
        # text into the gzip stream and finalises the gzip trailer. Leaving it
        # as an unnamed temporary only works when the interpreter happens to
        # finalise the object immediately.
        with TextIOWrapper(gz_file, encoding='utf8') as text_stream:
            df_to_save.to_csv(text_stream, index=False, header=True, sep=',')

    # gz_buffer itself stays open (GzipFile does not close a caller-supplied
    # fileobj), so the compressed payload can still be read here.
    s3_object = resource.Object(bucket_nameX, dir_nameX + file_nameX + '.gz')
    s3_object.put(Body=gz_buffer.getvalue())

    return True


In [6]:
def upload_production(file_in_S3, table_name):
    """Reload a table in NMEDWPRD_DB.MLDM from a staged CSV file.

    Two statements are run on the module-level Snowflake cursor ``scur``:
    first every row of ``NMEDWPRD_DB.MLDM.<table_name>`` is deleted, then the
    file is copied in from the ``AWS_DS_PRD_STG`` stage under
    ``ds/prod/product_propensity/nm/output/``.

    Args:
        file_in_S3: File name (or prefix) under the stage's output directory.
        table_name: Unqualified name of the target table.

    Returns:
        True after both statements have been submitted.

    NOTE(review): the copy uses ``on_error = 'CONTINUE'``, so rejected rows are
    skipped rather than raised, and the file format sets no ``skip_header``.
    Files written by save_df_to_s3_gz() start with a header row -- confirm that
    row is meant to be dropped through the error path.
    """
    purge_sql = """delete from  NMEDWPRD_DB.MLDM.{0}""".format(table_name)
    load_sql = """copy into NMEDWPRD_DB.MLDM.{1}
    from @NMEDWPRD_DB.PUBLIC.AWS_DS_PRD_STG/ds/prod/product_propensity/nm/output/{0} file_format = (type = csv field_delimiter = ',') on_error = 'CONTINUE' """.format(file_in_S3, table_name)

    # Order matters: the table is emptied before the fresh file is loaded.
    for statement in (purge_sql, load_sql):
        scur.execute(statement)
    return True

### 5. Designer-class

In [7]:
#final= pd.read_csv('s3://nmg-analytics-ds-prod/ds/prod/product_propensity/nm/output/designer_class_propensity_rank/part-00000-ede9f03e-a067-43db-a688-0b955271f0c7-c000.csv.gz',header=None)

# Environment and brand segments of the input path. Neither name is assigned in
# any other visible cell (execution count In [3] is missing), so a fresh kernel
# would raise NameError on the read below. The values match the hard-coded
# 'ds/prod/product_propensity/nm/output/' prefix the rest of the notebook uses.
env = 'prod'
brand = 'nm'

input_path = 's3://nmg-analytics-ds-prod/ds/{0}/product_propensity/{1}/output/designer_class_propensity'.format(env, brand)
final = pq.ParquetDataset(input_path, filesystem=s3).read_pandas().to_pandas()

# Positional rename: raises ValueError if the dataset does not have exactly
# nine columns. Assumes the parquet column order matches this list -- TODO confirm.
final.columns = ['cmd_id','category','designer_id','designer','class_id','class','propensity_score','propensity_rank','next_best_rank']

# Order rows by customer, then by rank within each customer.
final = final.sort_values(['cmd_id', 'propensity_rank'])


In [8]:
# One clock reading stamps both audit columns, so freshly created rows carry
# identical creation and update times. (The earlier pd.Timestamp(today) value
# was overwritten before use, and two separate now() calls made the two
# columns differ by a few microseconds.)
ts_creation = pd.Timestamp.now()
ts_update = ts_creation

final['RECORD_CREATION_DATETIME'] = ts_creation
final['RECORD_UPDATED_DATETIME'] = ts_update

In [9]:
# Rank cut-offs used by the filtering cells that follow.
n = 10      # rows kept per cmd_id (propensity_rank <= n); also the null-count threshold for keep_list
extra = 5   # added to n for cmd_ids that are NOT in keep_list (propensity_rank <= n + extra)

In [10]:
# Per cmd_id, count the rows whose next_best_rank is null.
extend_df = (
    final['next_best_rank']
    .isnull()
    .groupby([final['cmd_id']])
    .sum()
    .astype(int)
    .reset_index(name='null_count')
)

# cmd_ids with fewer than n null next_best_rank rows.
keep_list = list(extend_df.loc[extend_df['null_count'] < n, 'cmd_id'])


In [11]:
# cmd_ids in keep_list keep ranks up to n; all other cmd_ids keep ranks up to n + extra.
in_keep_list = final['cmd_id'].isin(keep_list)
within_n = final['propensity_rank'] <= n
within_extended = final['propensity_rank'] <= n + extra

final = final[(in_keep_list & within_n) | (~in_keep_list & within_extended)]
# Disabled debug export of a sample:
#final.head(500).to_csv("designer_class_propensity_T.csv", index= False)

In [12]:
# Write the filtered frame to S3; the helper appends '.gz' to the file name.
file_nameX = 'Designer_Class_Propensity.csv'
dir_nameX = 'ds/prod/product_propensity/nm/output/'
save_df_to_s3_gz(
    df_to_save=final,
    dir_nameX=dir_nameX,
    file_nameX=file_nameX,
    bucket_nameX='nmg-analytics-ds-prod',
)

True

In [13]:
# Reload the Snowflake table from the file staged by the previous cell.
file_in_S3 = 'Designer_Class_Propensity.csv'
table_name = 'Designer_Class_Propensity_T'
upload_production(file_in_S3=file_in_S3, table_name=table_name)

True