### The script 
- prepares the brand data that is used to recommend up to 5 next categories.
- uses browse data from the last 49 days and transaction data from the most recent 3 years.
Note: for transaction data, rows whose itm_item_status is void or cancelled can be excluded with:
select * 
from nmedwprd_db.pdwdm.all_sas_sales_and_open_orders_v
where itm_item_status not in ('V','CX')

Alternatively, the same can be done by filtering on itm_item_status = 'S'.
   
**Last update**: Feb 28, 2021

In [1]:
import gc
import sys
import numpy as np
import pandas as pd
import boto3
import gc
from io import StringIO
client = boto3.client('s3') 
resource = boto3.resource('s3')
from datetime import date,datetime, timedelta
sys.path.append('../')
from Functions import common_header as h, tools as t, data_functions as d, ml_functions as m

# Code parameters.
# `brand` and `env` are re-assigned to the same values in the legacy save cells
# further down, where they are interpolated into the S3 output key.
brand = 'nm'
env = 'prod'
# Upper-case brand code; not referenced again in the visible part of the notebook.
wbrand='NM'

In [2]:
# Date parameters
window = 49              # length in days of the browse look-back (used for days_browse)
duration = 30            # length in days of the recent-purchase look-back (used for recent_duration)
today = date.today()
# date.weekday() is Monday=0 .. Sunday=6, so idx is Sunday=0 .. Saturday=6.
idx = (today.weekday() + 1) % 7
# 7+idx-6 == idx+1 days back: the most recent Saturday strictly before today.
last_sat = today - timedelta(7+idx-6)

run_1yr  = today - timedelta(days= 365)      # not referenced again in the visible notebook
run_3yr  = last_sat - timedelta(days=365*3)  # 3*365 days before last_sat (leap days are not accounted for)
# (start, end, brand) windows that both end on last_sat. No later cell visible
# here reads them; query_browse() issues its SQL without a date predicate.
# NOTE(review): these two lines use h.timedelta while the lines above use the
# directly imported timedelta - presumably the same class re-exported by
# common_header; confirm.
recent_duration = (last_sat - h.timedelta(days= duration), last_sat, 'nm')
days_browse     = [last_sat - h.timedelta(days=window), last_sat, 'nm']
# Bare expression so the notebook displays both windows.
days_browse, recent_duration

([datetime.date(2021, 1, 9), datetime.date(2021, 2, 27), 'nm'],
 (datetime.date(2021, 1, 28), datetime.date(2021, 2, 27), 'nm'))

## Retrieve browse data of 49 days

## Retrieved transaction data

In [3]:
# Transaction data created by Designer_division_class_historical_ranking script for customers who are active from 
# the last recent 3 years

def read_df_from_s3_parquet( save_dir_nameX, bucket_nameX):
    """Load a parquet dataset stored on S3 into a pandas DataFrame.

    Args:
        save_dir_nameX: Key (path inside the bucket) of the parquet dataset.
        bucket_nameX: Name of the S3 bucket that holds the dataset.

    Returns:
        pandas.DataFrame with the full contents of the dataset.
    """
    # Both packages are imported locally, inside the function.
    import pyarrow.parquet as pq
    import s3fs

    dataset_uri = "s3a://" + bucket_nameX + '/' + save_dir_nameX
    dataset = pq.ParquetDataset(dataset_uri, filesystem=s3fs.S3FileSystem())
    table = dataset.read_pandas()
    return table.to_pandas()


## Query trans data

In [4]:
## Use this function to keep only the transaction columns needed for a category
def retrieve_trans(trans_df, categories):
    """Return ``cmd_id`` plus the requested category columns of ``trans_df``.

    ``trans_df`` is the last-3-years transaction history, with the
    transformation implemented in the
    Designer_division_class_historical_ranking script already applied.
    ``categories`` is a list of column names; a name listed twice is
    selected twice.
    """
    return trans_df[['cmd_id'] + categories]

In [5]:
def query_browse(category):
    """Fetch browse (click-stream) rows for the requested columns.

    ``category`` is inserted verbatim into the SELECT list after
    ``curr_cmd_id cmd_id``, so it must be a trusted column expression such as
    ``'designer as category, designer_id, designer'``. Rows whose ``cmd_id``
    equals the string ``'-1'`` are removed from the result.

    NOTE(review): the statement carries no date predicate; the 49-day window
    mentioned elsewhere in this notebook is presumably applied when
    click_stream_data is built - confirm.
    """
    statement = """select curr_cmd_id cmd_id ,{0} from nmedwprd_db.mktsand.click_stream_data;""".format(category)
    browse = t.SF_read_sql_with_duplicate(statement)
    keep = browse.cmd_id != '-1'
    return browse[keep]

In [6]:
def concat_columns(df, cols_to_concat, new_col_name, sep=" "):
    """Add a column that joins several columns as strings.

    The input frame is not modified; a deep copy is returned with
    ``new_col_name`` placed immediately after the first column and every
    other column kept in its original relative order.

    Args:
        df: Source DataFrame.
        cols_to_concat: Column names to join, in order. With a single name
            the values are copied unchanged (no string cast); with several,
            each value is cast with ``astype(str)`` before joining, so
            missing values are stringified rather than propagated.
        new_col_name: Name of the resulting column. If a column with this
            name already exists it is overwritten and moved to the second
            position.
        sep: Separator placed between the joined values.

    Returns:
        A new DataFrame with the concatenated column added.
    """
    tmp = df.copy(deep=True)

    # Build the joined Series first and attach it once, rather than
    # re-casting the accumulating column on every iteration.
    combined = tmp[cols_to_concat[0]]
    for col in cols_to_concat[1:]:
        combined = combined.astype(str) + sep + tmp[col].astype(str)
    tmp[new_col_name] = combined

    # Place the new column by name instead of assuming it is the last one,
    # which is not true when new_col_name already existed in df.
    others = [col for col in tmp.columns if col != new_col_name]
    return tmp[others[:1] + [new_col_name] + others[1:]]

In [7]:
def agg_category(browse_df, trans_df, category):
    """Score every (customer, category) combination from browse and purchase counts.

    Identical rows are counted separately in each input, the two count tables
    are outer-merged on the columns of ``trans_df``, and an implicit rank is
    computed as ``0.8 * transactions + 0.2 * browses`` (a combination missing
    on one side counts as 0 there). The rank is then bucketed into a 1-5
    ``score`` using its 25th / 50th / 75th / 85th percentiles over all rows:
    below the 25th percentile scores 1, at or above the 85th scores 5.

    Args:
        browse_df: One row per browse event; must have the same columns as
            ``trans_df`` (``cmd_id`` plus the category columns).
        trans_df: One row per transaction; its columns are the merge keys.
        category: Unused; kept so existing callers keep working.

    Returns:
        DataFrame with the key columns plus an integer ``score`` column. The
        intermediate count and rank columns are not returned.
    """
    trans_weight, browse_weight = 0.8, 0.2
    cut_points = [0.25, 0.5, 0.75, 0.85]

    browse_counts = (
        browse_df.groupby(list(browse_df.columns)).size().reset_index(name='Times_Browse')
    )
    keys = list(trans_df.columns)
    trans_counts = trans_df.groupby(keys).size().reset_index(name='Total_transactions')

    merged = pd.merge(browse_counts, trans_counts, how='outer', on=keys)

    # Only the two count columns can be missing after the outer merge, so fill
    # just those; a frame-wide fillna(axis=1) works row-wise across
    # mixed-dtype columns, which is far more expensive.
    trans_n = merged['Total_transactions'].fillna(0)
    browse_n = merged['Times_Browse'].fillna(0)
    rank_implicit = trans_n * trans_weight + browse_n * browse_weight

    # Compute the percentile thresholds once. searchsorted(side='right')
    # counts the thresholds that are <= rank, which is the same ladder as
    # "< q25 -> 1, [q25, q50) -> 2, ..., >= q85 -> 5".
    thresholds = rank_implicit.quantile(cut_points).to_numpy()
    cust = merged.drop(columns=['Times_Browse', 'Total_transactions'])
    cust['score'] = np.searchsorted(thresholds, rank_implicit.to_numpy(), side='right') + 1
    return cust

In [8]:
def save_df_to_s3_gz(df_to_save,  dir_nameX, file_nameX, bucket_nameX='nmg-analytics-ds-prod'):
    """Upload a DataFrame to S3 as a gzip-compressed, comma-separated CSV.

    The CSV is written with a header row and without the index. The object
    key is ``dir_nameX + file_nameX + '.gz'``, so ``file_nameX`` should not
    already end in ``.gz``. The upload goes through the module-level boto3
    ``resource`` handle.

    Example:
        dir_nameX  = 'ds/prod/brand_affinity/nm/output/'
        file_nameX = 'designer_division_class_rank.csv'
        save_df_to_s3_gz(df , dir_nameX, file_nameX )

    Args:
        df_to_save: DataFrame to serialise.
        dir_nameX: Key prefix inside the bucket (include the trailing '/').
        file_nameX: File name; '.gz' is appended to it.
        bucket_nameX: Target S3 bucket.

    Returns:
        True once the put call has returned.
    """
    import gzip
    from io import BytesIO, TextIOWrapper

    gz_buffer = BytesIO()

    with gzip.GzipFile(mode='w', fileobj=gz_buffer) as gz_file:
        # Close the text wrapper explicitly so its buffered characters are
        # flushed into the gzip stream before the stream is finalised. A
        # throw-away wrapper is only flushed when it happens to be
        # garbage-collected, which is not guaranteed to occur in time.
        with TextIOWrapper(gz_file, 'utf8') as text_stream:
            df_to_save.to_csv(text_stream, index=False, header=True, sep=',')

    s3_object = resource.Object(bucket_nameX, dir_nameX + file_nameX + '.gz')
    s3_object.put(Body=gz_buffer.getvalue())

    return True

## 1. Division

In [9]:
# Division-level browse rows: division_name is selected twice so the frame has
# both a generic `category` column and a `division` column. Rows with any
# missing value are dropped before division_id is cast to int.
browse_df = (
    query_browse('division_name as category, division_id, division_name as division')
    .dropna(axis=0, how='any')
    .astype({'division_id': 'int'})
)
browse_df.head()

Unnamed: 0,cmd_id,category,division_id,division
0,wcoVuNC,Contemporary Apparel,14,Contemporary Apparel
1,QYQApX5,Women's Apparel,11,Women's Apparel
2,yus8dAc,Men's,44,Men's
3,fM5P58d,Contemporary Apparel,14,Contemporary Apparel
5,1WSAPeXb,Ladies Shoes,34,Ladies Shoes


In [10]:
# Load the 3-year transaction history and keep cmd_id plus the division
# columns; `division` is selected twice and the columns are then renamed so
# the layout matches browse_df (cmd_id, category, division_id, division).
trans_df = retrieve_trans(
    read_df_from_s3_parquet('ds/prod/ClientConnect/CMD/data/nm/trans_3y_history.csv.gz', 'nmg-analytics-ds-prod'),
    ['division', 'division_id', 'division'],
)
trans_df.columns = ['cmd_id', 'category', 'division_id', 'division']
trans_df.head()

Unnamed: 0,cmd_id,category,division_id,division
0,17EtLPS8,Contemporary Apparel,14,Contemporary Apparel
1,uaAJGq0,Women's Apparel,11,Women's Apparel
2,qVD9A2t,Women's Apparel,11,Women's Apparel
3,ryYPgRF,Women's Apparel,11,Women's Apparel
4,ytUWIo1,Beauty,53,Beauty


In [11]:
# Combine browse and transaction counts into a 1-5 score per customer/division.
# The result overwrites trans_df, which from here on holds the scored frame.
trans_df = agg_category(browse_df, trans_df,'category')
trans_df.head()

Unnamed: 0,cmd_id,category,division_id,division,score
0,002vWGm,Contemporary Apparel,14,Contemporary Apparel,5
1,002vWGm,Designer Handbags,35,Designer Handbags,3
2,002vWGm,Jewelry,56,Jewelry,3
3,002vWGm,Ladies Shoes,34,Ladies Shoes,3
4,002wpFg,Children's,15,Children's,5


In [12]:
# Upload the scored frame as gzip CSV; the helper appends '.gz' to the file name.
dir_name = 'ds/prod/ClientConnect/CMD/data/nm/'
save_df_to_s3_gz(trans_df, dir_name, 'cust_division_rank.csv', bucket_nameX='nmg-analytics-ds-prod')

True

## 2. Designer 

In [13]:
# Designer-level browse rows; `designer` is selected twice so the frame has
# both `category` and `designer`. Rows with any missing value are dropped.
category = 'designer'
browse_df = query_browse('designer as category, designer_id, designer').dropna(axis=0, how='any')
browse_df.head()

Unnamed: 0,cmd_id,category,designer_id,designer
0,ZrfAX64,Eileen Fisher,11658,Eileen Fisher
1,oCYyeUy,Marc Fisher LTD,15981,Marc Fisher LTD
2,NDSherR,TOM FORD,10978,TOM FORD
3,jPMKh37,Saint Laurent,11700,Saint Laurent
4,1WSNAz2B,Vince,15542,Vince


In [14]:
# Load the 3-year transaction history and keep cmd_id plus the designer
# columns, renamed to the same layout as browse_df.
trans_df = retrieve_trans(
    read_df_from_s3_parquet('ds/prod/ClientConnect/CMD/data/nm/trans_3y_history.csv.gz', 'nmg-analytics-ds-prod'),
    ['designer', 'designer_id', 'designer'],
)
trans_df.columns = ['cmd_id', 'category', 'designer_id', 'designer']
trans_df.head()

Unnamed: 0,cmd_id,category,designer_id,designer
0,17EtLPS8,Parker,14069,Parker
1,uaAJGq0,Johnny Was,12751,Johnny Was
2,qVD9A2t,Lafayette 148 New York,12797,Lafayette 148 New York
3,ryYPgRF,Jay Godfrey,12645,Jay Godfrey
4,ytUWIo1,Bobbi Brown,10781,Bobbi Brown


In [15]:
# Score each customer/designer combination (1-5); the result overwrites trans_df.
trans_df = agg_category(browse_df, trans_df,'category')


In [16]:
# Upload the scored frame as gzip CSV; the helper appends '.gz' to the file name.
dir_name = 'ds/prod/ClientConnect/CMD/data/nm/'
save_df_to_s3_gz(trans_df, dir_name, 'cust_designer_rank.csv', bucket_nameX='nmg-analytics-ds-prod')


True

## 3. Class

In [17]:
# Class-level browse rows: class_name is selected as both `category` and
# `class`. Rows with any missing value are dropped before class_id is cast
# to int.
browse_df = (
    query_browse('class_name as category, class_id, class_name as class')
    .dropna(axis=0, how='any')
    .astype({'class_id': 'int'})
)
browse_df.head()

Unnamed: 0,cmd_id,category,class_id,class
0,19FeQNhN,Sweaters,2,Sweaters
1,eBF3Ltg,Sneakers,55,Sneakers
2,tpOkzvZ,Sneakers,55,Sneakers
3,d0wuM56,Vests,155,Vests
4,hYcltc9,Leggings,7,Leggings


In [18]:
# Load the 3-year transaction history and keep cmd_id plus the class columns,
# renamed to the same layout as browse_df.
trans_df = retrieve_trans(
    read_df_from_s3_parquet('ds/prod/ClientConnect/CMD/data/nm/trans_3y_history.csv.gz', 'nmg-analytics-ds-prod'),
    ['class', 'class_id', 'class'],
)
trans_df.columns = ['cmd_id', 'category', 'class_id', 'class']

In [19]:
# Score each customer/class combination (1-5); the result overwrites trans_df.
trans_df = agg_category(browse_df, trans_df,'category')
trans_df.head()

Unnamed: 0,cmd_id,category,class_id,class,score
0,002vWGm,Denim,6,Denim,5
1,002vWGm,Rings,40,Rings,5
2,002vWGm,Sandals,54,Sandals,3
3,002vWGm,Top Handle,29,Top Handle,3
4,002wpFg,Body Suits,107,Body Suits,5


In [20]:
# Upload the scored frame as gzip CSV; the helper appends '.gz' to the file name.
dir_name = 'ds/prod/ClientConnect/CMD/data/nm/'
save_df_to_s3_gz(trans_df, dir_name, 'cust_class_rank.csv', bucket_nameX='nmg-analytics-ds-prod')


True

## 4. Designer_class

In [9]:
# Designer x class browse rows: drop incomplete rows, build the combined
# `category` label ("<designer> <class>"), then cast class_id to int.
browse_df = concat_columns(
    query_browse('designer_id,designer, class_id, class_name as class').dropna(axis=0, how='any'),
    ['designer', 'class'],
    'category',
    sep=" ",
).astype({'class_id': 'int'})
browse_df.head()

Unnamed: 0,cmd_id,category,designer_id,designer,class_id,class
0,ZYBA41J,Shoshanna Dress,14769,Shoshanna,124,Dress
1,jQSMvHx,Gorski Outerwear,12196,Gorski,9,Outerwear
2,aPwL9oq,Veronica Beard Denim,15452,Veronica Beard,6,Denim
3,X9v6h56,Vince Shirts/Tops,15542,Vince,1,Shirts/Tops
4,1Bi4C9V3,Valentino Sandals,11778,Valentino,54,Sandals


In [10]:
# Load the 3-year transaction history and keep cmd_id plus the designer and
# class columns.
trans_df = retrieve_trans(
    read_df_from_s3_parquet('ds/prod/ClientConnect/CMD/data/nm/trans_3y_history.csv.gz', 'nmg-analytics-ds-prod'),
    ['designer_id', 'designer', 'class_id', 'class'],
)
trans_df.head()

Unnamed: 0,cmd_id,designer_id,designer,class_id,class
0,17EtLPS8,14069,Parker,124,Dress
1,uaAJGq0,12751,Johnny Was,1,Shirts/Tops
2,qVD9A2t,12797,Lafayette 148 New York,4,Pants
3,ryYPgRF,12645,Jay Godfrey,139,Gowns
4,ytUWIo1,10781,Bobbi Brown,317,Makeup


In [11]:
# Build the same "<designer> <class>" category label on the transaction side
# so both frames share identical columns, then score each combination (1-5).
cols = ['designer','class']
trans_df = concat_columns(trans_df, cols , 'category', sep=" ")
trans_df = agg_category(browse_df, trans_df,'category')
trans_df.head()

Unnamed: 0,cmd_id,category,designer_id,designer,class_id,class,score
0,002vWGm,Bottega Veneta Top Handle,10811,Bottega Veneta,29,Top Handle,3
1,002vWGm,Chanel Rings,11078,Chanel,40,Rings,1
2,002vWGm,Gianvito Rossi Sandals,12104,Gianvito Rossi,54,Sandals,1
3,002vWGm,Moussy Vintage Denim,23717,Moussy Vintage,6,Denim,1
4,002wpFg,A.L.C. Dress,10047,A.L.C.,124,Dress,5


In [12]:
# Upload the scored frame as gzip CSV; the helper appends '.gz' to the file name.
dir_name = 'ds/prod/ClientConnect/CMD/data/nm/'
save_df_to_s3_gz(trans_df, dir_name, 'cust_designer_class_rank.csv', bucket_nameX='nmg-analytics-ds-prod')


True

## 5. Division_class

In [13]:
# Division x class browse rows: drop incomplete rows, build the combined
# `category` label ("<division> <class>"), then cast both id columns to int.
browse_df = concat_columns(
    query_browse('division_id,division_name as division, class_id, class_name as class').dropna(axis=0, how='any'),
    ['division', 'class'],
    'category',
    sep=" ",
).astype({'division_id': 'int', 'class_id': 'int'})
browse_df.head()

Unnamed: 0,cmd_id,category,division_id,division,class_id,class
0,diWz6ez,Contemporary Apparel Bras,14,Contemporary Apparel,42,Bras
1,19H3h8mr,Gifts & Home Quilts/Coverlets,21,Gifts & Home,822,Quilts/Coverlets
2,xZXsye6,Men's Slip On,44,Men's,880,Slip On
3,CKgao9B,Ladies Shoes Pumps,34,Ladies Shoes,51,Pumps
4,weI9f9y,Contemporary Apparel Denim,14,Contemporary Apparel,6,Denim


In [14]:
# Load the 3-year transaction history and keep cmd_id plus the division and
# class columns.
trans_df = retrieve_trans(
    read_df_from_s3_parquet('ds/prod/ClientConnect/CMD/data/nm/trans_3y_history.csv.gz', 'nmg-analytics-ds-prod'),
    ['division_id', 'division', 'class_id', 'class'],
)
trans_df.head()

Unnamed: 0,cmd_id,division_id,division,class_id,class
0,17EtLPS8,14,Contemporary Apparel,124,Dress
1,uaAJGq0,11,Women's Apparel,1,Shirts/Tops
2,qVD9A2t,11,Women's Apparel,4,Pants
3,ryYPgRF,11,Women's Apparel,139,Gowns
4,ytUWIo1,53,Beauty,317,Makeup


In [15]:
# Build the same "<division> <class>" category label on the transaction side
# so both frames share identical columns.
cols = ['division','class']
trans_df = concat_columns(trans_df, cols , 'category', sep=" ")
trans_df.head()

Unnamed: 0,cmd_id,category,division_id,division,class_id,class
0,17EtLPS8,Contemporary Apparel Dress,14,Contemporary Apparel,124,Dress
1,uaAJGq0,Women's Apparel Shirts/Tops,11,Women's Apparel,1,Shirts/Tops
2,qVD9A2t,Women's Apparel Pants,11,Women's Apparel,4,Pants
3,ryYPgRF,Women's Apparel Gowns,11,Women's Apparel,139,Gowns
4,ytUWIo1,Beauty Makeup,53,Beauty,317,Makeup


In [16]:
# Score each customer/division/class combination (1-5); the result overwrites trans_df.
trans_df = agg_category(browse_df, trans_df,'category')

trans_df.head()

Unnamed: 0,cmd_id,category,division_id,division,class_id,class,score
0,002vWGm,Contemporary Apparel Denim,14,Contemporary Apparel,6,Denim,5
1,002vWGm,Designer Handbags Top Handle,35,Designer Handbags,29,Top Handle,3
2,002vWGm,Jewelry Rings,56,Jewelry,40,Rings,5
3,002vWGm,Ladies Shoes Sandals,34,Ladies Shoes,54,Sandals,3
4,002wpFg,Children's Infant Boy,15,Children's,826,Infant Boy,1


In [17]:
# Upload the scored frame as gzip CSV; the helper appends '.gz' to the file name.
dir_name = 'ds/prod/ClientConnect/CMD/data/nm/'
save_df_to_s3_gz(trans_df, dir_name, 'cust_division_class_rank.csv', bucket_nameX='nmg-analytics-ds-prod')


True

## 6. Designer_division

In [9]:
# Designer x division browse rows: drop incomplete rows, build the combined
# `category` label ("<designer> <division>"), then cast division_id to int.
browse_df = concat_columns(
    query_browse('designer_id,designer, division_id, division_name as division').dropna(axis=0, how='any'),
    ['designer', 'division'],
    'category',
    sep=" ",
).astype({'division_id': 'int'})
browse_df.head()

Unnamed: 0,cmd_id,category,designer_id,designer,division_id,division
0,ofzyTAs,Safavieh Gifts & Home,14614,Safavieh,21,Gifts & Home
1,14IEu9jL,Helena Children's,12338,Helena,15,Children's
2,eBf4XIx,PINKO Contemporary Apparel,21894,PINKO,14,Contemporary Apparel
3,lsrMEEm,NIC+ZOE Women's Apparel,13837,NIC+ZOE,11,Women's Apparel
4,QbgoudT,AG Men's,10122,AG,44,Men's


In [10]:
# Load the 3-year transaction history, keep cmd_id plus the designer and
# division columns, and add the "<designer> <division>" category label.
cols = ['designer', 'division']
trans_df = concat_columns(
    retrieve_trans(
        read_df_from_s3_parquet('ds/prod/ClientConnect/CMD/data/nm/trans_3y_history.csv.gz', 'nmg-analytics-ds-prod'),
        ['designer_id', 'designer', 'division_id', 'division'],
    ),
    cols,
    'category',
    sep=" ",
)
trans_df.head()

Unnamed: 0,cmd_id,category,designer_id,designer,division_id,division
0,17EtLPS8,Parker Contemporary Apparel,14069,Parker,14,Contemporary Apparel
1,uaAJGq0,Johnny Was Women's Apparel,12751,Johnny Was,11,Women's Apparel
2,qVD9A2t,Lafayette 148 New York Women's Apparel,12797,Lafayette 148 New York,11,Women's Apparel
3,ryYPgRF,Jay Godfrey Women's Apparel,12645,Jay Godfrey,11,Women's Apparel
4,ytUWIo1,Bobbi Brown Beauty,10781,Bobbi Brown,53,Beauty


In [11]:
# Score each customer/designer/division combination (1-5); the result overwrites trans_df.
trans_df = agg_category(browse_df, trans_df,'category')
trans_df.head()

Unnamed: 0,cmd_id,category,designer_id,designer,division_id,division,score
0,002vWGm,Bottega Veneta Designer Handbags,10811,Bottega Veneta,35,Designer Handbags,4
1,002vWGm,Chanel Jewelry,11078,Chanel,56,Jewelry,1
2,002vWGm,Gianvito Rossi Ladies Shoes,12104,Gianvito Rossi,34,Ladies Shoes,1
3,002vWGm,Moussy Vintage Contemporary Apparel,23717,Moussy Vintage,14,Contemporary Apparel,1
4,002wpFg,A.L.C. Contemporary Apparel,10047,A.L.C.,14,Contemporary Apparel,5


In [12]:
# Upload the scored frame as gzip CSV; the helper appends '.gz' to the file name.
dir_name = 'ds/prod/ClientConnect/CMD/data/nm/'
save_df_to_s3_gz(trans_df, dir_name, 'cust_designer_division_rank.csv', bucket_nameX='nmg-analytics-ds-prod')


True

## 7.Designer_Division_Class

In [9]:
# Designer x division x class browse rows: drop incomplete rows, build the
# combined `category` label ("<designer> <division> <class>"), then cast
# both id columns to int.
browse_df = concat_columns(
    query_browse('designer_id,designer, division_id, division_name as division, class_id, class_name as class').dropna(axis=0, how='any'),
    ['designer', 'division', 'class'],
    'category',
    sep=" ",
).astype({'division_id': 'int', 'class_id': 'int'})
browse_df.head()

Unnamed: 0,cmd_id,category,designer_id,designer,division_id,division,class_id,class
0,18M1XY0T,Eileen Fisher Women's Apparel Jackets,11658,Eileen Fisher,11,Women's Apparel,3,Jackets
1,iFUyae3,Christian Louboutin Ladies Shoes Sandals,11135,Christian Louboutin,34,Ladies Shoes,54,Sandals
3,jUj2I3B,Prada Ladies Shoes Sandals,12136,Prada,34,Ladies Shoes,54,Sandals
4,kLwm15V,Talbot Runhof Fine Apparel Dress,15074,Talbot Runhof,81,Fine Apparel,124,Dress
5,wnf9BZ3,W. Kleinberg Men's Belts,15580,W. Kleinberg,44,Men's,877,Belts


In [10]:
# Load the 3-year transaction history and keep cmd_id plus the designer,
# division and class columns.
trans_df = retrieve_trans(
    read_df_from_s3_parquet('ds/prod/ClientConnect/CMD/data/nm/trans_3y_history.csv.gz', 'nmg-analytics-ds-prod'),
    ['designer_id', 'designer', 'division_id', 'division', 'class_id', 'class'],
)
trans_df.head()

Unnamed: 0,cmd_id,designer_id,designer,division_id,division,class_id,class
0,17EtLPS8,14069,Parker,14,Contemporary Apparel,124,Dress
1,uaAJGq0,12751,Johnny Was,11,Women's Apparel,1,Shirts/Tops
2,qVD9A2t,12797,Lafayette 148 New York,11,Women's Apparel,4,Pants
3,ryYPgRF,12645,Jay Godfrey,11,Women's Apparel,139,Gowns
4,ytUWIo1,10781,Bobbi Brown,53,Beauty,317,Makeup


In [11]:
# Build the same "<designer> <division> <class>" category label on the
# transaction side so both frames share identical columns, then score each
# combination (1-5).
cols = ['designer','division','class']
trans_df = concat_columns(trans_df, cols , 'category', sep=" ")
trans_df = agg_category(browse_df, trans_df,'category')
trans_df.head()

Unnamed: 0,cmd_id,category,designer_id,designer,division_id,division,class_id,class,score
0,002vWGm,Bottega Veneta Designer Handbags Top Handle,10811,Bottega Veneta,35,Designer Handbags,29,Top Handle,3
1,002vWGm,Chanel Jewelry Rings,11078,Chanel,56,Jewelry,40,Rings,1
2,002vWGm,Gianvito Rossi Ladies Shoes Sandals,12104,Gianvito Rossi,34,Ladies Shoes,54,Sandals,1
3,002vWGm,Moussy Vintage Contemporary Apparel Denim,23717,Moussy Vintage,14,Contemporary Apparel,6,Denim,1
4,002wpFg,A.L.C. Contemporary Apparel Dress,10047,A.L.C.,14,Contemporary Apparel,124,Dress,5


In [12]:
# Upload the scored frame as gzip CSV; the helper appends '.gz' to the file name.
dir_name = 'ds/prod/ClientConnect/CMD/data/nm/'
save_df_to_s3_gz(trans_df, dir_name, 'cust_designer_division_class_rank.csv', bucket_nameX='nmg-analytics-ds-prod')


True

## Business rule

### Load transaction data

**Notice**: The following cell is the current logic by NM that will be used later

#### Creating customer profile df with browse and transaction data

In [None]:
# Merge profiles: outer-join browse and transaction rows per customer/division.
# NOTE(review): `browse_data` is not assigned anywhere in the visible notebook
# (the cells above use `browse_df`), and this cell has no execution count -
# it will raise NameError if run as-is.
cust_profile = pd.merge(browse_data, trans_df,  how='outer', on=['cmd_id','division'])
# Count missing values per column after the outer join.
cust_profile.isnull().sum()

In [None]:
# Drop the two input names now that cust_profile holds the merged data.
del browse_data, trans_df

In [None]:
# Replace every missing value left by the outer join with 0, in place.
cust_profile.fillna(value=0, axis=1, inplace = True)
cust_profile.head()

**Notice**: the customer brand taste profile (implicit ranking) is created with the following logic:   
total_transaction * 0.8 + total_browsed * 0.2   

* Is there any information on how long the customer stayed on a particular page? 
* Historical transactions in which the customer bought similar products (collection). 
* Was the product purchased with or without a promotion? Especially for the last two transactions: a purchase with a promotion may indicate the customer buys on price, while one without may indicate the customer buys out of need.
* Check the purchase pattern by season, with or without promotion, and repeat versus one-time purchases in the last 2 years: seasonality may be a factor, e.g. buying a Christmas gift for someone.


In [None]:
# Multiply Total_transactions and Times_Browse by their weights (0.8 and 0.2)
# to build the customer category taste profile. agg_category() above applies
# the same weights.
cust_profile['Transactions_W'] = cust_profile['Total_transactions'].apply(lambda x: x*0.8)
cust_profile['Browsed_W'] = cust_profile['Times_Browse'].apply(lambda x: x*0.2)

# Sum the two weighted columns to obtain the unscaled implicit rank.
cust_profile['rank_implicit'] = cust_profile['Transactions_W'] + cust_profile['Browsed_W']
cust_profile = cust_profile.dropna(subset=['rank_implicit'])# Drop rows whose rank is NaN
# Force a numeric dtype; any value that cannot be parsed becomes NaN.
cust_profile['rank_implicit'] = pd.to_numeric(cust_profile['rank_implicit'], errors='coerce')
cust_profile.head(10)

In [None]:

# Bucketize 'rank_implicit' into 1-5 using its quantiles:
#   < q25 -> 1, [q25, q50) -> 2, [q50, q75) -> 3, [q75, q85) -> 4, otherwise 5.
cust_profile['rank_bckt'] = np.where(cust_profile['rank_implicit'] < cust_profile['rank_implicit'].quantile(0.25),1,
	np.where((cust_profile['rank_implicit'] >= cust_profile['rank_implicit'].quantile(0.25)) & (cust_profile['rank_implicit'] < cust_profile['rank_implicit'].quantile(0.5)),2,
	np.where((cust_profile['rank_implicit'] >= cust_profile['rank_implicit'].quantile(0.5)) & (cust_profile['rank_implicit'] < cust_profile['rank_implicit'].quantile(0.75)),3,
	np.where((cust_profile['rank_implicit'] >= cust_profile['rank_implicit'].quantile(0.75)) & (cust_profile['rank_implicit'] < cust_profile['rank_implicit'].quantile(0.85)),4,5))))
cust_profile.head()

In [None]:
# Number of rows that fall into each rank bucket.
cust_profile.rank_bckt.value_counts()

In [None]:
# Distinct bucket values that actually occur.
cust_profile.rank_bckt.unique()

In [None]:
# Remove the count, weighted and unscaled-rank columns; rank_bckt is kept.
cust_profile.drop(['Times_Browse','Total_transactions','Transactions_W','Browsed_W','rank_implicit'], axis=1, inplace= True)

In [None]:
# Keep at most 15 rows per customer: sort by cmd_id and rank_bckt, both
# descending, then take the first 15 rows of each cmd_id group (i.e. the rows
# with the highest rank_bckt).
cust_profile = cust_profile.sort_values(['cmd_id','rank_bckt'], ascending=False)
cust_profile = cust_profile.groupby(['cmd_id']).head(15)
cust_profile.head()

In [None]:
# Number of distinct values in each column.
cust_profile.nunique()

In [None]:
# (rows, columns) of the trimmed profile.
cust_profile.shape

In [None]:
# Loading libraries for S3 bucket connection
import boto3
import io
from io import StringIO
client = boto3.client('s3') 
resource = boto3.resource('s3') 

In [None]:
# Save results to S3 as an uncompressed CSV, without the index.
# The key resolves to ds/prod/ClientConnect/CMD/data/nm/cust_division_rank.csv.
env='prod'
brand='nm'
# Serialise into an in-memory text buffer and upload its contents in one put.
csv_buffer = StringIO()
cust_profile.to_csv(csv_buffer,index=False)
resource.Object('nmg-analytics-ds-prod', 'ds/{0}/ClientConnect/CMD/data/{1}/cust_division_rank.csv'.format(env,brand)).put(Body=csv_buffer.getvalue())

## Retrieve  new rating feature:   
based on [paper](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.709.825&rep=rep1&type=pdf)    
**credit:** Xoriant team

## To-do: convert web_item_id --> Designer

In [None]:
# Read the customer rating CSV from S3 straight from the response body.
obj1 = client.get_object(Bucket='nmg-analytics-ds-prod', Key='ds/prod/ClientConnect/CMD/data/nm/cust_rating.csv')
df = pd.read_csv(obj1['Body'])

df.head()

In [None]:
# (rows, columns) of the rating data.
df.shape

In [None]:
# Attach the rating rows to each profile row; the left join keeps every
# cust_profile row even when the customer has no rating.
cust_profile = cust_profile.merge(df, left_on='cmd_id', right_on= 'curr_customer_id' ,how= 'left')
# Treat a missing rating as 0 while adding. A plain `+` yields NaN for
# customers without a rating, and the fillna(0) below would then turn their
# combined rating into 0, discarding their rank_bckt.
cust_profile['combined_rating'] = cust_profile['rank_bckt'].add(cust_profile['rating'], fill_value=0)
# Remove the rating-side columns that are not needed downstream.
cust_profile.drop(['curr_customer_id','action_type','web_item_id','day','number_of_events','sum_of_events','Pu'],axis = 1, inplace = True)
# Remaining gaps (e.g. `rating` for unmatched customers) become 0.
cust_profile.fillna(0, inplace= True)

In [None]:
# Preview the profile with the combined rating attached.
cust_profile.head()


In [None]:
# The raw rating is no longer needed once combined_rating has been computed.
cust_profile.drop('rating', axis=1, inplace = True)
cust_profile.head()  

In [None]:
# Save results to S3 as an uncompressed CSV, without the index.
# The key resolves to ds/prod/ClientConnect/CMD/data/nm/cust_division.csv.
env='prod'
brand='nm'
# Serialise into an in-memory text buffer and upload its contents in one put.
csv_buffer = StringIO()
cust_profile.to_csv(csv_buffer,index=False)
resource.Object('nmg-analytics-ds-prod', 'ds/{0}/ClientConnect/CMD/data/{1}/cust_division.csv'.format(env,brand)).put(Body=csv_buffer.getvalue())