In [222]:
import sys
import pandas as pd
import numpy as np
import plotly.express as px
import xgboost as xgb
from datetime import datetime


from sklearn import preprocessing
from sklearn.model_selection import train_test_split



In [198]:
shared_drive = r"G:\.shortcut-targets-by-id\184tVjsIO-GAjbkSakwDbEZ40M5mPpgu4\Capstone\cleaned_data"
google_drive = r"G:\My Drive\Spring_2022\CS554\Project\data"
drive = r"D:\Users\yiboz\Programming\Github\CS554\data"
articles = pd.read_csv(shared_drive+r"\articles_clean.csv", dtype={'article_id':str})
customers = pd.read_csv(shared_drive+r"\customers_clean.csv", dtype={'customer_id':str})
transactions = pd.read_csv(shared_drive+r"\transactions_train.csv", dtype={'article_id':str, 'customer_id':str})
sample = pd.read_csv(shared_drive+r"\sample_submission.csv")

NOTE: Look back again on how the Movie reccomendation rating thing worked, might need 100k+ columns for 1.3mil customers (looking at possibly 10BIL+ rows), regression rating on each. Definetely need pyspark. Using ratings to rank their top 12 preferences 


Questions: 
    
    1. How to use a list of article_ids as outputs? 
    2. How to get a list of predictions of article_ids? 
    
    3. How are we supposed to use the transaction date values? Do we want to use it? 
    4. What are the significant columns in the articles table we wanted to use? Need to use Chi-square to test. 
    5. Articles have a high cardinality and I need to reduce their dimension? What is the best way to encode the article categories? 
        - https://pbpython.com/categorical-encoding.html 
    6. What is the type of regressor I want to use for my ranking data? 


The Plan: Need to use regression on ratings of all articles? (just use articles from Transactions)  
    
    Ratings: Get value_counts of each article_id bought and normalize by taking (Max_frequency - Min_frequency) (need pivot table). Answers questions 1 and 2. 
    
    Encoding: Categorical vs Binary due to High Cardinality of article_id 
    
    Buckets: Group customers into buckets of close age brackets (possibly based on the distribution) and other characteristics 
    
    Reduction of Articles: drop the bottom percent of least popular articles
    
    Reduction of Customers: use only a small percentage of customers to train/test the data, then get recommendations on everyone else 
    
    Regressor: Try 'reg:squarederror', need to look into rank:pairwise and rank:map
    
    Evaluation: default set by objective function, ex. 'rmse'. 

TODO: figure out the most significant articles columns, all categorical so use Chi-squared test 

X-values (did we ever figure out the most significant columns for all of the articles?): ['customer_id', 'age', 'FN', 'Active']

    Need to seperate customer and article characteristics for a hybrid approach, possibly combine afterwards 

Y-values: ratings on all articles 

Observations: 

    1. The data is horribly imbalanced, with thousands of users having only purchased one item 

In [3]:
# index group is the smallest at 5 different categories 
mini = sys.maxsize
col = ''
for i in articles.columns: 
    n = articles[i].unique().size
    if mini > n: 
        mini = n
        col = i
print("col: "+col+" size: "+ str(mini))


col: index_group_no size: 5


# Initialize Constants

In [264]:
customer_attributes = ['age', 'FN', 'Active', 'customer_id']
article_attributes = ['price', 'index_group_name', 'article_id']
target = "rating"
nums = 4

In [4]:
df = transactions.sample(frac=0.01)
df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
27908958,2020-06-24,4155ad2d5445d70b493129ee1501f38445c62301a6c6d5...,857855003,0.025407,1
8763190,2019-04-14,ab4a687ea5bf4d36dadf45c4c7c3ff873fa9856d90ffcb...,748983002,0.059305,2
31409229,2020-09-11,ec0d5fda8911b4e7a7eb7f77dfdd210c0b9299610d3ee9...,850917001,0.025407,2
5223656,2019-01-20,e6927a969e181c6392bf7edf8a5a1d92b766123fa218f3...,690502001,0.152525,2
31638912,2020-09-18,964d40d4da8c6875af2340291e78e6088dd4a9a06ff67e...,240561001,0.010966,1


# Initialize Functions

In [283]:
# returns the number of features necessary for binary encoding 
# Input: cardinality of feature
# Output: float of the number of columns needed to represent it in binary encoding 
def binary_features(n): 
    return np.ceil(np.log(n+1)/np.log(2))

# Get purchase counts of each article for each customer 
# Input: df=base dataframe 
# Output: dataframe with a column for those purchase counts 
def purchase_count(df, customer="customer_id", projectPath=False): # from Xingeng 
    customArticlePair = df[[customer, "article_id"]].copy()
    # customArticlePair.loc[:,"rating"] = pd.Series([1 for x in range(len(customArticlePair))]) # Add a column of purchase count
    customArticlePair["count"] = 1
    # customArticlePair.to_csv("/content/gdrive/MyDrive/Capstone/co_filter/customer_article_pair.csv",index=False)
    countGroup = customArticlePair.groupby([customer, "article_id"]).count() # count purchase numbers
    
    if projectPath != False: 
        countGroup.to_csv(f"{projectPath}/customer_article_count.csv") # Save Counts to csv file
    return countGroup.reset_index()

# get ratings of each article for each customer by taking (article count for customer) / (max count for customer)
# Input: df=dataframe with purchase counts 
# Output: df with 'ratings' column 
def norm_ratings(df): 
    tempmax = df.groupby('customer_id')['count'].max().rename("temp_max") # get max for each customer_id 
    temp = df.merge(tempmax, how='left', on='customer_id') # merge the data together 
    temp['rating'] = temp['count'] / temp['temp_max'] # get true rating 
    return temp.drop(['count', 'temp_max' ], axis=1)

# get ratings on buckets 
def get_bucket_ratings(n, goal, projectPath=False):
    # Merge transactions and customers data
    trans_custs = transactions.merge(customers[customer_attributes].set_index('customer_id'), on='customer_id', how='left')
    trans_custs = trans_custs[['customer_id', 'article_id', 'price', 'age', 'FN', 'Active']]
    trans_custs['count'] = 1
    
    # get buckets and the article counts for the buckets 
    buckets = pd.qcut(trans_custs['age'], n)
    buckets_age = trans_custs.groupby([buckets, "FN", "Active"])['age'].mean().to_frame()
    buckets_age = buckets_age.rename(columns={'age': 'mean_age'})
    
    buckets_count = trans_custs.groupby([buckets, "FN", "Active", 'article_id'])['count'].sum().to_frame()
    merged_buckets = buckets_count.reset_index().merge(buckets_age['mean_age'], on=['age', 'FN', 'Active'], how='left')
    age_intervals = merged_buckets['age'].unique() 
    
    # get article ratings for the buckets 
    tempmax = merged_buckets.groupby(["age", "FN", "Active"])['count'].max().rename("temp_max") # get max for each bucket 
    temp = merged_buckets.merge(tempmax, how='left', on=["age", "FN", "Active"]) # merge the max for each bucket 
    temp[goal] = temp['count'] / temp['temp_max'] # get bucket rating 
    
    ratings = temp.drop(['count', 'temp_max'], axis=1)
    art_prices = transactions[['article_id', 'price']].drop_duplicates(subset='article_id')
    ratings = ratings.merge(art_prices.set_index('article_id'), on='article_id', how='left') # merge price back in 
    
    if projectPath != False: 
        result.to_csv(f"{projectPath}/bucket_ratings.csv") # Save Counts to csv file
    return ratings

# map categorical variables to numeric or binary, returns a mapped df and decoder dictionary 
# Input: df=base dataframe, col=column to create a mapping for, to_bin=map to binary 
# Output: df with a 'col_mapped' column, decoder dictionary for the mapping 
def mapper(df, col, to_bin=False): 
    n = df[col].unique().size
    mp = dict.fromkeys(range(n*2))
    de = dict.fromkeys(range(n*2))
    j = 1 
    
    if to_bin:
        b = str(int(binary_features(n)))
        f = "{:0>"+b+"b}"
    
    for i in df[col].unique(): 
        if to_bin: 
            mp[i] = f.format(j)
            de[f.format(j)] = i
        else: 
            mp[i] = j
        
        j += 1   
    df[col+'_mapped'] = df[col].apply(lambda x: mp[x] )
    return df, de


# creating binary encoding for a column  
# Input: df=base dataframe, col=column to encode into binary 
# Output: dataframe with n columns for binary encoding of a categorical variable, decoder dictionary 
def binary_encoder(df, col): 
    encoding = df[col].to_frame().copy() 
    mapped, de = mapper(encoding, col, to_bin=True)
    
    f = lambda x: col+'_b_{}'.format(x + 1)
    binned = pd.DataFrame(mapped.pop(col+'_mapped').apply(list).values.tolist()).apply(pd.to_numeric).rename(columns=f)
    return mapped.join(binned), de
    

# Raw to train, preps all data for xgboosting 
# Input: df=base dataframe, att_df=datframe with the user or article attributes, atts=list of attributes, 
# att_id=column name of attribute, cats=name of categorical variable to encode, goal=name of target column 
# Output: Dataframe of attributes, dataframe of ratings, decoder dictionary 
def train_data_prep(df, att_df, atts, att_id, cats, goal): 
    purchases = purchase_count(df)
    normed = norm_ratings(purchases)
    cust = att_df[atts].copy().set_index(att_id)
    merged_df = normed.merge(cust, how='left', on=att_id).drop(att_id, axis=1)
    
    encoded, de = binary_encoder(merged_df[cats].to_frame().copy(), cats) 
    
    merged_df = merged_df.join(encoded.drop(cats, axis=1))
    y_all = merged_df[goal].to_frame()
    X_all = merged_df.drop([goal, cats], axis=1)
    
    return X_all, y_all, de
    #return merged_df
    
# Raw to train, preps all data for xgboosting 
# Input: goal=name of target column, cats=list of column string names of categories that need to be encoded 
# Output: Dataframe of attributes, dataframe of ratings, customer to bucket mapper 
def train_buckets_prep(n, goal, cats):  
    merged_df = get_bucket_ratings(n, goal)
    
    # encode given categorical variables using binary encoding 
    des = [] # for encoder debugging purposes 
    for c in cats: 
        encoded, de = binary_encoder(merged_df[c].to_frame().copy(), c) 
        merged_df = merged_df.join(encoded.drop(c, axis=1))
        des.append(de)
    
    merged_df = merged_df.drop('age', axis=1)
    y_all = merged_df[goal].to_frame()
    X_all = merged_df.drop([goal]+cats, axis=1)
    
    return X_all, y_all
    #return X_all, y_all, des 
    #return X_all, y_all, des 
    
# Breaks down intervals into group indexes that all customer_ids map onto 
# Input: df=customers dataframe, n = number of intervals 
# Output: dataframe with mapping from customer_id to group 
def grouper(df, n): 
    i = 0
    cg = []
    df = df.copy()
    df['group'] = 0
    
    intervals = pd.qcut(df['age'], n)
    dfg = df.groupby([intervals, "FN", "Active"])
    
    for x in dfg.groups: 
        gg = dfg.get_group(x)
        
        gg['group'] = i
        cg.append(gg)
        i +=1 
        
    pc = pd.concat(cg)
    
    return pc[['customer_id', 'group']]

# Given prediction test x and test y, get root mean square prediction 
# Input: x=testing attributes, y=testing ratings 
# Output: root mean squared error between x and y 
def test_pred_rms(x, y): 
    pred = pd.DataFrame(x, columns=['pred'])
    
    y = y.reset_index()
    del y['index']
    
    result = pd.concat([pred, y], ignore_index=True, axis=1, join='inner')
    
    f = lambda x: 'p_{}'.format(x + 1)
    result = result.rename(columns=f)
    result['diff'] = result['p_1'] - result['p_2']
    result['square'] = result['diff'] ** 2 

    return np.sqrt(result['square'].mean()) 

# prepares the data for a user ID for prediction in the finished model 
# Input: user=string of customer_id, encoded_articles=pre-built dataframe of all unique articles in the set
# Output: customer attributes with encoded, dataframe of customer and article IDs for decoding 
def test_user_prep(user, encoded_articles):
    cust = encoded_articles.copy()
    cust['customer_id'] = user 
    
    ids = cust[['customer_id', 'article_id']].copy() # create a frame to easily remap customers and articles  
    
    cust = cust.merge(customers[customer_attributes].set_index('customer_id'), on='customer_id', how='left')
    cust = cust[['age', 'FN', 'Active']+[c for c in cust if c not in ['age', 'FN', 'Active']]]
    
    return cust.drop(['customer_id', 'article_id'], axis=1), ids
  

# Turn prediction values into a recommendation for a single user 
# Input: df=predicted rating scores, ids=decoding IDs for customers and articles, n=top number of ratings to get 
# Output: dataframe with recommendation in submission form 
def get_user_rec(df, ids, n):
    
    pred = pd.DataFrame(df, columns=['rating']).copy()
    top12 = pred.sort_values(["rating"], ascending=False).head(n)
    joined = ids.reset_index().merge(top12, left_on='index', right_on=top12.index, how='inner')
    rec = pd.DataFrame(joined.groupby(by='customer_id').apply(agg_articles)).reset_index().rename(columns={0:'prediction'})
    
    return rec 
    
# aggregate top articles into 
# Input: df=dataframe of top articles 
# Output: string of those articles in submission form 
def agg_articles(df): 
    return ' '.join(df['article_id'].tolist())   

# prep all the buckets 

def test_all_prep(n, model):
    
    arts = pd.DataFrame(transactions['article_id'].unique(), columns=['article_id'])
    encoded_articles, dec = binary_encoder(arts, 'article_id')
    
    art_prices = transactions[['article_id', 'price']].drop_duplicates(subset='article_id')
    encoded_articles = encoded_articles.merge(art_prices.set_index('article_id'), on='article_id', how='left') # merge price back in 
    
    i = 0
    customers['group'] = 0
    intervals = pd.qcut(customers['age'], n)
    dfg = customers.groupby([intervals, "FN", "Active"])
    
    recs = []
    for x in dfg.groups:  
        gg = dfg.get_group(x).copy()
        gg['group'] = i
        gg['mean_age'] = gg['age'].mean()
        tb, ids = test_bucket_prep(gg, encoded_articles)
        
        pred = model.predict(tb)
        
        recs.append(get_bucket_rec(pred,ids,12))
        i += 1
        
    return pd.concat(recs)

# prep the articles for each individual bucket
# Input: df=split out bucket, encoded_articles=copy of all binary encoded articles 
def test_bucket_prep(df, encoded_articles):
    cust = encoded_articles.copy()
    
    cust['mean_age'] = df['mean_age'].iloc[0]
    cust['FN'] = df['FN'].iloc[0]
    cust['Active'] = df['Active'].iloc[0]
    cust['group'] = df['group'].iloc[0]

    ids = cust[['group', 'article_id']].copy() # create a frame to easily remap customers and articles  
    
    #cust = cust.merge(customers[customer_attributes].set_index('customer_id'), on='customer_id', how='left')
    cust = cust[['mean_age', 'FN', 'Active', 'price']+[c for c in cust if c not in ['mean_age', 'FN', 'Active', 'price']]]
    
    return cust.drop(['group', 'article_id'], axis=1), ids

# for an individual bucket, get a recommendation 
def get_bucket_rec(df, ids, n): 
    
    pred = pd.DataFrame(df, columns=['rating']).copy()
    top12 = pred.sort_values(["rating"], ascending=False).head(n)
    joined = ids.reset_index().merge(top12, left_on='index', right_on=top12.index, how='inner')
    rec = pd.DataFrame(joined.groupby(by='group').apply(agg_articles)).reset_index().rename(columns={0:'prediction'})
    
    return rec 
    


In [6]:
"""
def run_model(x, y, n): 
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=n)
    xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_jobs=13,random_state=n,n_estimators=100)
    
    xgb_model.fit(X_train, y_train, 
              verbose=True, 
              eval_metric = 'rmse')
              
    
X_all, y_all, de = train_data_prep(df, customers, customer_attributes, "customer_id", 'article_id', "ratings")
X_all.head()
"""    


KeyError: 'ratings'

# Prepare Data for Training

In [265]:
X_all, y_all = train_buckets_prep(nums, target, ['article_id'])
X_all.head()

Unnamed: 0,FN,Active,mean_age,price,article_id_b_1,article_id_b_2,article_id_b_3,article_id_b_4,article_id_b_5,article_id_b_6,...,article_id_b_8,article_id_b_9,article_id_b_10,article_id_b_11,article_id_b_12,article_id_b_13,article_id_b_14,article_id_b_15,article_id_b_16,article_id_b_17
0,0.0,0.0,22.242112,0.008458,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0.0,0.0,22.242112,0.008458,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0.0,0.0,22.242112,0.005068,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,0.0,0.0,22.242112,0.022864,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.0,0.0,22.242112,0.025407,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [266]:
y_all.head()

Unnamed: 0,rating
0,0.115385
1,0.0
2,0.0
3,0.076923
4,0.0


# Train Model and Evaluate

In [272]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.33, random_state=42)

In [273]:
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_jobs=13,random_state=42,n_estimators=100)

In [274]:
xgb_model.fit(X_train, y_train, 
              verbose=True, 
              eval_metric = 'rmse')

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=13,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [275]:
pred1 = xgb_model.predict(X_test)

In [276]:
test_pred_rms(pred1, y_test)

0.01857851364855305

# Prepare Submission


In [277]:
gm = grouper(customers,nums)
gm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gg['group'] = i


Unnamed: 0,customer_id,group
1152,003667a105459b8e9aaa0a9434a2f48e2d9fba489da123...,0
11359,021c897da6d36da705952b4ecc46e641b811e094d67f68...,0
12395,024f985e61d949356e0e6fab6f59a669fc779d9f6c11fe...,0
14681,02ba549fb1b6e18685703a22a819af82abd4a0e2b2dc82...,0
14821,02c17f123da8edcc6af4662151670c874fb991ce76f7a9...,0
...,...,...
1371905,fffbf3d9ceedbdfa3e41b8851b8e3ae21267befac31d00...,15
1371920,fffcb073cfbea83431228195820e21c82b56905b3ee1eb...,15
1371938,fffdaa06e7f3e9698fb1df460b03ca6cc56528b98982c5...,15
1371947,fffe61b99c2d0418ed22190a8490b142247e8897c67941...,15


In [278]:
sam = sample['customer_id'].to_frame().merge(gm.set_index('customer_id'), on='customer_id', how='inner')
sam.head()

Unnamed: 0,customer_id,group
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,9
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,5
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,1
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,13
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,15


In [284]:
recs = test_all_prep(nums, xgb_model)
recs

Unnamed: 0,group,prediction
0,0,0722437003 0741356002 0783707028 0783707004 07...
0,1,0722437003 0741356002 0783707028 0783707004 07...
0,2,0156231001 0111586001 0722437003 0673214001 07...
0,3,0156231001 0111586001 0722437003 0673214001 07...
0,4,0722437003 0741356002 0783707028 0783707004 07...
0,5,0722437003 0741356002 0783707028 0783707004 07...
0,6,0156231001 0111586001 0722437003 0673214001 07...
0,7,0156231001 0111586001 0722437003 0673214001 07...
0,8,0722437003 0741356002 0783707028 0783707004 07...
0,9,0722437003 0741356002 0783707028 0783707004 07...


In [285]:
sub = sam.merge(recs.set_index('group'), how='left', on='group').drop(columns='group', axis=1)
sub

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0722437003 0741356002 0783707028 0783707004 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0722437003 0741356002 0783707028 0783707004 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0722437003 0741356002 0783707028 0783707004 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0722437003 0741356002 0783707028 0783707004 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0156231001 0111586001 0722437003 0673214001 07...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0722437003 0741356002 0783707028 0783707004 07...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0722437003 0741356002 0783707028 0783707004 07...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0156231001 0111586001 0722437003 0673214001 07...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0156231001 0111586001 0722437003 0673214001 07...


In [286]:
path = r"C:\Users\yiboz\Github\Spring 2022\CS554\Project\submissions" 

dt = datetime.now().strftime("%d-%m-%Y_%H")
compression_opts = dict(method='zip', archive_name='submission.csv')  
sub.to_csv(path+r"\submission"+dt+".zip", index=False, compression=compression_opts)  

In [None]:
"""
Grouping buckets testing code 

buckets = pd.qcut(trans_custs['age'], 4)
cust_map = tc.groupby([buckets, "FN", "Active"] )['customer_id'].unique()
cust_frame = pd.DataFrame(cust_map)
pd.DataFrame(cust_map.values.tolist()).transpose()
"""