In [1]:
import sys
import pandas as pd
import numpy as np
import plotly.express as px
import xgboost as xgb

from sklearn import preprocessing
from sklearn.model_selection import train_test_split



In [262]:
# Candidate data directories (machine-specific absolute paths). Only
# `shared_drive` is actually read from in this cell.
shared_drive = r"G:\.shortcut-targets-by-id\184tVjsIO-GAjbkSakwDbEZ40M5mPpgu4\Capstone\cleaned_data"
google_drive = r"G:\My Drive\Spring_2022\CS554\Project\data"
drive = r"D:\Users\yiboz\Programming\Github\CS554\data"

# Identifier columns are forced to `str` so pandas does not infer a numeric
# dtype for them.
articles = pd.read_csv(rf"{shared_drive}\articles_clean.csv", dtype={'article_id': str})
customers = pd.read_csv(rf"{shared_drive}\customers_clean.csv", dtype={'customer_id': str})
transactions = pd.read_csv(
    rf"{shared_drive}\transactions_train.csv",
    dtype={'article_id': str, 'customer_id': str},
)
sample = pd.read_csv(rf"{shared_drive}\sample_submission.csv")

In [252]:
# Preview the first rows of the sample submission frame loaded from sample_submission.csv.
sample.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...


In [256]:
# Show the full prediction string of the first row: a space-separated list of article ids.
sample.loc[0, 'prediction']

'0706016001 0706016002 0372860001 0610776002 0759871002 0464297007 0372860002 0610776001 0399223001 0706016003 0720125001 0156231001'

NOTE: Revisit how the movie-recommendation rating approach worked. We might need 100k+ columns for 1.3 million customers (possibly 10 billion+ rows), with a regression-predicted rating on each. Definitely need PySpark. Use the ratings to rank each customer's top 12 preferences.


Questions: 
    
    1. How to use a list of article_ids as outputs? 
    2. How to get a list of predictions of article_ids? 
    
    3. How are we supposed to use the transaction date values? Do we want to use it? 
    4. What are the significant columns in the articles table we wanted to use? Need to use Chi-square to test. 
    5. Articles have a high cardinality and I need to reduce their dimension? What is the best way to encode the article categories? 
        - https://pbpython.com/categorical-encoding.html 
    6. What is the type of regressor I want to use for my ranking data? 


The Plan: Need to use regression on ratings of all articles? (just use articles from Transactions)  
    
    Ratings: Get value_counts of each article_id bought and normalize with min-max scaling, (frequency - Min_frequency) / (Max_frequency - Min_frequency) (needs a pivot table). Answers questions 1 and 2. 
    
    Encoding: Categorical vs Binary due to High Cardinality of article_id 
    
    Buckets: Group customers into buckets of close age brackets (possibly based on the distribution) and other characteristics 
    
    Reduction of Articles: drop the bottom percent of least popular articles
    
    Reduction of Customers: use only a small percentage of customers to train/test the data, then get recommendations on everyone else 
    
    Regressor: Try 'reg:squarederror', need to look into rank:pairwise and rank:map
    
    Evaluation: default set by objective function, ex. 'rmse'. 

TODO: figure out the most significant articles columns, all categorical so use Chi-squared test 

X-values (did we ever figure out the most significant columns for all of the articles?): ['customer_id', 'age', 'FN', 'Active']

    Need to separate customer and article characteristics for a hybrid approach, and possibly combine them afterwards 

Y-values: ratings on all articles 

Observations: 

    1. The data is horribly imbalanced, with thousands of users having only purchased one item 

In [3]:
# Find the articles column with the fewest distinct values
# (the index group turns out to be the smallest, with 5 categories).
cardinality = {name: articles[name].unique().size for name in articles.columns}
# `min` returns the first column reaching the minimum, like the strict `>` scan it replaces.
col = min(cardinality, key=cardinality.get, default='')
mini = cardinality.get(col, sys.maxsize)
print("col: "+col+" size: "+ str(mini))


col: index_group_no size: 5


In [264]:
# Customer columns used as model features (passed to data_prep as `atts`).
customer_attributes = ['age', 'FN', 'Active']
# Article columns of interest; not referenced by any other cell visible in this notebook.
article_attributes = ['price', 'index_group_name']
# Name of the regression target column (the column created by norm_ratings).
target = "rating"

In [265]:
# Work on a 1% random sample of the transactions to keep the experiment small.
# The seed is fixed (same value as the train/test split and the model) so that
# re-running the notebook draws the same rows and reproduces the results.
df = transactions.sample(frac=0.01, random_state=42)
df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
14603869,2019-07-27,0e5b3379c1c3fc68cd905070104f6aab53c493299e893a...,531615001,0.011847,2
4901204,2019-01-12,74bd41da228573d992e2e3cf26814e7817667b4d09d3cd...,704629002,0.047441,2
8411446,2019-04-06,b1eda66468fb21b6f29022c341a15c1c2dfc080ba0aae3...,732429001,0.094898,2
31739575,2020-09-21,8156c342e1c0c61eb1e603a74fb7788ef32204538c0ae4...,874754003,0.033881,2
24837418,2020-04-20,0e8e816a0c019b94a174776e7d6d30520d7a140b2204d8...,837306003,0.033881,2


In [266]:
# returns the number of features necessary for binary encoding
def binary_features(n):
    """Return how many binary digits are needed to write the codes 1..n.

    Mathematically this is ceil(log2(n + 1)). It is computed with exact
    integer arithmetic because the floating-point quotient log(n+1)/log(2)
    can land slightly above an integer when n + 1 is a power of two, which
    makes ``ceil`` report one digit too many.

    Parameters
    ----------
    n : int or float scalar
        Number of distinct categories (non-negative). A fractional value is
        rounded up first, which gives the same result as the log formula.

    Returns
    -------
    numpy.float64
        The digit count, kept as a float for compatibility with callers that
        wrap the result in ``int(...)``.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    # bit_length() of an int is exactly ceil(log2(value + 1)).
    return np.float64(int(np.ceil(n)).bit_length())

def purchase_count(df, customer="customer_id", projectPath=False): # from Xingeng
    """Count how many times each customer bought each article.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions containing the ``customer`` column and ``"article_id"``.
        It is not modified.
    customer : str
        Name of the customer identifier column.
    projectPath : str or falsy
        Directory in which to save ``customer_article_count.csv``. Any falsy
        value (``False``, ``None``, ``""``) skips saving.

    Returns
    -------
    pandas.DataFrame
        One row per (customer, article) pair with columns
        ``[customer, "article_id", "count"]``.
    """
    pairs = df[[customer, "article_id"]].copy()
    pairs["count"] = 1  # one unit per transaction row, tallied by the group-by below
    countGroup = pairs.groupby([customer, "article_id"]).count()

    # Truthiness test instead of `!= False`, so None / "" are not mistaken for a path.
    if projectPath:
        countGroup.to_csv(f"{projectPath}/customer_article_count.csv") # Save counts to csv file
    return countGroup.reset_index()

def pivot_norm_purchase_count(df, customer="customer_id"): # from Xingeng
    """Pivot ratings into a customer x article matrix and min-max scale each article column.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with the ``customer`` column, ``'article_id'`` and ``'rating'``.
    customer : str
        Name of the customer identifier column used as the pivot index.

    Returns
    -------
    pandas.DataFrame
        Wide frame with the customer column followed by one column per
        article holding ``(value - column min) / (column max - column min)``.
    """
    wide = pd.pivot_table(df, values='rating', index=customer, columns='article_id')
    column_min = wide.min()
    column_span = wide.max() - column_min
    scaled = (wide - column_min) / column_span
    return scaled.reset_index()

# rating of an article for a customer = (that customer's count for the article) / (that customer's largest count)
def norm_ratings(df):
    """Convert per-pair purchase counts into ratings relative to each customer's maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``'customer_id'`` and ``'count'`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The input rows with ``'count'`` removed and a ``'rating'`` column appended.
    """
    per_customer_peak = (
        df.groupby('customer_id')['count']
          .max()
          .rename("temp_max")
    )
    # Left merge attaches each customer's peak count to every one of their rows.
    rated = df.merge(per_customer_peak, how='left', on='customer_id')
    rated['rating'] = rated['count'].div(rated['temp_max'])
    return rated.drop(columns=['count', 'temp_max'])

def mapper(df, col, to_bin=False):
    """Assign each distinct value of ``df[col]`` a code and add it as ``col + '_mapped'``.

    Codes are handed out in order of first appearance, starting at 1. With
    ``to_bin=True`` each code is a zero-padded binary string whose width comes
    from ``binary_features``; otherwise it is the plain integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is modified in place: the mapped column
        is added to it.
    col : str
        Name of the column to encode.
    to_bin : bool
        Whether to emit binary-string codes instead of integers.

    Returns
    -------
    (pandas.DataFrame, dict)
        The same ``df`` object, and a decoder mapping each code back to its
        original value. The decoder holds only real codes (no placeholder
        keys) and is populated in both modes.
    """
    values = df[col].unique()  # computed once; order of first appearance
    mp = {}  # original value -> code
    de = {}  # code -> original value

    if to_bin:
        width = str(int(binary_features(values.size)))
        fmt = "{:0>" + width + "b}"

    for code, value in enumerate(values, start=1):
        key = fmt.format(code) if to_bin else code
        mp[value] = key
        de[key] = value

    df[col + '_mapped'] = df[col].apply(lambda v: mp[v])
    return df, de


# creating binary encoding for a column
def binary_encoder(df, col):
    """Binary-encode ``df[col]`` into one 0/1 column per binary digit.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is not modified (a copy of the column is encoded).
    col : str
        Name of the column to encode.

    Returns
    -------
    (pandas.DataFrame, dict)
        A frame holding the original column plus ``col_b_1 .. col_b_k`` digit
        columns (most significant digit first), and the decoder returned by
        ``mapper`` for translating codes back to original values.
    """
    encoding = df[col].to_frame().copy()
    mapped, de = mapper(encoding, col, to_bin=True)

    bit_strings = mapped.pop(col + '_mapped')
    # Build the digit frame on the SAME index as the input so the join below
    # lines rows up even when the caller's index is not 0..n-1.
    binned = (
        pd.DataFrame([list(bits) for bits in bit_strings], index=mapped.index)
        .apply(pd.to_numeric)
        .rename(columns=lambda pos: col + '_b_{}'.format(pos + 1))
    )
    return mapped.join(binned), de
    

    
def data_prep(df, att_df, atts, att_id, cats, goal):
    """Turn raw transactions into a feature matrix, a target frame and a decoder.

    Steps: count purchases per (customer, article), convert the counts to
    ratings, attach the selected attribute columns via ``att_id``, then
    binary-encode the ``cats`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions, passed to ``purchase_count``.
    att_df : pandas.DataFrame
        Attribute table containing ``att_id`` and the ``atts`` columns.
    atts : list of str
        Attribute columns to keep as features.
    att_id : str
        Key column used to join attributes onto the ratings; dropped afterwards.
    cats : str
        Column to binary-encode (replaced by its digit columns).
    goal : str
        Target column name.

    Returns
    -------
    (X_all, y_all, de)
        Feature frame, single-column target frame, and the decoder from ``binary_encoder``.
    """
    ratings = norm_ratings(purchase_count(df))
    attributes = att_df[[att_id] + atts].copy().set_index(att_id)
    merged_df = ratings.merge(attributes, how='left', on=att_id).drop(columns=att_id)

    encoded, de = binary_encoder(merged_df[cats].to_frame().copy(), cats)
    digit_columns = encoded.drop(columns=cats)
    merged_df = merged_df.join(digit_columns)

    X_all = merged_df.drop(columns=[goal, cats])
    y_all = merged_df[goal].to_frame()
    return X_all, y_all, de

def test_pred_rms(x, y): 
    pred = pd.DataFrame(x, columns=['pred'])
    
    y = y.reset_index()
    del y['index']
    
    result = pd.concat([pred, y], ignore_index=True, axis=1, join='inner')
    
    f = lambda x: 'p_{}'.format(x + 1)
    result = result.rename(columns=f)
    result['diff'] = result['p_1'] - result['p_2']
    result['square'] = result['diff'] ** 2 

    return np.sqrt(result['square'].mean()) 
    
    

In [268]:
# Build the feature frame, the rating target and the article_id decoder from the
# sampled transactions, joining customer attributes on "customer_id".
X_all, y_all, de = data_prep(df, customers, customer_attributes, "customer_id", 'article_id', target)
X_all.head()

Unnamed: 0,age,FN,Active,article_id_b_1,article_id_b_2,article_id_b_3,article_id_b_4,article_id_b_5,article_id_b_6,article_id_b_7,article_id_b_8,article_id_b_9,article_id_b_10,article_id_b_11,article_id_b_12,article_id_b_13,article_id_b_14,article_id_b_15,article_id_b_16
0,24.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,52.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,32.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
3,56.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,56.0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


In [269]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.33, random_state=42)

In [270]:
# XGBoost regressor with a squared-error objective and 100 estimators.
# NOTE(review): n_jobs=13 looks machine-specific — confirm it suits the host running this.
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_jobs=13,random_state=42,n_estimators=100)

In [271]:
# Fit the regressor on the training split.
# NOTE(review): no eval_set is passed here, so `verbose` and `eval_metric` presumably
# have nothing to report on; newer xgboost releases also expect eval_metric on the
# estimator rather than in fit() — confirm against the installed version.
xgb_model.fit(X_train, y_train, 
              verbose=True, 
              eval_metric = 'rmse')

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=13,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [272]:
# Predict ratings for the held-out rows.
pred1 = xgb_model.predict(X_test)

In [273]:
# Root-mean-square error between the predictions and the held-out ratings.
test_pred_rms(pred1, y_test)

0.024915172550973627

NEXT STEP: 
1. Process all users for their ratings on all articles of clothing 
2. Get articles with the top 12 ratings, decode the article info 
3. Create a submission, concat the articles into a string 



In [11]:
# a: number of non-null article_id entries in the articles table.
a = articles['article_id'].count()
# b: number of distinct article_ids that appear in the transactions.
b = transactions['article_id'].unique().size
# Difference between the two. NOTE(review): this equals the number of never-purchased
# articles only if article_id is unique in `articles` — TODO confirm.
a - b

In [14]:
# Number of transaction rows per customer, most active customers first.
transactions['customer_id'].value_counts()

be1981ab818cf4ef6765b2ecaea7a2cbf14ccd6e8a7ee985513d9e8e53c6d91b    1895
b4db5e5259234574edfff958e170fe3a5e13b6f146752ca066abca3c156acc71    1441
49beaacac0c7801c2ce2d189efe525fe80b5d37e46ed05b50a4cd88e34d0748f    1364
a65f77281a528bf5c1e9f270141d601d116e1df33bf9df512f495ee06647a9cc    1361
cd04ec2726dd58a8c753e0d6423e57716fd9ebcf2f14ed6012e7e5bea016b4d6    1237
                                                                    ... 
63b70b71291668f0a63ade8e321fb3eccb80eba164f2087dad471de065f18e1f       1
950b172c36d169bf427545991fe66371f21a085799b44780fdcb2da6a3091613       1
7c284f13f4af9d6a53f97279381638ed0cb7afaa4fd4f3eaadc21993ea45fc69       1
62d49d0ae11a4f65fa31e354cb87f6b557ebec648e0e5e71435d2dd190d1ccc4       1
268eaa31a07d6f2f4f060bfcf32a660f3ea3dbb21ef14cd09fc2545f4e0b5c10       1
Name: customer_id, Length: 1362281, dtype: int64

In [15]:
# Customer id with the most transaction rows (1895) in the per-customer value_counts listing.
big = 'be1981ab818cf4ef6765b2ecaea7a2cbf14ccd6e8a7ee985513d9e8e53c6d91b'

In [25]:
# Number of transaction rows (non-null article_id) belonging to the `big` customer.
u = transactions.loc[transactions['customer_id'] == big, 'article_id'].count()
u

In [21]:
# How many times the `big` customer bought each article, most frequent first.
transactions.loc[transactions['customer_id'] == big, 'article_id'].value_counts()

253448001    8
826500008    7
668956001    7
828934001    6
777099001    6
            ..
562245089    1
680262004    1
807064003    1
798579002    1
879291001    1
Name: article_id, Length: 1346, dtype: int64

In [8]:
# Purchase frequency of each article within the sampled transactions.
df['article_id'].value_counts()

706016001    480
706016002    404
372860001    321
610776002    286
372860002    255
            ... 
614423001      1
613666002      1
776087001      1
636868001      1
730683013      1
Name: article_id, Length: 53537, dtype: int64

In [10]:
# Number of sampled transaction rows per customer.
df['customer_id'].value_counts()

be1981ab818cf4ef6765b2ecaea7a2cbf14ccd6e8a7ee985513d9e8e53c6d91b    23
67931690bdf18d2e328854ae772cd5ce2505fdc11164693998b13e706db0bb56    21
a65f77281a528bf5c1e9f270141d601d116e1df33bf9df512f495ee06647a9cc    18
a76cf5ea515d09f22b7fe3e8ea3c1944316bd6264a90e26cef126242ef3c5e11    17
03d0011487606c37c1b1ed147fc72f285a50c05f00b9712e0fc3da400c864296    17
                                                                    ..
328476e33eb9a28207809832497cef3e7ea0e0d5b5f7f302c51fd24d63ae6653     1
6b6c4268ceba96e1858768dc56acb4b44c0be24b96aa50df0d90785629301374     1
f9a9fd743222c81e259912bc45900dff506f18168378f1bf80f9ffc4fe27ba4a     1
019841f5cdfcb8cffa470612d2ceb690d2b5374e728657edb8e437efcd43a000     1
7cd8624699da578691cd501bf0e9d04ed67b50f18cbbf2207722678c180f1e33     1
Name: customer_id, Length: 229955, dtype: int64