In [1]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
pip install lightgbm

In [None]:
pip install catboost

In [14]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report

import xgboost as xgb

import catboost as cat



In [3]:
df_order_jan = pd.read_csv("orders_before_jan_labeled.csv")
df_order_jan['label'].value_counts()

0    882569
4      8883
3      6846
2      4522
1      3061
Name: label, dtype: int64

In [4]:
# unbalanced data remove extra 0s 
df_10000_0 = df_order_jan.loc[df_order_jan['label']==0].sample(n=10000, axis=0, random_state=1)
df_balanced_jan = df_10000_0.append(df_order_jan.loc[df_order_jan['label']!=0])
df_balanced_jan['label'].value_counts()

0    10000
4     8883
3     6846
2     4522
1     3061
Name: label, dtype: int64

In [5]:
# load other dataset
df_cat_hierarchy = pd.read_csv('category_hierarchy.csv', delimiter='|',error_bad_lines=False)
df_items = pd.read_csv('items.csv',sep='|',error_bad_lines=False)
# fill NA values in category column -- 4300 is the category with null values
df_items["categories"] = df_items["categories"].fillna('[4300]')
df_items

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]"
1,28640,1366,10,1,537,0,101,[4300]
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3..."
3,21399,1090,10,1,511,0,0,[3270]
4,8504,768,4,1,484,0,66,[2470]
...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"[3502, 1088, 2004, 691, 3422, 2308, 308, 3688,..."
32772,22709,567,4,2,491,3,66,"[3900, 3905, 3903, 3910, 1807, 274, 1467, 914]"
32773,32322,1385,10,1,500,0,37,"[2686, 2609, 725]"
32774,19118,1113,6,2,491,0,117,"[3900, 3910, 3903, 2473, 914, 274]"


In [6]:
# transform categories from str to int list
df_items['categories'] = df_items['categories'].map(lambda x: list(map(int,x.split("[")[1].split("]")[0].split(","))))

In [7]:
# from categories list generate parent_categories
df_indexed = df_cat_hierarchy.set_index(['category'])
df_indexed

Unnamed: 0_level_0,parent_category
category,Unnamed: 1_level_1
0,75
1,1499
2,1082
3,3498
4,1623
...,...
4295,3898
4296,3898
4297,3898
4298,3898


In [8]:
df_indexed.loc[[2890, 855, 3908, 3909]]['parent_category']

category
2890    2832
855     1178
3908    3898
3909    3898
Name: parent_category, dtype: int64

In [9]:
# set parent of null as null
df_indexed.loc[4300] = [4300]
df_indexed

Unnamed: 0_level_0,parent_category
category,Unnamed: 1_level_1
0,75
1,1499
2,1082
3,3498
4,1623
...,...
4296,3898
4297,3898
4298,3898
4299,3898


In [10]:
# parent_categories generate
df_items['parent_categories'] = df_items['categories'].map(lambda x : df_indexed.loc[x]['parent_category'].tolist())

In [11]:
df_items

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,parent_categories
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]","[2832, 1178, 3898, 3898]"
1,28640,1366,10,1,537,0,101,[4300],[4300]
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ..."
3,21399,1090,10,1,511,0,0,[3270],[1420]
4,8504,768,4,1,484,0,66,[2470],[2566]
...,...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"[3502, 1088, 2004, 691, 3422, 2308, 308, 3688,...","[3860, 458, 458, 458, 458, 322, 600, 600, 600,..."
32772,22709,567,4,2,491,3,66,"[3900, 3905, 3903, 3910, 1807, 274, 1467, 914]","[3898, 3898, 3898, 3898, 1072, 2920, 2475, 3565]"
32773,32322,1385,10,1,500,0,37,"[2686, 2609, 725]","[2917, 1175, 725, 1735]"
32774,19118,1113,6,2,491,0,117,"[3900, 3910, 3903, 2473, 914, 274]","[3898, 3898, 3898, 2566, 3565, 2920]"


In [25]:
# Joining df_orders_jan dataframe and df_items dataframe
merge_jan = pd.merge(df_items, df_balanced_jan, how='inner', on='itemID')
#change the date coulumn's datayte to datetime datatype
merge_jan['date'] = pd.to_datetime(merge_jan['date'])
merge_jan = merge_jan.set_index('date')

In [55]:
merge_jan

Unnamed: 0_level_0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,parent_categories,userID,order,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-12-08,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ...",36624,1,0
2020-12-06,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ...",27527,1,0
2020-10-18,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ...",7271,1,1
2020-11-12,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ...",33269,1,2
2020-12-19,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ...",39232,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-06,1128,827,6,0,364,0,132,[440],[3383],7765,1,1
2020-11-25,13822,993,4,0,491,0,144,[248],[307],11606,1,4
2020-08-14,4183,96,4,1,474,0,-1,"[3669, 3867]","[2703, 1543]",41747,1,0
2020-11-18,4183,96,4,1,474,0,-1,"[3669, 3867]","[2703, 1543]",21102,1,1


In [29]:
merge_jan_no_cat = merge_jan.drop(columns=['categories', 'parent_categories'])

In [30]:
y = merge_jan_no_cat['label']
X = merge_jan_no_cat.drop(columns='label')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
X_train

Unnamed: 0_level_0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,userID,order
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-11-23,17749,147,4,0,491,3,66,31095,1
2020-07-11,11250,1383,10,0,502,0,178,44941,1
2020-10-03,4421,1496,4,0,114,3,16,3634,4
2020-12-14,31953,1040,10,0,236,0,84,39144,1
2020-11-23,11118,110,4,1,282,0,16,26235,2
...,...,...,...,...,...,...,...,...,...
2020-07-16,22606,615,10,0,385,0,147,23136,1
2020-08-10,3612,1065,4,0,536,3,144,1642,1
2020-12-28,21625,1111,4,0,487,0,144,29100,2
2020-11-11,10984,174,4,0,452,0,12,2389,2


In [46]:
# upload submission dataset
sub_jan = pd.read_csv("submission_jan.csv", sep="|")
sub_jan

Unnamed: 0,userID,itemID,prediction
0,0,20664,
1,0,28231,
2,13,2690,
3,15,1299,
4,15,20968,
...,...,...,...
9935,46118,20106,
9936,46124,19677,
9937,46125,12878,
9938,46127,7963,


In [32]:
# baseline(didn't consider time series)--xgboost
def xgb_train(X_train, y_train, X_test, y_test, verbose=True):
    model_xgb = xgb.XGBClassifier(
        max_depth=10, # raw8
        n_estimators=1000,
        min_child_weight=300, 
        colsample_bytree=0.8, 
        subsample=0.8, 
        eta=0.3,    
        seed=42        
    )

    model_xgb.fit(
        X_train, 
        y_train,
        eval_metric='auc',
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=verbose,
        early_stopping_rounds=10 
    )
    print(model_xgb.best_score)
    return model_xgb

In [33]:
model_xgb = xgb_train(X_train, y_train, X_test, y_test, verbose=False)

0.6208118915276252


In [53]:
pre_merge = pd.merge(sub_jan.drop(columns='prediction'), X, on=['userID','itemID'], how="inner")
prediction = model_xgb.predict(pre_merge)
sub_xgb = sub_jan.copy()
sub_xgb['prediction'] = pd.Series(prediction)
# user and items haven't shown up predict 0
sub_xgb['prediction'] = sub_xgb['prediction'].fillna(0)
# submission.to_csv('submission_xgb.csv', index=False)
sub_xgb['prediction'].value_counts()

0.0    8394
3.0     848
4.0     698
Name: prediction, dtype: int64

In [54]:
sub_xgb.to_csv('submission_xgb.csv')

In [41]:
# baseline1: fill 1
sub_jan_all1 = sub_jan.copy()
sub_jan_all1['prediction'] = sub_jan['prediction'].fillna(1)
sub_jan_all1.to_csv('submission_all1.csv')


In [74]:
# gold dataset
result_for_jan = pd.merge(sub_jan.drop(columns='prediction'), merge_jan, on=['userID','itemID'], how="inner")
gold = result_for_jan[['userID','itemID','label']]
gold.rename(columns={'label':'prediction'}, inplace=True)

In [75]:
gold

Unnamed: 0,userID,itemID,prediction
0,0,28231,3
1,15,1299,4
2,15,20968,4
3,34,21146,4
4,61,4648,2
...,...,...,...
3785,46096,6923,4
3786,46103,31715,3
3787,46107,11349,0
3788,46124,19677,4


In [76]:
def count_points(pred, gold):
    df = pd.merge(pred, gold, on=['userID', 'itemID'], suffixes=('_pred', '_gold'))
    df['points'] = df.apply(_compute_points_for_row, axis=1)
    return df['points'].sum()

def _compute_points_for_row(row):
    y_pred, y_gold = row.prediction_pred, row.prediction_gold
    if y_pred == y_gold:
        # one point if "no order" (0) is predicted correctly; three points if order week is predicted correctly
        return 1 if y_pred == 0 else 3
    # one point if order is predicted correctly (but not the correct week), otherwise zero points
    return 1 if (y_pred > 0 and y_gold > 0) else 0

In [77]:
# bonus for all 1 model
pred, gold = sub_jan_all1, gold  # TODO: load your prediction and goldstandard

points = count_points(pred, gold)
max_points = count_points(gold, gold)
score = points / max_points
print(points)
print(max_points)
print(score)

4104
25272
0.1623931623931624


In [78]:
# bonus for xgboost model
pred, gold = sub_xgb, gold  # TODO: load your prediction and goldstandard

points = count_points(pred, gold)
max_points = count_points(gold, gold)
score = points / max_points
print(points)
print(max_points)
print(score)

1118
25272
0.044238683127572016


In [68]:
# baseline: random forest
# from sklearn.metrics import accuracy_score

# rf = RandomForestClassifier()
# rf.fit(X_train, y_train)
# rf_pred=rf.predict(X_test)
# rf_acc=rf.score(X_test, y_test)
# rf_acc

0.3505928260543299