In [1]:
import pandas as pd
import polars as pl
from lightgbm.sklearn import LGBMRanker


In [3]:
train = pl.read_parquet("../input/otto-train-and-test-data-for-local-validation/test.parquet")
train_labels = pl.read_parquet("../input/otto-train-and-test-data-for-local-validation/test_labels.parquet")
train = train.to_pandas()
train["action_num_reverse_chrono"] = train.groupby("session").cumcount(ascending=False)
session_length = train.groupby("session").size().to_frame().rename(columns={0: "session_length"}).reset_index()
train = train.merge(session_length, on="session")
linear_interpolation = 0.1 + ((1 - 0.1) / (train["session_length"] - 1)) * (train["session_length"] - train["action_num_reverse_chrono"] - 1)
train["log_recency_score"] = 2 ** linear_interpolation - 1
train["log_recency_score"].fillna(1, inplace=True)
type_weights = {0: 1, 1: 6, 2: 3}
train["type_weighted_log_recency_score"] = train["type"].apply(lambda x: type_weights[x]) * train["log_recency_score"]
type2id = {"clicks": 0, "carts": 1, "orders": 2}
train_labels = train_labels.to_pandas()
train_labels = train_labels.explode("ground_truth")
train_labels["aid"] = train_labels["ground_truth"]
train_labels["type"] = train_labels["type"].apply(lambda x: type2id[x])
train_labels = train_labels[["session", "type", "aid"]]
train_labels["gt"] = 1
train = train.merge(train_labels, how="left", on=["session", "type", "aid"])
train["gt"].fillna(0, inplace=True)
train["gt"] = train["gt"].astype(int)
train["aid"] = train["aid"].astype(int)
session_lengths_train = session_length["session_length"].values
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=20,
    importance_type="gain",
)
feature_cols = ["aid", "type", "action_num_reverse_chrono", "session_length", "log_recency_score", "type_weighted_log_recency_score"]
target = "gt"

In [15]:
train.head()

Unnamed: 0,session,aid,ts,type,action_num_reverse_chrono,session_length,log_recency_score,type_weighted_log_recency_score,gt
0,11098528,11830,1661119200,0,0,1,1.0,1.0,0
1,11098529,1105029,1661119200,0,0,1,1.0,1.0,1
2,11098530,264500,1661119200,0,5,6,0.071773,0.071773,0
3,11098530,264500,1661119288,0,4,6,0.214195,0.214195,0
4,11098530,409236,1661119369,0,3,6,0.375542,0.375542,0


In [5]:
ranker = ranker.fit(
    train[feature_cols],
    train[target],
    group=session_lengths_train,
)
# test
test = pl.read_parquet("../input/otto-full-optimized-memory-footprint/test.parquet")
test = test.to_pandas()
test["action_num_reverse_chrono"] = test.groupby("session").cumcount(ascending=False)
session_length = test.groupby("session").size().to_frame().rename(columns={0: "session_length"}).reset_index()
test = test.merge(session_length, on="session")
linear_interpolation = 0.1 + ((1 - 0.1) / (test["session_length"] - 1)) * (test["session_length"] - test["action_num_reverse_chrono"] - 1)
test["log_recency_score"] = 2 ** linear_interpolation - 1
test["log_recency_score"].fillna(1, inplace=True)
type_weights = {0: 1, 1: 6, 2: 3}
test["type_weighted_log_recency_score"] = test["type"].apply(lambda x: type_weights[x]) * test["log_recency_score"]
scores = ranker.predict(test[feature_cols])

In [14]:
test[test["session"] == 12899780]

Unnamed: 0,session,aid,ts,type,action_num_reverse_chrono,session_length,log_recency_score,type_weighted_log_recency_score
1,12899780,1142000,1661724000,0,4,5,0.071773,0.071773
2,12899780,582732,1661724058,0,3,5,0.252664,0.252664
3,12899780,973453,1661724109,0,2,5,0.464086,0.464086
4,12899780,736515,1661724136,0,1,5,0.71119,0.71119
5,12899780,1142000,1661724155,0,0,5,1.0,1.0


In [9]:
test.head()

Unnamed: 0,session,aid,ts,type,action_num_reverse_chrono,session_length,log_recency_score,type_weighted_log_recency_score
0,12899779,59625,1661724000,0,0,1,1.0,1.0
1,12899780,1142000,1661724000,0,4,5,0.071773,0.071773
2,12899780,582732,1661724058,0,3,5,0.252664,0.252664
3,12899780,973453,1661724109,0,2,5,0.464086,0.464086
4,12899780,736515,1661724136,0,1,5,0.71119,0.71119


In [12]:
test[test["session"] == 12899780].index

Int64Index([1, 2, 3, 4, 5], dtype='int64')

In [7]:
scores

array([-0.12225094, -0.38638347, -0.20556515, ..., -0.12225094,
       -0.12225094, -0.12225094])

In [44]:
cv = pd.read_csv("../output/youthful-yogurt-27/cv/full_columns_validation_preds.csv")
cv["session"] = cv["session_type"].apply(lambda x: x.split("_")[0])
cv["type"] = cv["session_type"].apply(lambda x: x.split("_")[1])
type2id = {"clicks": 0, "carts": 1, "orders": 2}
cv["type"] = cv["type"].apply(lambda x: type2id[x])
cv["top_n"] = cv["top_n"].apply(lambda x: x.split(" "))
cv.head()

Unnamed: 0,session_type,labels,top_n,top,session,type
0,11098528_clicks,11830 588923 1732105 571762 884502 1157882 876...,"[11830, 588923, 1732105, 571762, 884502, 11578...",,11098528,0
1,11098529_clicks,1105029 459126 1339838 1544564 217742 1694360 ...,"[1105029, 459126, 1339838, 1544564, 217742, 16...",,11098529,0
2,11098530_clicks,409236 264500 1603001 963957 254154 583026 167...,"[409236, 264500, 1603001, 963957, 254154, 5830...",,11098530,0
3,11098531_clicks,396199 1271998 452188 1728212 1365569 624163 1...,"[396199, 1271998, 452188, 1728212, 1365569, 62...",,11098531,0
4,11098532_clicks,876469 7651 108125 1202618 1159379 77906 17040...,"[876469, 7651, 108125, 1202618, 1159379, 77906...",,11098532,0


In [46]:
cv = cv[["session", "type", "top_n"]]
cv = cv.explode("top_n")

In [53]:
cv

Unnamed: 0,session,type,top_n
0,11098528,0,11830
0,11098528,0,588923
0,11098528,0,1732105
0,11098528,0,571762
0,11098528,0,884502
...,...,...,...
5403752,12899778,1,889112
5403752,12899778,1,13568
5403752,12899778,1,1097813
5403752,12899778,1,1102546


In [58]:
cv_click = cv[cv["type"] == 0]

In [60]:
cv_click.shape

(35738415, 3)

In [63]:
cv_click["covisit_clicks_candidate_num"] = cv_click.groupby("session").cumcount() + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_click["covisit_clicks_candidate_num"] = cv_click.groupby("session").cumcount() + 1


In [66]:
cv_click.head(25)

Unnamed: 0,session,type,top_n,covisit_clicks_candidate_num
0,11098528,0,11830,1
0,11098528,0,588923,2
0,11098528,0,1732105,3
0,11098528,0,571762,4
0,11098528,0,884502,5
0,11098528,0,1157882,6
0,11098528,0,876129,7
0,11098528,0,1182614,8
0,11098528,0,1790438,9
0,11098528,0,307904,10
