In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error

## Baseline на `train.csv`

In [2]:
# test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")
# submission = pd.read_csv("submission.csv")

In [3]:
# len(set(train["ego_id"]))

In [4]:
ego_ids = list(train["ego_id"].unique())
# train_ids = ego_ids[:50000]
val_ids = ego_ids[50000:]

In [5]:
test = train.loc[
    train.loc[:, "ego_id"].isin(val_ids)
]

In [6]:
%%time

test["test_x1"] = np.nanmean(test["x1"].values)

CPU times: total: 93.8 ms
Wall time: 96.1 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
test.head()

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,test_x1
99158214,1382979469777,77,79,,0.53926,0.0,0.0,0.691446
99158215,1382979469777,111,0,78.4,0.907624,0.0,1.0,0.691446
99158216,1382979469777,168,24,122.6,1.8e-05,0.0,0.0,0.691446
99158217,1382979469777,190,60,,0.0,0.0,1.0,0.691446
99158218,1382979469777,91,95,,0.0,0.0,1.0,0.691446


In [8]:
def RMSE(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

In [9]:
print("Dummy model RMSE: {}".format(RMSE(test["x1"], test["test_x1"])))

Dummy model RMSE: 1.3529889688925072


In [None]:
# ego_id_list = test["ego_id"].drop_duplicates().values
# ego_id_list

array([1382979469777, 1382979469780, 1382979469784, ..., 1709396984686,
       1709396984691, 1709396984692], dtype=int64)

In [10]:
submission = test.copy()

# Загрузка данных

В бейзлайне реализована простейшая модель на эго-графе.

Для каждой пары вершин u и v эго-графа попытаемся найти всех общих "друзей" w. Силой связи между вершинами u и v будем считать средную силу связи между ними и общими друзьями.

Поскольку обучение для такой модели не требуется, будем пользоваться только тестовой выборкой.

In [None]:
%%time

test = pd.read_csv("test.csv")
submission = pd.read_csv("submission.csv")

CPU times: total: 13.6 s
Wall time: 13.6 s


In [None]:
submission.head()

Unnamed: 0,ego_id,u,v,x1
0,8,0,93,0.0
1,8,0,143,0.0
2,8,0,151,1.606742
3,8,1,24,0.026496
4,8,5,4,0.159857


In [None]:
test.head()

Unnamed: 0,ego_id,u,v,t,x1,x2,x3
0,8,20,19,185.7,0.0003839089,0.0,0.0
1,8,131,125,161.4,0.4034464,0.0,0.0
2,8,73,56,127.0,8.554643e-05,0.0,0.0
3,8,0,4,594.5,0.2886418,0.0,0.0
4,8,63,73,127.0,4.281692e-07,0.0,0.0


Проверочные данные - недоступны участникам, используются для подсчёта метрики:

In [None]:
control = pd.read_csv("control.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'control.csv'

Таблицы control и submission отличаются только последним столбцом x1. В таблице control в этом столбце содержатся истинные значения связей x1.

In [None]:
control[["ego_id", "u", "v"]].equals(submission[["ego_id", "u", "v"]])

True

Таблица submission отсортирована по возрастанию ego_id, u, v:

In [None]:
submission.equals(submission.sort_values(["ego_id", "u", "v"]))

True

# Модель

In [None]:
submission["ego_id"].value_counts()

ego_id
901943132599     349
721554506143     328
1400159338751    298
1039382085802    270
575525618423     262
                ... 
1142461301166      1
515396076193       1
515396076373       1
51539608193        1
1245540516719      1
Name: count, Length: 20586, dtype: int64

Константное предсказание:

In [None]:
%%time

# submission_dummy = submission.copy()
train["test_x1"] = np.nanmean()
submission_dummy["x1"] = np.nanmean(test["x1"].values)

CPU times: total: 281 ms
Wall time: 255 ms


In [None]:
%%time

from tqdm import tqdm


ego_id_list = submission["ego_id"].drop_duplicates().values
for ego_id in tqdm(ego_id_list):
    submission_ego_net = submission[submission["ego_id"] == ego_id]
    test_ego_net = test[test["ego_id"] == ego_id]
    friendship = np.zeros_like(submission_ego_net["x1"].values)
    for i, (u, v) in enumerate(zip(submission_ego_net["u"], submission_ego_net["v"])):
        u_x1 = test_ego_net.loc[test_ego_net["u"] == u, ["v", "x1"]].dropna()
        v_x1 = test_ego_net.loc[test_ego_net["u"] == v, ["v", "x1"]].dropna()
        common_friends = u_x1.merge(v_x1, on="v")
        if common_friends.shape[0] > 0:
            friendship[i] = np.mean(common_friends.drop("v", axis=1).values)
    submission.loc[submission["ego_id"] == ego_id, "x1"] = friendship

  6%|▌         | 1237/20586 [02:22<37:03,  8.70it/s]  


KeyboardInterrupt: 

# Посчёт метрик

In [None]:
def RMSE(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

In [None]:
print("Dummy model RMSE: {}".format(RMSE(control["x1"], submission_dummy["x1"])))

Dummy model RMSE: 1.3604058861047796


In [None]:
print("Baseline model RMSE: {}".format(RMSE(control["x1"], submission["x1"])))

Baseline model RMSE: 1.353040933001075
