In [1]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np
from src.config import BaseConfig
from src.modeling.metrics import weighted_accuracy
from tqdm import tqdm
import gc

from sentence_transformers.cross_encoder import CrossEncoder

config = BaseConfig()

In [2]:
path = config.processed_data_dir / "final_data_to_train.json"
df = pd.read_json(path)

df.head(0)

Unnamed: 0,assignee_level_order,jira_key,weeks_since_member_join,time_to_complete_hours,task_text


# Simple architecture


1. Skip 15 first tasks – it will be used for example for future
2. For each task get 5 most relevant by Cross Encoder.
3. Update corpus for each task by time (jira_key is enough to order)
4. Make 5 more columns with their time_to_complete_hours
5. Based on these features make simple prediction

# Data preprocessing

In [3]:
df = df.sort_values(by=["jira_key"])
df = df.reset_index(drop=True)
df["time_to_complete_hours"] = df["time_to_complete_hours"].clip(1, 16)  # clip between 2 and 24 hours

corpus_col = []

for i in range(df.shape[0]):
    corpus_col.append(df.iloc[:i][["jira_key", "task_text", "time_to_complete_hours"]].to_dict(orient="records"))

In [4]:
model = CrossEncoder("cross-encoder/stsb-distilroberta-base")

In [5]:
reranked_tasks = []
N_SIMILAR_TASKS = 5
LAST_N_TASKS = 100  # Make ranking from last 100 tasks only

for i in tqdm(range(len(corpus_col))):

    if i < 15:
        reranked_tasks.append({})
        continue

    query = df["task_text"].iloc[i]

    # We guarantee that corpus sorted by jira_key = time of task
    corpus = corpus_col[i][-LAST_N_TASKS:]
    ranks = model.rank(query, pd.DataFrame(corpus).task_text.tolist())
    ranks = sorted(ranks, key=lambda x: x["score"], reverse=True)
    ranks = ranks[:N_SIMILAR_TASKS]
    ranks = list(map(lambda x: x["corpus_id"], ranks))

    tasks = [
        {
            "corpus": corpus[i],
            "ranks": ranks[i]
        } for i in range(len(ranks))
    ]

    reranked_tasks.append(
        {
            "ranks": ranks,
            "rel_tasks": tasks
        }
    )

    gc.collect()


100%|██████████| 487/487 [17:18<00:00,  2.13s/it] 


In [15]:
processed = []

for i, task in enumerate(df["jira_key"]):
    processed.append(
        {
            "task": task,
            "relevant_tasks": reranked_tasks[i].get("rel_tasks", {}),
        }
    )


In [16]:
import json

path = config.processed_data_dir / "final_data_to_train_w_relevant_previous_tasks.json"

path.write_text(json.dumps(processed, indent=4, ensure_ascii=False))

3847825

# Modeling

In [19]:
data_w_relevance = pd.read_json(path)
data_w_relevance.head(0)

Unnamed: 0,task,relevant_tasks


In [29]:
def flatten_previous_task(task_: dict) -> pd.Series:
    """
    Get from list of jsons such as [
        {
            "corpus": {
                "jira_key": "PRT-101",
                "task_text": "**Summary:** ...",
                "time_to_complete_hours": 2
            },
            "ranks": 12
        },
        {
            "corpus": {
                "jira_key": "PRT-102",
                "task_text": "**Summary:** ...",
                "time_to_complete_hours": 8
            },
            "ranks": 4
        }
    ] to columns with 2 and 8 – time_to_complete_hours

    :param task_:
    :return:
    """

    if not isinstance(task_, list):
        return pd.Series()

    hours = [t.get("corpus", {}).get("time_to_complete_hours") for t in task_]
    names = [f"prev_task_eta_{i}" for i in range(len(hours))]

    return pd.Series(hours, index=names)


flatten_previous_task(data_w_relevance.relevant_tasks[16])

prev_task_eta_0    2
prev_task_eta_1    8
prev_task_eta_2    8
prev_task_eta_3    4
prev_task_eta_4    8
dtype: int64

In [32]:
data_w_relevance_flatten = data_w_relevance.relevant_tasks.apply(flatten_previous_task)
data_w_relevance_flatten["jira_key"] = data_w_relevance["task"]

In [61]:
path = config.processed_data_dir / "final_data_to_train.json"
test_keys_path = config.processed_data_dir / "test_tasks.csv"
df = pd.read_json(path)
test_keys = pd.read_csv(test_keys_path)

df["time_to_complete_hours_transformed"] = df["time_to_complete_hours"].clip(2, 12)  # clip between 2 and 24 hours
df["time_to_complete_hours_transformed"] = df["time_to_complete_hours_transformed"] // 2 * 2  # round to the nearest even number

In [63]:
x = df[["jira_key", "assignee_level_order", "weeks_since_member_join"]].copy()

y = df["time_to_complete_hours_transformed"]

x = x.merge(data_w_relevance_flatten, on="jira_key", how="left")
x = x.drop(columns=["jira_key"])

y_test_from_manual = y[~df.jira_key.isin(test_keys.jira_key)].copy()
x_test_from_manual = x[~df.jira_key.isin(test_keys.jira_key)].copy()

y = y[~df.jira_key.isin(test_keys.jira_key)]
x = x[~df.jira_key.isin(test_keys.jira_key)]

y.value_counts().sort_index()

y = y[x.notna().all(axis=1)]
x = x[x.notna().all(axis=1)]

In [66]:
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LinearRegression())
])

metrics = {
    "w_acc_test_transformed": [],
    "w_acc_train_transformed": [],
}
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

for train_index, test_index in skf.split(x, (y // 2) * 2):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipe.fit(x_train, y_train)

    # evaluation
    keys = ["w_acc_test_transformed", "w_acc_train_transformed"]
    xs = [x_test, x_train]
    ys = [y_test, y_train]
    for key, x_, y_ in zip(keys, xs, ys):
        y_pred = pipe.predict(x_)
        metrics[key].append(weighted_accuracy(y_, y_pred))

metrics = {
    k: {
        "mean": np.mean(v),
        "std": np.std(v)
    }
    for k, v in metrics.items()
}

pd.DataFrame(metrics)

Unnamed: 0,w_acc_test_transformed,w_acc_train_transformed
mean,0.148576,0.162667
std,0.007545,0.004045


In [69]:
pipe.fit(x, y)
y_pred = pipe.predict(x_test_from_manual.fillna(0))
weighted_accuracy(y_test_from_manual, y_pred)

0.17260857073161806

# Результаты
Такой робастный способ показал плохие результаты – на 10 пп ниже, чем baseline