In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import StratifiedKFold

import pandas as pd
import numpy as np
from src.config import BaseConfig
from src.modeling.metrics import weighted_accuracy

config = BaseConfig()

In [74]:
path = config.processed_data_dir / "final_data_to_train.json"
test_keys_path = config.processed_data_dir / "test_tasks.csv"
df = pd.read_json(path)
test_keys = pd.read_csv(test_keys_path)

df.head(0)

Unnamed: 0,assignee_level_order,jira_key,weeks_since_member_join,time_to_complete_hours,task_text


In [75]:
test_keys.head(0)

Unnamed: 0,jira_key


## Regression type of task

In [76]:
df["time_to_complete_hours_transformed"] = df["time_to_complete_hours"].clip(2, 12)  # clip between 2 and 24 hours
df["time_to_complete_hours_transformed"] = df["time_to_complete_hours_transformed"] // 2 * 2  # round to the nearest even number

y = df["time_to_complete_hours_transformed"]
x = df["task_text"]

y = y[~df.jira_key.isin(test_keys.jira_key)]
x = x[~df.jira_key.isin(test_keys.jira_key)]

y_test_from_manual = y[~df.jira_key.isin(test_keys.jira_key)]
x_test_from_manual = x[~df.jira_key.isin(test_keys.jira_key)]

y.value_counts().sort_index()

time_to_complete_hours_transformed
2     179
4      98
6      23
8      77
10      3
12     73
Name: count, dtype: int64

In [77]:
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),  # ngram_range=(1, 2) means that we will use unigrams and big
    ("lr", LinearRegression())
])

metrics = {
    "w_acc_test_transformed": [],
    "w_acc_train_transformed": [],
    "w_acc_test": [],
    "w_acc_train": []
}
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

for train_index, test_index in skf.split(x, y):
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipe.fit(x_train, y_train)

    # evaluation
    keys = ["w_acc_test_transformed", "w_acc_train_transformed", "w_acc_test", "w_acc_train"]
    xs = [x_test, x_train, x_test, x_train]
    ys = [y_test, y_train, df.loc[test_index, "time_to_complete_hours"], df.loc[train_index, "time_to_complete_hours"]]
    for key, x_, y_ in zip(keys, xs, ys):
        y_pred = pipe.predict(x_)
        metrics[key].append(weighted_accuracy(y_, y_pred))

metrics = {
    k: {
        "mean": np.mean(v),
        "std": np.std(v)
    }
    for k, v in metrics.items()
}

pd.DataFrame(metrics)

Unnamed: 0,w_acc_test_transformed,w_acc_train_transformed,w_acc_test,w_acc_train
mean,0.185708,0.999375,-0.164114,-0.310316
std,0.011371,0.000879,0.061957,0.036505


In [78]:
preds = pd.DataFrame(
    {
        "y_true": y_test_from_manual,
        "y_pred": pipe.predict(x_test_from_manual)
    }
)
preds.groupby("y_true")["y_pred"].mean()

y_true
2     2.938267
4     4.448426
6     5.903760
8     7.247483
10    9.006690
12    9.958204
Name: y_pred, dtype: float64

In [79]:
preds.corr()

Unnamed: 0,y_true,y_pred
y_true,1.0,0.825207
y_pred,0.825207,1.0


In [80]:
weighted_accuracy(preds["y_true"], preds["y_pred"])

np.float64(0.7272005731569047)

### Weights evaluation

In [81]:
words = pipe.named_steps["tfidf"].get_feature_names_out()
weights = pipe.named_steps["lr"].coef_

weights_regr = pd.DataFrame({"word": words, "weight": weights, "abs_weight": abs(weights)})
weights_regr = weights_regr.sort_values("abs_weight", ascending=False)
weights_regr.head(10)


Unnamed: 0,word,weight,abs_weight
9964,fraud,5.474864,5.474864
2187,api,5.326665,5.326665
13714,model,4.769572,4.769572
15826,pd002,4.294159,4.294159
6258,dag,4.032556,4.032556
16697,prev,3.984014,3.984014
1413,and,3.931909,3.931909
3887,calibration,3.914738,3.914738
13065,mapping,3.847538,3.847538
18027,rejection rules,-3.721543,3.721543


# Результаты:

Тестовые таски – это таски, которые мы в ходе эксперимента действительно аккуратно оценивали и трекали время.
На таких хороших тасках мы видим, что мы превзошли человеческую оценку сразу с бейзлайном 73% vs 33%



## Classification type of task

In [82]:
from sklearn.metrics import accuracy_score, classification_report

f_binner = lambda b: {b >= 2: 0, b >= 4: 1, b >= 8: 2, b > 8: 3}.get(True)
f_unbinner = lambda b: {0: 2, 1: 4, 2: 8, 3: 12}.get(b)

df["bin_ttch"] = df["time_to_complete_hours_transformed"].apply(f_binner)
y = df["bin_ttch"]

x = df["task_text"]
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

metrics = {
    "w_acc_test": [],
    "w_acc_train": [],
    "w_acc_test_transformed": [],
    "w_acc_train_transformed": [],

    "accuracy_test": [],
    "accuracy_train": [],
}

for train_index, test_index in skf.split(x, y):
    pipe = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, 3))),
        ("lr", LogisticRegression())
    ])

    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    pipe.fit(x_train, y_train)

    # evaluation
    keys = ["w_acc_test", "w_acc_train", "w_acc_test_transformed", "w_acc_train_transformed"]
    xs = [x_test, x_train, x_test, x_train]
    ys = [
        y_test, y_train,
        df.loc[test_index, "time_to_complete_hours_transformed"],
        df.loc[train_index, "time_to_complete_hours_transformed"]
    ]
    for key, x_, y_ in zip(keys, xs, ys):
        y_pred = pipe.predict(x_)
        y_pred = list(map(f_unbinner, y_pred))

        if "transformed" not in key:
            y_ = y_.apply(f_unbinner)

        metrics[key].append(weighted_accuracy(y_, y_pred))

    keys = ["accuracy_test", "accuracy_train"]
    xs = [x_test, x_train]
    ys = [y_test, y_train]

    for key, x_, y_ in zip(keys, xs, ys):
        y_pred = pipe.predict(x_)
        metrics[key].append(accuracy_score(y_, y_pred))
        print(key.capitalize(), '\n', classification_report(y_, y_pred, zero_division=0))

metrics = {
    k: {
        "mean": np.mean(v),
        "std": np.std(v)
    }
    for k, v in metrics.items()
}

pd.DataFrame(metrics)

Accuracy_test 
               precision    recall  f1-score   support

           0       0.43      0.97      0.59        67
           1       0.10      0.02      0.04        43
           2       1.00      0.04      0.07        27
           3       0.00      0.00      0.00        26

    accuracy                           0.41       163
   macro avg       0.38      0.26      0.18       163
weighted avg       0.37      0.41      0.27       163

Accuracy_train 
               precision    recall  f1-score   support

           0       0.79      1.00      0.88       133
           1       1.00      1.00      1.00        86
           2       1.00      0.75      0.85        55
           3       1.00      0.58      0.73        50

    accuracy                           0.89       324
   macro avg       0.95      0.83      0.87       324
weighted avg       0.91      0.89      0.89       324

Accuracy_test 
               precision    recall  f1-score   support

           0       0.41   

Unnamed: 0,w_acc_test,w_acc_train,w_acc_test_transformed,w_acc_train_transformed,accuracy_test,accuracy_train
mean,-0.190081,0.613564,-0.202928,0.586829,0.414792,0.879889
std,0.014213,0.024137,0.013411,0.026749,0.003658,0.009914


In [83]:
preds = pd.DataFrame(
    {
        "y_true": y_test_from_manual,
        "y_pred": map(f_unbinner, pipe.predict(x_test_from_manual)),
    }
)
preds.groupby("y_true")["y_pred"].mean()

y_true
2     2.022346
4     3.306122
6     3.478261
8     4.571429
10    5.333333
12    5.753425
Name: y_pred, dtype: float64

In [84]:
preds.corr()

Unnamed: 0,y_true,y_pred
y_true,1.0,0.496503
y_pred,0.496503,1.0


### Weights evaluation

In [85]:
words = pipe.named_steps["tfidf"].get_feature_names_out()
weights = pipe.named_steps["lr"].coef_[3]

weights_clf = pd.DataFrame({"word": words, "weight": weights, "abs_weight": abs(weights)})
weights_clf = weights_clf.sort_values("abs_weight", ascending=False)
weights_clf.head(10)

Unnamed: 0,word,weight,abs_weight
36508,model,0.461275,0.461275
50659,risk,-0.429035,0.429035
36887,monitoring,0.391628,0.391628
42565,pd002,0.374959,0.374959
33692,kyc,0.374289,0.374289
16373,curp,0.367559,0.367559
42626,pd003,0.359004,0.359004
16912,dag,0.347073,0.347073
51981,scoring,0.34693,0.34693
56811,tariff,0.288576,0.288576


# Сравнение весов для регрессии и классификации

In [86]:
import plotly.express as px

weights_regr["type"] = "regr"
weights_clf["type"] = "clf"

# normalize weights -1:1
weights_regr["weight"] = weights_regr["weight"] / weights_regr["abs_weight"].max()
weights_clf["weight"] = weights_clf["weight"] / weights_clf["abs_weight"].max()

n_top_words = 7
top_words = set(weights_regr["word"].head(n_top_words)).union(set(weights_clf["word"].head(n_top_words)))

weights = pd.concat([weights_regr, weights_clf])
weights = weights[weights["word"].isin(top_words)]

fig = px.bar(weights, x="word", y="weight", color="type", barmode="group")
fig.show()

# Результаты:

Мы видим, что задача классификации склонена недооценивать выполнение больших задач – да и в целом выдаются некоторые средние значения. В то время как регрессия показывает более точные результаты. Веса для регрессии и классификации различаются, что говорит о том, что модели используют разные признаки для предсказания времени выполнения задачи.