In [1]:
# pip install category_encoders

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

from sklearn.model_selection import train_test_split, GridSearchCV, \
                                    ShuffleSplit, KFold

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

from sklearn.preprocessing import MinMaxScaler, StandardScaler, \
                                  RobustScaler, PolynomialFeatures, \
                                  OrdinalEncoder, LabelEncoder, \
                                  OneHotEncoder, TargetEncoder, \
                                  QuantileTransformer, PowerTransformer, \
                                  KBinsDiscretizer

from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LogisticRegression, RANSACRegressor, \
                                 Ridge, Lasso, LinearRegression

from sklearn.metrics import recall_score, precision_score, \
                            f1_score, ConfusionMatrixDisplay, \
                            confusion_matrix, roc_auc_score, \
                            RocCurveDisplay, PrecisionRecallDisplay, \
                            roc_curve, precision_recall_curve, \
                            PrecisionRecallDisplay, make_scorer, \
                            mean_squared_error

from sklearn.compose import ColumnTransformer, make_column_selector,\
                            make_column_transformer, TransformedTargetRegressor
import category_encoders as ce

### Data preparation

In [3]:
#Чтение файла
data_raw = pd.read_csv()
data_raw.head(10)

TypeError: read_csv() missing 1 required positional argument: 'filepath_or_buffer'

In [None]:
#Инфа
display(data_raw.info())
display(data_raw.describe())
display(data_raw.isna().sum())

In [None]:
#Дубликаты и пропуски
data_not_raw = data_raw.dropna(axis=0) #если допаешь, то измени контейнер в feature engineering
data = data_not_raw.drop_duplicates(keep='first')

In [None]:
#Крутой график
sns.pairplot(data, hue=, palette='hls')

In [None]:
#Корреляционная матрица
numeric_columns = data.select_dtypes(include=['float64', 'uint64', 'int64'])
correlation_matrix = numeric_columns.corr(method = 'pearson')
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
plt.figure(figsize=(9, 7))
sns.heatmap(correlation_matrix, mask=mask, annot=True, fmt=".2f", cmap='coolwarm', linewidths=0.5, square=True, vmin = -1, vmax = 1)
plt.title('Корреляционная матрица для числовых данных', fontsize=16)
plt.show()

In [None]:
#Коробки для фичей
plot_cols = data.columns
n_cols = 3
n_rows = (len(plot_cols) + n_cols - 1) // n_cols
fig, ax = plt.subplots(ncols=n_cols, nrows=n_rows, figsize=(18, n_rows * 5))

ax = ax.flatten()
for index, col in enumerate(plot_cols):
    sns.boxplot(y=col, data=df, ax=ax[index], hue='')

for empty_ax in ax[len(plot_cols):]:
    empty_ax.set_visible(False)

plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)
plt.show()

In [None]:
#Выбросы
# z = stats.zscore(data)
# display(z.head(10))
# data = data.drop(axis=1, columns=[''])[~(np.abs(z) > 3).any(axis=1)]
# data.head(3)

In [None]:
#Дробим данные
train_data, test_data, train_target, test_target = train_test_split(data.drop(axis=1, columns=['']), pd.DataFrame(data.), test_size=0.3, stratify=pd.DataFrame(data.), shuffle=True, random_state=13)#тут надо заполнить колонку с таргетом

### Feature engineering

In [None]:
#Для начала, определим вид наших фичей
display(train_data.info())

In [15]:
#Запишем виды фичей по листам
category_features = []
numerical_features = []

In [26]:
nums = Pipeline(steps=[
    ('missing_nums', KNNImputer(missing_values=np.nan)), #проверить, дропнул ли na в eda, если да, то бесполезный код
    ('bins', KBinsDiscretizer(n_bins=10, strategy='uniform', encode='onehot-dense')), #делаем бинаризацию
    ('polynom', PolynomialFeatures(3, include_bias=False)), #делаем полиномы, если надо
    ('scaler', StandardScaler())]) #можно попробовать другие скейлеры

#если нужна замена признаков на численные, не просто onehot
clarity_map = [{
    'col':'clarity',
    'mapping':{'FL':10, 'IF':9, 'VVS1':8, 'VVS2':7, 'VS1':6, 'VS2':5, 
               'SI1':4, 'SI2':3, 'I1':2, 'I2':1, 'I3':0}
    }]

special_transformer = Pipeline(steps=[
    ('missing_nums', SimpleImputer(strategy = 'most_frequent')),
    ('ce',ce.OrdinalEncoder(mapping=clarity_map)),
    ('scaler', MinMaxScaler())
    ])    
    
CT = ColumnTransformer([
        ("with_nums", nums, numerical_features),
        ("with_category", OneHotEncoder(sparse_output=False, handle_unknown='warn'), category_features), #если не одна категория, то лучше прописать параметр drop
        ("special_map", special_transformer, ['clarity']), #если нужна особая замена
        ]).set_output(transform='pandas')

display(CT)

# ct = CT.fit_transform(train_data)
# pd.DataFrame(ct).head().T


In [None]:
#Можно что-нибудь попроще написать
# preprocessor_universal = Pipeline(steps=[
#     ('first_step',),
#     ('second_step',)
# ])

#### БИНЫ

In [None]:
#Бины
kb = KBinsDiscretizer(n_bins=10, strategy='uniform', encode='onehot-dense')#число бинов и вид
kb.fit(train_data[numerical_features])
X_binned = kb.transform(train_data[numerical_features])
poly = PolynomialFeatures(3, include_bias=False)#полиномчики по приколу
data_combined = poly.fit_transform(np.hstack([train_data, X_binned]))# опционально можно связать путем X_binned*train_data, но тогда надо менять и в тесте

test_binned = kb.transform(test_data)
test_combined = poly.fit_transform(np.hstack([test_data, test_binned]))
#по итогу имеем трейн в виде data_combined и test в виде test_combined, можно пихать в модель

In [None]:
#Лоховские бины
kb = KBinsDiscretizer(n_bins=10, strategy='uniform')
kb.fit(train_data)
X_binned = kb.transform(train_data)

test_binned = kb.transform(test_data)

### Model + CT pipeline

In [32]:
model_pipeline = Pipeline(steps=[
    ('preprocessing', CT), #забиваем свой препроцессор, если надо, можно написать в feature engineering universal_preprocessor
    ('model', SVR())]) #забиваем свою модель, если надо
display(model_pipeline)

### Model with targettransformer

In [42]:
model_pipe_tt = Pipeline([
                ('modeltt', TransformedTargetRegressor(
                        regressor = Pipeline(steps=[
                                ('preprocessing', CT),
                                ('model', SVR())
                            ]),
                        transformer=QuantileTransformer(n_quantiles=1000, output_distribution="normal") #забиваем PowerTransformer(), если надо, не забывай про check_inverse = False
                ))])
display(model_pipe_tt)

### Пошла родная решеточка

In [None]:
# Тут берем метрику, которая нравится
scores='mean_squared_error'

In [None]:
# Тут возьмем cv которая нравится
cv = KFold(n_splits=7, shuffle=True)

In [None]:
#Листы моделей и скейлеров, чтобы не забыть
reg_model = [KNeighborsRegressor(), LinearRegression(), RANSACRegressor(), Ridge(), Lasso(), SVR()]
class_model = [KNeighborsClassifier(), LogisticRegression(), SVC()]
scalers = [MinMaxScaler(), StandardScaler(), RobustScaler()]

In [None]:
param_grid_for_model_pipeline = [
    {'model':[],#модели
     'preprocessing':[]}#перпроцессер
]

In [None]:
param_grid_for_model_pipe_tt = [
    {'modeltt__regressor__model':[],#тут пишем нужные модели
     'modeltt__regressor__model__',#дописать нужные параметры для моделей
     'modeltt__regressor__preprocessing__nums__scaler': [], #скейлер
     'modeltt__transformer':[PowerTransformer(),QuantileTransformer(n_quantiles=1000, output_distribution="uniform")]}]

In [None]:
# Сама решетка, берем нужный пайп, нужные параметры, нужный скор, нужный cv
gridcv = GridSearchCV(pipe, n_jobs=-1, param_grid=param_grid, scoring=scores, return_train_score=True, error_score="raise")
gridcv.fit(train_data, train_target)

In [None]:
grid_result = pd.DataFrame(gridcv.cv_results_).sort_values(["rank_test_score",'std_test_score']).T
grid_result

#### Nested cv, хз зачем

In [None]:
NUM_TRIALS = 30
for i in range(NUM_TRIALS):
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    clf = GridSearchCV(pipe, n_jobs=-1, param_grid=param_grid, scoring=scores, return_train_score=True, error_score="raise")
    nested_score = cross_val_score(clf, X=train_data, y=train_target, cv=outer_cv)
    nested_scores[i] = nested_score.mean()
score_nest = nested_scores.mean()

### Короночка

In [None]:
king = gridcv.best_estimator_
king.fit(train_data, train_target)
result = king.predict(test_data)
print("Правильность для наилучшей модели на тестовом наборе: {:.6f}\n".format(score(test_target, result)))#тут пишем свой скор

### А по итогу все равно будет как-то так)

In [None]:
cv = ShuffleSplit(n_splits=5, random_state=10)
pipe_kneighbors = Pipeline([('preprocessing', StandardScaler()), ('classifier', KNeighborsClassifier())])

n_neighbors=[1, 3, 5, 9, 12, 15, 20, 25, 30, 35, 40]
p=[2, 3, 4, 6, 8]
weights=['uniform','distance']

param_grid =[
    {'preprocessing': [MinMaxScaler(), StandardScaler(), RobustScaler()],
     'classifier': [KNeighborsClassifier()],
     'classifier__n_neighbors': n_neighbors,
     'classifier__p': p,
     'classifier__weights': weights
     }
]

grid = GridSearchCV(pipe_kneighbors, param_grid, cv=cv, return_train_score=True, n_jobs=-1, scoring='precision')
grid.fit(features_train, target_train.values.ravel())

print("----------------- Обучили и тестировали -------------------")
print("Наилучшие параметры:\n{}\n".format(grid.best_params_))
print("Средняя правильность для наилучшей модели кроссвалидации на \
                валидационных тестовых наборах: {:.6f}\n".format(grid.best_score_)) 
gridresults = pd.DataFrame(grid.cv_results_)
display(gridresults.sort_values(["rank_test_score"]).T)

In [None]:
pipe_1 = make_pipeline(MinMaxScaler(), KNeighborsClassifier(n_jobs=-1, n_neighbors=30, p=4, weights='distance'))
pipe_1.fit(features_train,target_train)
print("Правильность модели на тестовом наборе: {:.6f}\n".format(precision_score(target_test, pipe_1.predict(features_test))))
RocCurveDisplay.from_estimator(pipe_1, features_test, target_test, name = 'pipe');

In [None]:
matrix = confusion_matrix(target_test, pipe_1.predict(features_test))
ConfusionMatrixDisplay(matrix).plot()

In [None]:
print("Правильность модели на тестовом наборе: {:.6f}\n".format(precision_score(target_test, pipe_1.predict(features_test))))


def find_nearest(array,value):
    idx = (np.abs(array-value)).argmin()
    return idx
fpr, tpr, thresholds = roc_curve(target_test, pipe_1.predict_proba(features_test)[:,1], pos_label = None)
pos = 14
point = find_nearest(thresholds,0.5)
fig, [ax1,ax2] = plt.subplots(1, 2, figsize=(11, 5))
RocCurveDisplay.from_estimator(pipe_1, features_test, target_test, name = 'pipe_2', ax=ax1)
ax1.set_title("ROC curve")
ax1.plot([0,1],[0,1], color="r", ls=":")
ax1.plot(fpr[point],tpr[point], '*', markersize=10, color="r")
ax1.plot(fpr[pos],tpr[pos], 'o', markersize=10,  mfc='none', color="r");
ax1.set_xlim((0, 1))
ax1.set_ylim((0, 1))
precision, recall, thresholds_pr = precision_recall_curve(target_test, pipe_1.predict_proba(features_test)[:,1], pos_label = None)
PrecisionRecallDisplay.from_estimator(pipe_1, features_test, target_test, name = 'pipe_2', ax=ax2)
point_pr = find_nearest(thresholds_pr,0.5)
pos_pr = find_nearest(thresholds_pr,thresholds[pos])
ax2.plot(recall[point_pr],precision[point_pr], '*', markersize=10, color="r")
ax2.plot(recall[pos_pr],precision[pos_pr], 'o', markersize=10,  mfc='none', color="r");
ax2.set_xlim((0, 1))
ax2.set_ylim((0, 1))
print( f'recall:  {recall[pos_pr]:.3f}', f' precision:, {precision[pos_pr]:.3f}', f' thresholds:,{thresholds[pos]:.3f}')
print( f'recall:  {recall[point_pr]:.3f}', f' precision:, {precision[point_pr]:.3f}', f' thresholds:,{thresholds[point]:.3f}')

In [None]:
# def custom_predict(X, threshold):
#     probs = model.predict_proba(X) 
#     return (probs[:, 1] > threshold).astype(int)
    
    
# new_preds = custom_predict(X=X, threshold=0.4) 
FixedThresholdClassifier
classifier_other_threshold = FixedThresholdClassifier(
    classifier, threshold=0.1, response_method="predict_proba"
).fit(X_train, y_train)