# Задание 2

## Построение и оценка бейзлайна

### Пункт 1 - Разбиение данных

In [3]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import os

In [4]:
csv_file = next((f for f in os.listdir() if f.endswith('.csv')), None)
if not csv_file: print("файл не найден.")

df = pd.read_csv(csv_file)

In [73]:
# подготовим данные
df_final = pd.concat([df.select_dtypes(include=['number']),
                      df[[col for col in df.select_dtypes(include=['object']) \
                            if df[col].nunique() < 5]]], axis=1)

categorical_cols = df_final.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df_final, columns=categorical_cols, drop_first=True)

X = df_encoded.drop(columns='Churn_Yes')
y = df_encoded['Churn_Yes']

X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=.3, random_state=52, stratify=y)
print(f"разбиение train/test: {X_train.shape[0]} | {X_test.shape[0]}")

разбиение train/test: 4930 | 2113


### Константная модель

In [82]:
from sklearn.dummy import DummyClassifier
from sklearn import metrics as sk_metrics
dummy_models = [DummyClassifier(strategy=S) 
                for S in DummyClassifier._parameter_constraints["strategy"][0].options \
                if S !="constant"]

for m in dummy_models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    
    numvec = np.vectorize(lambda x: 1 if x == "Yes" or x == True else 0)
    _y_pred_num = numvec(y_pred)
    _y_test_num = numvec(y_test)
    
    acc = sk_metrics.accuracy_score(_y_test_num, _y_pred_num)
    f1 = sk_metrics.f1_score(_y_test_num, _y_pred_num)
    roc = sk_metrics.roc_auc_score(y_test, _y_pred_num)
    
    print(f"модель : {m.strategy:^12}\t| acc={acc:.3f}; f1={f1}; roc={roc:.3f};")

модель :    prior    	| acc=1.000; f1=1.0; roc=0.500;
модель : most_frequent	| acc=1.000; f1=1.0; roc=0.500;
модель :   uniform   	| acc=1.000; f1=1.0; roc=0.500;
модель :  stratified 	| acc=1.000; f1=1.0; roc=0.500;


по метрикам - все плохо. константные модели работают чуть лучше подбрасывания монетки.

### Baseline

возьмем несколько моделей и проверим их

In [85]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

models = {
    'Logistic Regression': LogisticRegression(random_state=52),
    'Decision Tree': DecisionTreeClassifier(random_state=52),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=52),
    'Gradient Boosting': GradientBoostingClassifier(random_state=52)
}

for n, m in models.items():
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    
    numvec = np.vectorize(lambda x: 1 if x == "Yes" or x == True else 0)
    _y_pred_num = numvec(y_pred)
    _y_test_num = numvec(y_test)
     
    acc = sk_metrics.accuracy_score(_y_test_num, _y_pred_num)
    f1 = sk_metrics.f1_score(_y_test_num, _y_pred_num)
    roc = sk_metrics.roc_auc_score(y_test, _y_pred_num)
    
    print(f"модель : {n:^12}\t| acc={acc:.3f}; f1={f1}; roc={roc:.3f};")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


модель : Logistic Regression	| acc=0.808; f1=0.5962113659022932; roc=0.720;
модель : Decision Tree	| acc=0.726; f1=0.4821428571428571; roc=0.648;
модель : K-Nearest Neighbors	| acc=0.782; f1=0.5610687022900763; roc=0.700;
модель : Random Forest	| acc=0.792; f1=0.5528455284552846; roc=0.694;
модель : Gradient Boosting	| acc=0.807; f1=0.5828220858895705; roc=0.711;
