In [4]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

csv_path = "bank-full.csv"
df = pd.read_csv(csv_path, sep=';')

cols = [
    'age','job','marital','education','balance','housing','contact',
    'day','month','duration','campaign','pdays','previous','poutcome','y'
]
df = df[cols].copy()

print("\n--- Проверка пропусков ---")
print(df.isna().sum())


--- Проверка пропусков ---
age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [5]:
mode_edu = df['education'].mode().iloc[0]
print(f"\nQ1. Самое частое значение education: {mode_edu}")


Q1. Самое частое значение education: secondary


In [6]:
num_cols = ['age','balance','day','duration','campaign','pdays','previous']
corr = df[num_cols].corr()
corr_vals = corr.abs().where(~np.eye(len(num_cols), dtype=bool))
max_pair = np.unravel_index(np.nanargmax(corr_vals.values), corr_vals.shape)
pair = (num_cols[max_pair[0]], num_cols[max_pair[1]])
print(f"\nQ2. Наибольшая корреляция между: {pair}, corr = {corr_vals.values[max_pair]:.3f}")

df['y'] = df['y'].map({'yes': 1, 'no': 0})

X = df.drop(columns=['y'])
y = df['y']
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42, stratify=y_trainval
)


Q2. Наибольшая корреляция между: ('pdays', 'previous'), corr = 0.455


In [8]:
cat_cols = ['job','marital','education','housing','contact','month','poutcome']
X_train_cat = X_train[cat_cols].fillna('NA')
X_train_cat_enc = pd.get_dummies(X_train_cat, drop_first=False)
mi = mutual_info_classif(X_train_cat_enc, y_train, discrete_features=True, random_state=42)
mi_series = pd.Series(mi, index=X_train_cat_enc.columns)

mi_by_feat = {}
for c in cat_cols:
    cols_i = [col for col in mi_series.index if col.startswith(c + "_")]
    mi_by_feat[c] = mi_series[cols_i].sum()

mi_rounded = {k: round(v, 2) for k, v in mi_by_feat.items()}
best_mi = max(mi_rounded.items(), key=lambda x: x[1])
print(f"\nQ3. Взаимная информация (MI): {mi_rounded}")
print(f"Наибольшая MI у признака: {best_mi[0]}")



Q3. Взаимная информация (MI): {'job': np.float64(0.01), 'marital': np.float64(0.0), 'education': np.float64(0.0), 'housing': np.float64(0.02), 'contact': np.float64(0.02), 'month': np.float64(0.03), 'poutcome': np.float64(0.04)}
Наибольшая MI у признака: poutcome


In [10]:
cat_all = ['job','marital','education','housing','contact','month','poutcome']
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(X_train[cat_all].fillna('NA'))

def prep(Xdf):
    num = Xdf[['age','balance','day','duration','campaign','pdays','previous']].values
    cat = ohe.transform(Xdf[cat_all].fillna('NA'))
    return np.hstack([num, cat])

Xtr, Xvl = prep(X_train), prep(X_val)

In [11]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(Xtr, y_train)
acc_val = round(accuracy_score(y_val, model.predict(Xvl)), 2)
print(f"\nQ4. Точность на валидации: {acc_val}")


Q4. Точность на валидации: 0.9


In [15]:
features = ['age','balance','day','duration','campaign','pdays','previous'] + cat_all

def prep_feats(Xdf, feats):
    num_list = [f for f in feats if f in ['age','balance','day','duration','campaign','pdays','previous']]
    cat_list = [f for f in feats if f in cat_all]
    num = Xdf[num_list].values if num_list else np.empty((len(Xdf),0))
    if cat_list:
        enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        enc.fit(X_train[cat_list].fillna('NA'))
        cat = enc.transform(Xdf[cat_list].fillna('NA'))
    else:
        cat = np.empty((len(Xdf),0))
    return np.hstack([num, cat])

    base_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
base_model.fit(prep_feats(X_train, features), y_train)
base_acc = accuracy_score(y_val, base_model.predict(prep_feats(X_val, features)))

diffs = {}
for f in features:
    f_list = [x for x in features if x != f]
    m = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    m.fit(prep_feats(X_train, f_list), y_train)
    acc = accuracy_score(y_val, m.predict(prep_feats(X_val, f_list)))
    diffs[f] = round(base_acc - acc, 4)

min_feat = min(diffs.items(), key=lambda x: abs(x[1]))[0]
print(f"\nQ5. Разница точности при исключении признаков:\n{diffs}")
print(f"Наименьшая разница у признака: {min_feat}")


Q5. Разница точности при исключении признаков:
{'age': -0.0002, 'balance': -0.0003, 'day': -0.0004, 'duration': 0.0102, 'campaign': 0.0003, 'pdays': -0.0002, 'previous': 0.0, 'job': 0.0006, 'marital': 0.0007, 'education': -0.0006, 'housing': 0.0014, 'contact': -0.0001, 'month': 0.0007, 'poutcome': 0.0088}
Наименьшая разница у признака: previous


In [16]:
Cs = [0.01, 0.1, 1, 10, 100]
accs = {}
for C in Cs:
    m = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    m.fit(Xtr, y_train)
    accs[C] = round(accuracy_score(y_val, m.predict(Xvl)), 3)

best_C = min([c for c,v in accs.items() if v == max(accs.values())])
print(f"\nQ6. Точности при разных C: {accs}")
print(f"Лучшее значение C: {best_C}")


Q6. Точности при разных C: {0.01: 0.898, 0.1: 0.902, 1: 0.903, 10: 0.904, 100: 0.903}
Лучшее значение C: 10
