In [37]:
import pandas as pd
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [38]:
df = pd.read_csv('../data/adult.data', sep=',', names=('age', 'workclass', 'fnlwgt', 'education', 'educational-num', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'))
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [39]:
df2 = df.drop('education', axis=1)
df2 = pd.get_dummies(df2, columns=["workclass", 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country'])
df2

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,77516,13,2174,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32557,40,154374,9,0,0,40,>50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32558,58,151910,9,0,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32559,22,201490,9,0,0,20,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [40]:
X = df2.loc[:, df2.columns != 'income']
y = df2.loc[:, df2.columns == 'income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [41]:
# Инициализируем модель решающего дерева
model = DecisionTreeClassifier(random_state=42,
                               # функция для impurity ('gini' или 'entropy')
                               criterion='gini',
                               # максимальная глубина дерева +5-5
                               max_depth=10,
                               # минимальное число элементов в узле для разбиения (может быть долей)
                               min_samples_split=5,
                               # минимальное число элементов в листе (может быть долей)
                               min_samples_leaf=5,
                               # Минимальное значение дельты impurity
                               # min_impurity_decrease=0,
                               # веса для классов (можно дополнительно штрафовать за ошибку в нужных классах).
                               # Поддерживает опцию 'balanced'.
                               class_weight=None
                               )

# Обучаем модель
model.fit(X_train, y_train)

In [42]:
pd.DataFrame({'feature': df2.loc[:, df2.columns != 'income'].columns,
              'importance': model.feature_importances_}).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
17,marital-status_ Married-civ-spouse,0.384086
2,educational-num,0.213354
3,capital-gain,0.192024
4,capital-loss,0.059406
0,age,0.051235
...,...,...
57,native-country_ Ecuador,0.000000
58,native-country_ El-Salvador,0.000000
59,native-country_ England,0.000000
60,native-country_ France,0.000000


In [43]:
# Строим предсказание модели
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

conf_mat = metrics.confusion_matrix(y_test, y_pred_test) # посмотреть на другие метрики
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_) / len(y_test)
conf_mat

Unnamed: 0,<=50K,>50K
<=50K,0.688908,0.069328
>50K,0.079937,0.161828


In [44]:
conf_mat = metrics.confusion_matrix(y_train, y_pred_train)
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_) / len(y_train)
conf_mat

Unnamed: 0,<=50K,>50K
<=50K,0.701169,0.058492
>50K,0.07449,0.165849


In [45]:
model = RandomForestClassifier(random_state=42,
                               # число деревьев в лесу
                               n_estimators=15,
                               # функция для impurity ('gini' или 'entropy')
                               criterion='gini',
                               max_depth=5,
                               # Вычислять out-of-bag ошибку
                               oob_score=True,
                               # использовать результаты предыдущего вызова и нарастить предыдущий лес
                               warm_start=False,
                               # веса классов для балансировки обучения
                               class_weight=None
                               )
model.fit(X_train, y_train)

  model.fit(X_train, y_train)
  warn(


In [46]:
y_pred = model.predict(X_test)
print(metrics.classification_report(y_pred, y_test))

              precision    recall  f1-score   support

       <=50K       0.98      0.84      0.90      9526
        >50K       0.40      0.86      0.55      1220

    accuracy                           0.84     10746
   macro avg       0.69      0.85      0.72     10746
weighted avg       0.91      0.84      0.86     10746



In [47]:
pd.DataFrame({'feature': X.columns,
              'importance': model.feature_importances_}).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
3,capital-gain,0.198004
37,relationship_ Husband,0.130652
2,educational-num,0.107817
19,marital-status_ Never-married,0.093076
17,marital-status_ Married-civ-spouse,0.086169
...,...,...
66,native-country_ Honduras,0.000000
58,native-country_ El-Salvador,0.000000
64,native-country_ Haiti,0.000000
63,native-country_ Guatemala,0.000000


In [48]:
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred_test)
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_) / len(y_test)
conf_mat

Unnamed: 0,<=50K,>50K
<=50K,0.741857,0.016378
>50K,0.144612,0.097152


In [49]:
conf_mat = metrics.confusion_matrix(y_train, y_pred_train)
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_) / len(y_train)
conf_mat

Unnamed: 0,<=50K,>50K
<=50K,0.740912,0.018749
>50K,0.141691,0.098648


In [50]:
model = GradientBoostingClassifier(random_state=42,
                                   n_estimators=15,
                                   # функция для impurity ('friedman_mse' или 'squared_error')
                                   criterion='squared_error',
                                   max_depth=5,
                                   # использовать результаты предыдущего вызова и нарастить предыдущий лес
                                   warm_start=False,
                                   )
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [51]:
pd.DataFrame({'feature': X.columns,
              'importance': model.feature_importances_}).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
17,marital-status_ Married-civ-spouse,0.428685
2,educational-num,0.218249
3,capital-gain,0.209185
4,capital-loss,0.049223
0,age,0.041034
...,...,...
54,native-country_ Columbia,0.000000
55,native-country_ Cuba,0.000000
56,native-country_ Dominican-Republic,0.000000
58,native-country_ El-Salvador,0.000000


In [52]:
# Строим предсказание модели
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
conf_mat = metrics.confusion_matrix(y_test, y_pred_test)
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_) / len(y_test)
conf_mat

Unnamed: 0,<=50K,>50K
<=50K,0.730411,0.027824
>50K,0.11502,0.126745


In [53]:
conf_mat = metrics.confusion_matrix(y_train, y_pred_train)
conf_mat = pd.DataFrame(conf_mat, index=model.classes_, columns=model.classes_) / len(y_train)
conf_mat

Unnamed: 0,<=50K,>50K
<=50K,0.730002,0.029658
>50K,0.111895,0.128444
