# Алгоритмы классификации: линейные методы, логистическая регрессия и SVM

Имеются данные adult.csv (см. в материалах к занятию)

Целевой переменной является уровень дохода income (крайний правый столбец).

Описание признаков можно найти по ссылке http://www.cs.toronto.edu/~delve/data/adult/adultDetail.html

Вам необходимо построить модель логистической регрессии, которая предсказывает уровень дохода человека. При возможности попробуйте улучшить точность предсказаний (метод score) с помощью перебора признаков.

In [1]:
import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the Adult income dataset into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; point it at the
# local copy of adult.csv before running this cell elsewhere.
df = pd.read_csv('/home/lena/Netology/ML/Lecture_2/adult.csv')

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
age                48842 non-null int64
workclass          48842 non-null object
fnlwgt             48842 non-null int64
education          48842 non-null object
educational-num    48842 non-null int64
marital-status     48842 non-null object
occupation         48842 non-null object
relationship       48842 non-null object
race               48842 non-null object
gender             48842 non-null object
capital-gain       48842 non-null int64
capital-loss       48842 non-null int64
hours-per-week     48842 non-null int64
native-country     48842 non-null object
income             48842 non-null object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [5]:
# Как видим, много категориальных признаков. Будем дальше с ними работать. 

In [6]:
# Snapshot of the feature columns (fnlwgt, educational-num and income are left
# out), taken before any of the '?' replacements further down. It is reused
# later as SelectedColumns4 for the "unprocessed data" experiment (case 4).
# NOTE(review): presumably an independent copy, so the later in-place edits to
# df do not propagate into N — confirm.
N = df[['age', 'workclass', 'education',  'marital-status', 'occupation', 'relationship', 'race', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']]

In [7]:
df['income'].value_counts()

<=50K    37155
>50K     11687
Name: income, dtype: int64

In [8]:
# Encode the target column as binary labels: '<=50K' -> 0, '>50K' -> 1.
income_codes = {'<=50K': 0, '>50K': 1}
df['income'] = df['income'].map(income_codes)

In [9]:
df['income'].value_counts()

0    37155
1    11687
Name: income, dtype: int64

In [10]:
df.occupation.value_counts()

Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
?                    2809
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: occupation, dtype: int64

In [11]:
# Treat the '?' placeholder in occupation as 'Other-service'.
df.loc[df['occupation'].eq('?'), 'occupation'] = 'Other-service'

In [12]:
df.occupation.value_counts()

Other-service        7732
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: occupation, dtype: int64

In [13]:
df.workclass.value_counts()

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workclass, dtype: int64

In [14]:
# Treat the '?' placeholder in workclass as 'Private'.
df.loc[df['workclass'].eq('?'), 'workclass'] = 'Private'

In [15]:
df.education.value_counts()

HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: education, dtype: int64

In [16]:
df['marital-status'].value_counts()

Married-civ-spouse       22379
Never-married            16117
Divorced                  6633
Separated                 1530
Widowed                   1518
Married-spouse-absent      628
Married-AF-spouse           37
Name: marital-status, dtype: int64

In [17]:
df.relationship.value_counts()

Husband           19716
Not-in-family     12583
Own-child          7581
Unmarried          5125
Wife               2331
Other-relative     1506
Name: relationship, dtype: int64

In [18]:
df.race.value_counts()

White                 41762
Black                  4685
Asian-Pac-Islander     1519
Amer-Indian-Eskimo      470
Other                   406
Name: race, dtype: int64

In [19]:
df['native-country'].value_counts()

United-States                 43832
Mexico                          951
?                               857
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Greece                           49
Nicaragua                        49
Peru                        

In [20]:
# Treat the '?' placeholder in native-country as 'United-States'.
df.loc[df['native-country'].eq('?'), 'native-country'] = 'United-States'

In [21]:
df['native-country'].value_counts()

United-States                 44689
Mexico                          951
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Greece                           49
Nicaragua                        49
Peru                             46
Ecuador                     

In [23]:
# Посмотрели категориальные признаки
# Дальше выберем 4 признака, которые по логике могут влиять на доход (случай №1)

In [24]:
# Case 1 working set: three categorical features (occupation, education,
# gender), one numeric feature (hours-per-week) and the income target.
SelectedColumns = df[['occupation', 'education', 'hours-per-week', 'gender', 'income']]

In [25]:
# One-hot encode the categorical columns; hours-per-week and income are not
# listed in `columns` and are carried over as they are.
X = pd.get_dummies(SelectedColumns, columns = ['occupation', 'education', 'gender'])

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
# Scaler used for the numeric column of case 1.
scaler = StandardScaler()

In [28]:
# Replace hours-per-week with its standardized values.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later, so the test rows contribute to the scaling statistics.
X['hours-per-week'] = scaler.fit_transform(SelectedColumns[['hours-per-week']])

In [29]:
X.head()

Unnamed: 0,hours-per-week,income,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,...,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college,gender_Female,gender_Male
0,-0.034087,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0.77293,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,-0.034087,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,-0.034087,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
4,-0.841104,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0


In [30]:
# Remove the target from the feature matrix so it is not used as a predictor.
del X['income']

In [31]:
# Target vector: the income column, already encoded as 0/1 above.
y = SelectedColumns.income

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2 , random_state=42)

In [34]:
from sklearn.linear_model import LogisticRegression

In [35]:
# Case 1: fit a logistic regression with default settings on the training
# split, then predict labels for the held-out rows.
model = LogisticRegression().fit(X_train, y_train)
y_predict = model.predict(X_test)



In [36]:
from sklearn.metrics import accuracy_score

In [37]:
accuracy_score(y_test, y_predict)

0.8033575596273927

In [38]:
# Вместо 4-х признаков возьмем почти все и посмотрим на результат в этом случае (случай №2)

In [39]:
# Case 2 working set: the same twelve feature columns as N, but read from df
# after the '?' replacements above.
SelectedColumns1 = df[['age', 'workclass', 'education',  'marital-status', 'occupation', 'relationship', 'race', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']]

In [40]:
# One-hot encode the eight categorical columns; the four numeric columns are
# not listed in `columns` and are carried over as they are.
X1 = pd.get_dummies(SelectedColumns1, columns = ['workclass', 'education',  'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country'])

In [41]:
# Standardize the numeric columns of X1, working on a copy so the encoded
# frame is only rebound once the scaled values are in place.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed in the next cell, so the test rows contribute to the scaling
# statistics.
col_names = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
scaled_features = X1.copy()
scaler1 = StandardScaler()
features = scaler1.fit_transform(scaled_features[col_names].values)
scaled_features[col_names] = features
X1 = scaled_features

In [42]:
# Same 80/20 split and seed as in case 1, applied to the wider feature matrix.
# y_train / y_test from case 1 are overwritten here.
X1_train, X1_test, y_train, y_test = train_test_split(X1, y, test_size=0.2 , random_state=42)

In [43]:
# Case 2: fit a logistic regression with default settings on the wider,
# scaled feature set and predict labels for the held-out rows.
model1 = LogisticRegression().fit(X1_train, y_train)
y_predict1 = model1.predict(X1_test)



In [44]:
accuracy_score(y_test, y_predict1)

0.8548469648889344

In [53]:
# Уже неплохой score, когда мы используем почти все признаки, а не 4, как в первом случае
# Попробуем улучшить score с помощью RFE метода (случай №3)

In [46]:
from sklearn.feature_selection import RFE

In [47]:
# Base estimator handed to RFE in the next cell (case 3).
model2 = LogisticRegression()

In [48]:
# Case 3: recursive feature elimination around model2, keeping 50 features.
# n_features_to_select is passed by keyword because recent scikit-learn
# releases accept only the estimator positionally and reject `RFE(model2, 50)`.
rfe = RFE(model2, n_features_to_select=50)
rfe = rfe.fit(X1_train, y_train)





In [49]:
# Predict held-out labels through the fitted RFE object.
y_rfe_predict = rfe.predict(X1_test)

In [50]:
accuracy_score(y_test, y_rfe_predict)

0.8533114955471389

In [51]:
# Print the per-feature ranking produced by RFE and the number of features
# flagged in rfe.support_.
print("Ранжирование признаков: ", rfe.ranking_)
print("Количество признаков: = %.2f" % sum(rfe.support_))

Ранжирование признаков:  [12  1 21  8  1 23 42 43 30  4 16 11  1  1  1  1  1  1  1  1  1  1  1 46
  1  1  1  1  6  1  1  7  1  5  9 48 10 54  1  1  1 18  1  1  1 15 14  1
 38  1 29  1  1 50  1  1 51 22 36 37  1 13  1  1  1  1 35  1  1  1 26 17
 32 52 41 44 53 31 49 20 40 34  3 19 27 45  1  1  1  1  1 25 33 24 47  1
  1 39  2  1 28  1  1]
Количество признаков: = 50.00


In [52]:
# Show the (feature name, flag) pairs for the features RFE kept.
# The pairs are unpacked directly and filtered on truthiness instead of
# indexing the tuple and comparing against True.

[(name, keep) for name, keep in zip(X1_test.columns, rfe.support_) if keep]

[('capital-gain', True),
 ('workclass_Federal-gov', True),
 ('education_10th', True),
 ('education_11th', True),
 ('education_12th', True),
 ('education_1st-4th', True),
 ('education_5th-6th', True),
 ('education_7th-8th', True),
 ('education_9th', True),
 ('education_Assoc-acdm', True),
 ('education_Assoc-voc', True),
 ('education_Bachelors', True),
 ('education_Doctorate', True),
 ('education_Masters', True),
 ('education_Preschool', True),
 ('education_Prof-school', True),
 ('education_Some-college', True),
 ('marital-status_Married-AF-spouse', True),
 ('marital-status_Married-civ-spouse', True),
 ('marital-status_Never-married', True),
 ('occupation_Exec-managerial', True),
 ('occupation_Farming-fishing', True),
 ('occupation_Handlers-cleaners', True),
 ('occupation_Other-service', True),
 ('occupation_Priv-house-serv', True),
 ('occupation_Prof-specialty', True),
 ('occupation_Tech-support', True),
 ('relationship_Husband', True),
 ('relationship_Other-relative', True),
 ('relatio

In [54]:
# Теперь обучим лог регрессию на тех же данных, что и во втором случае, 
# только не будем их обрабатывать и стандартизировать (случай №4)

In [55]:
# Case 4 input: the column snapshot N, which was created before the '?'
# replacements were applied to df.
# NOTE(review): presumably N still holds the raw '?' values — confirm.
SelectedColumns4 = N

In [56]:
# One-hot encode the categorical columns; the numeric columns are carried over
# unscaled.
X4 = pd.get_dummies(SelectedColumns4, columns = ['workclass', 'education',  'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country'])

In [57]:
# Same 80/20 split and seed as the earlier cases; y_train / y_test are
# overwritten again.
X4_train, X4_test, y_train, y_test = train_test_split(X4, y, test_size=0.2 , random_state=42)

In [58]:
# Case 4: fit a logistic regression with default settings on the unscaled
# features and predict labels for the held-out rows.
model4 = LogisticRegression().fit(X4_train, y_train)
y_predict4 = model4.predict(X4_test)



In [59]:
accuracy_score(y_test, y_predict4)

0.8554611526256526

In [60]:
# Можно сделать вывод, что на этих данных логистическая регрессия справляется и без предобработки:
# даже если не обрабатывать данные и не применять StandardScaler,
# accuracy остаётся практически той же (0.8555 против 0.8548 во втором случае)