# ДЗ #5. Метод опорных векторов  <a id="0"></a>

### Датасет: https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008

* [Загрузка данных и базовые статистики](#1)
* [Обработка переменных typeObject](#2)
    * Race
    * Gender
    * Age
    * Change
    * DiabetesMed
    * One-Hot
* [Результат](#3)

In [1]:
# Импорт основных библиотек
import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
warnings.filterwarnings("ignore")
np.random.seed(42)

---
<a id="1"></a>
## Загрузка данных и базовые статистики [(Наверх)](#0) 

In [2]:
filename = './data/diabetic_data.csv'

if not os.path.exists(filename):
    raise SystemExit('Файл не найден!')
else:
    diabetic_df = pd.read_csv(filename, na_values=['?', 'None'])

# Проверим корректно ли загружен CSV
diabetic_df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [3]:
# Узнаем размер датасета
diabetic_df.shape

(101766, 50)

In [4]:
# Узнаем свойства датасета
diabetic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      99493 non-null   object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    3197 non-null    object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                61510 non-null   object
 11  medical_specialty         51817 non-null   object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [5]:
# Удалим ненужные категории и те, где NaN > 80%
diabetic_df.drop(['encounter_id', 'patient_nbr', 'weight', 'payer_code', 'max_glu_serum', 
                  'A1Cresult', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3'], axis=1, inplace=True)

# Также удалим колонки с названиями препаратов:
diabetic_df.drop(['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
                  'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone',
                  'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin',
                  'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone',
                  'metformin-pioglitazone'], axis=1, inplace=True)

In [6]:
# Проверим на дубликаты и удалим если есть
diabetic_df.drop_duplicates(inplace=True)
diabetic_df.duplicated().sum()

0

In [7]:
# Обработаем NaN и проверим на наличие пустых данных
diabetic_df.dropna(inplace=True)
diabetic_df.isnull().sum()

race                        0
gender                      0
age                         0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
number_diagnoses            0
change                      0
diabetesMed                 0
readmitted                  0
dtype: int64

---
<a id="2"></a>
## Обработка переменных typeObject [(Наверх)](#0) 

In [8]:
# Выберем только переменые 'object'
diabetic_df.select_dtypes(include = ['object']).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99473 entries, 0 to 101765
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   race         99473 non-null  object
 1   gender       99473 non-null  object
 2   age          99473 non-null  object
 3   change       99473 non-null  object
 4   diabetesMed  99473 non-null  object
 5   readmitted   99473 non-null  object
dtypes: object(6)
memory usage: 5.3+ MB


### Race

In [9]:
diabetic_df.race.value_counts()

Caucasian          76084
AfricanAmerican    19205
Hispanic            2037
Other               1506
Asian                641
Name: race, dtype: int64

### Gender

In [10]:
diabetic_df.gender.value_counts()

Female             53566
Male               45906
Unknown/Invalid        1
Name: gender, dtype: int64

In [11]:
# Удалим ошибочные данные
diabetic_df.drop(diabetic_df[diabetic_df.gender=='Unknown/Invalid'].index, inplace=True)

### Age

In [12]:
diabetic_df.age.value_counts()

[70-80)     25463
[60-70)     21986
[50-60)     16890
[80-90)     16799
[40-50)      9461
[30-40)      3699
[90-100)     2723
[20-30)      1610
[10-20)       681
[0-10)        160
Name: age, dtype: int64

In [13]:
# Т.к. возраст разбит на возрастающие группы, прокатегоризируем его
diabetic_df['age_encode'] = diabetic_df.age.factorize(sort=True)[0]
diabetic_df.drop(['age'], axis=1, inplace=True)

### Change

In [14]:
diabetic_df['change'].value_counts()

No    53573
Ch    45899
Name: change, dtype: int64

In [15]:
# Сконвертируем категорийное значение 'change' в бинарный вид
change_bool = {'No':False, 'Ch':True}
diabetic_df['change'] = diabetic_df['change'].map(change_bool)

### DiabetesMed

In [16]:
diabetic_df['diabetesMed'].value_counts()

Yes    76475
No     22997
Name: diabetesMed, dtype: int64

In [17]:
# Сконвертируем категорийное значение 'change' в бинарный вид
dm_bool = {'No':False, 'Yes':True}
diabetic_df['diabetesMed'] = diabetic_df['diabetesMed'].map(dm_bool)

### One-Hot

In [18]:
to_dummies = ['race', 'gender', 'readmitted']
diabetic_df = pd.get_dummies(diabetic_df, columns=to_dummies)

In [19]:
# Посмотрим, что получилось
diabetic_df

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Female,gender_Male,readmitted_<30,readmitted_>30,readmitted_NO
0,6,25,1,1,41,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,1
1,1,1,7,3,59,0,18,0,0,0,...,0,0,1,0,0,1,0,0,1,0
2,1,1,7,2,11,5,13,2,0,1,...,1,0,0,0,0,1,0,0,0,1
3,1,1,7,2,44,1,16,0,0,0,...,0,0,1,0,0,0,1,0,0,1
4,1,1,7,1,51,0,8,0,0,0,...,0,0,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,1,3,7,3,51,0,16,0,0,0,...,1,0,0,0,0,0,1,0,1,0
101762,1,4,5,5,33,3,18,0,0,1,...,1,0,0,0,0,1,0,0,0,1
101763,1,1,7,1,53,0,9,1,0,0,...,0,0,1,0,0,0,1,0,0,1
101764,2,3,7,10,45,2,21,0,0,1,...,0,0,1,0,0,1,0,0,0,1


---
<a id="1"></a>
## ML [(Наверх)](#0) 

In [20]:
# Импортируем библиотеки для ML
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [21]:
# Процедура оценки качества
def quality_report(prediction, actual, label='', report_df=None):
    metrics_list = [accuracy_score(prediction, actual), 
                    precision_score(prediction, actual), 
                    recall_score(prediction, actual), 
                    f1_score(prediction, actual),
                    roc_auc_score(prediction, actual)]
    
    if report_df is None:
        report_df = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1', 'ROC-AUC']).astype('float32')
    
    report_df.loc[label, :] =  metrics_list
    
    return report_df.round(3)

In [22]:
# Подготовка сетов
X_train, X_test, y_train, y_test = train_test_split(
    diabetic_df.drop(labels=['diabetesMed'], axis=1),
    diabetic_df['diabetesMed'],
    test_size=0.3,
    stratify=diabetic_df['diabetesMed'],
    random_state=0)

In [23]:
# Шкалирование
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
# Логистическая регрессия
log_reg = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)
log_reg.predict(X_test_scaled)
report = quality_report(log_reg.predict(X_test_scaled), y_test.values, label='log-reg')

In [25]:
# Support vectors
svc = svm.SVC(gamma='auto', random_state=42).fit(X_train_scaled, y_train)
report = quality_report(svc.predict(X_test_scaled), y_test.values, label='svm', report_df=report)

---
<a id="3"></a>
## Результат [(Наверх)](#0) 

In [26]:
report

Unnamed: 0,Accuracy,Precision,Recall,F1,ROC-AUC
log-reg,0.786,0.952,0.805,0.872,0.699
svm,0.788,0.961,0.803,0.875,0.713
