# 데이터 전처리

## import
> pandas, numpy, matplotlib

> get df

In [None]:
# pandas, numpy, matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the dataset CSV into df.
# The path lives in DATA_PATH (not `dir`) so the builtin dir() stays usable
# for the rest of the notebook session.
DATA_PATH = "/content/drive/MyDrive/DAT/data/diabetes_prediction_dataset.csv"
df = pd.read_csv(DATA_PATH)

## 01. 데이터 수정
> 결측치 처리

> 중복된 행 제거

> 나이 변수 형변환
* float -> int

In [None]:
# Missing-value check: count the null entries in every column
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [None]:
# Remove duplicated rows.
# Count fully duplicated rows first so the number removed is reported,
# then keep only the first occurrence of each row.
n_duplicates = df.duplicated().sum()
df = df.drop_duplicates()
print("duplicated rows removed:", n_duplicates)

# Shape after de-duplication
df.shape

(100000, 9)

In [None]:
# Convert the age column from float to int
age_as_int = df['age'].astype(int)
df['age'] = age_as_int

## gender
> ~~Other~~

> dummy

In [None]:
# Keep only the rows whose gender value is not 'Other'
is_other = df['gender'].eq('Other')
df = df.loc[~is_other]

In [None]:
# One-hot encode gender: one indicator column per gender value,
# appended to the right of the frame in place of the original column
gender_dummies = pd.get_dummies(df['gender'])
df = pd.concat([df.drop(columns=['gender']), gender_dummies], axis=1)

## smoking_history
> smoking_history 열 전체 제거

In [None]:
# Drop the whole smoking_history column.
# Reassign the result instead of using inplace=True so the cell produces
# a new frame rather than mutating the existing one.
df = df.drop(columns=["smoking_history"])

## diabetes
> 맨 오른쪽 열로 이동

In [None]:
# Move the diabetes column to the rightmost position:
# keep every other column in its current order, then append diabetes
cols = [name for name in df.columns if name != 'diabetes']
df = df[cols + ['diabetes']]

## train/test split
> set X, y

> train_test_split

In [None]:
# set X, y
# Features are every column except the last; the target is the last column
n_features = df.shape[1] - 1
X = df.iloc[:, :n_features]
y = df.iloc[:, n_features]

In [None]:
# train_test_split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Undersampling / Oversampling | w/ logistic

## import
> recall_score, LogisticRegression

In [None]:
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression

## Undersampling
> Undersampling

> Logistic Regression

In [None]:
# Undersampling
from imblearn.under_sampling import RandomUnderSampler

# Resample the training split only; X_test / y_test are not touched here.
undersample = RandomUnderSampler(random_state = 42)
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

# Class counts after resampling
y_train_under.value_counts()

0    6792
1    6792
Name: diabetes, dtype: int64

In [None]:
# Logistic Regression trained on the undersampled training data
from sklearn.linear_model import LogisticRegression

# max_iter is raised above scikit-learn's default of 100 because the solver
# stopped at that limit on this data. If a convergence warning still appears,
# scale the features or raise max_iter further.
lr = LogisticRegression(max_iter=1000)
# A single fit is sufficient; fitting again on the same data only repeats the work.
lr.fit(X_train_under, y_train_under)

y_train_pred = lr.predict(X_train_under)
y_test_pred = lr.predict(X_test)

# recall score
print("recall_train:", recall_score(y_train_under, y_train_pred))
print("recall_test: ", recall_score(y_test, y_test_pred))

# accuracy score
print("train:", lr.score(X_train_under, y_train_under))
print("test: ", lr.score(X_test, y_test))

recall_train: 0.8805948174322733
recall_test:  0.8940281030444965
train: 0.8861896348645465
test:  0.8839825973896085


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Oversampling
> SMOTE

> Logistic Regression

In [None]:
# SMOTE
from imblearn.over_sampling import SMOTE

# Oversample the training split only, with a fixed seed
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_over, y_train_over = resampled

# Class counts after resampling
y_train_over.value_counts()

0    73193
1    73193
Name: diabetes, dtype: int64

In [None]:
# Logistic Regression trained on the SMOTE-oversampled training data
from sklearn.linear_model import LogisticRegression

# max_iter is raised above scikit-learn's default of 100 because the solver
# stopped at that limit on this data. If a convergence warning still appears,
# scale the features or raise max_iter further.
lr = LogisticRegression(max_iter=1000)
# A single fit is sufficient; fitting again on the same data only repeats the work.
lr.fit(X_train_over, y_train_over)

y_train_pred = lr.predict(X_train_over)
y_test_pred = lr.predict(X_test)

# recall score
print("recall_train:", recall_score(y_train_over, y_train_pred))
print("recall_test: ", recall_score(y_test, y_test_pred))

# accuracy score
print("train:", lr.score(X_train_over, y_train_over))
print("test: ", lr.score(X_test, y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


recall_train: 0.8945664202860929
recall_test:  0.8436768149882904
train: 0.9024291940486112
test:  0.9040356053408011


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
