## 로지스틱 회귀

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the
# course files stored there can be read by the following cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/강의자료/머신러닝기초/3일차_회귀

/content/drive/MyDrive/강의자료/머신러닝기초/3일차_회귀


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression

```python
# Load the problem-solving history CSV, using its first column as the index.
csv_path = './datasets/초등_4학년_수학_차시문항풀이이력.csv'
df = pd.read_csv(csv_path, index_col=0)

# Let pandas display every column instead of truncating wide frames.
pd.set_option('display.max_columns', df.shape[1])

# Preview the first rows.
df.head()
```

```python
# Encode the answer mark as a binary label: 'O' -> 1, 'X' -> 0.
# Any value that is not in the mapping becomes NaN.
answer_to_label = {'O': 1, 'X': 0}
df['정오답'] = df['정오답'].map(answer_to_label)
```

Q. 어떤 피쳐가 정오답에 영향을 미칠까?

```python
# Keep only the label and the candidate feature columns.
# The label ('정오답') is listed first because later cells take the features
# positionally with iloc[:, 1:].
keep_columns = [
    '정오답',
    '강의타입', '동영상재생시간', '실제재생시간', '학습일',
    '문항코드', '대단원코드', '중단원코드', '소단원코드', '토픽코드',
    '난이도', '평가영역',
]
df = df[keep_columns]
```

```python
# Discard rows whose label is missing; they cannot be used for training.
has_label = df['정오답'].notna()
df = df[has_label]
```

```python
# Work on an explicit copy: df is the result of earlier filtering, and
# assigning columns to such a frame can trigger SettingWithCopyWarning.
df = df.copy()

# Parse the study timestamp in one vectorized call instead of row by row,
# then derive the hour of day through the .dt accessor.
df['학습일'] = pd.to_datetime(df['학습일'])
df['hour'] = df['학습일'].dt.hour
```

카테고리형 변수 원핫인코딩

```python
# One-hot encode the categorical columns (including the derived 'hour').
# Each listed column is replaced by one indicator column per distinct value.
categorical_columns = [
    '강의타입', '문항코드', '대단원코드', '중단원코드', '소단원코드',
    '토픽코드', '난이도', '평가영역', 'hour',
]
df = pd.get_dummies(data=df, columns=categorical_columns)
```

```python
# The raw timestamp is no longer needed once 'hour' has been extracted.
df = df.drop(columns=['학습일'])
```

```python
# Features: every column after the first one (column 0 holds the label).
feature_frame = df.iloc[:, 1:]
X = feature_frame.to_numpy()

# Target: the binary label column.
Y = df.loc[:, '정오답'].to_numpy()
```

```python
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts
```

```python
from sklearn.metrics import accuracy_score, roc_auc_score

# Train a logistic-regression classifier and predict on the held-out split.
# When no solver is passed to the constructor, solver='lbfgs' is used.
lr_clf = LogisticRegression()  # solver='lbfgs'
lr_clf.fit(X_train, y_train)
lr_preds = lr_clf.predict(X_test)

# ROC AUC is computed from how the model ranks samples, so it must be given
# the predicted probability of the positive class rather than the
# thresholded 0/1 labels.
lr_proba = lr_clf.predict_proba(X_test)[:, 1]

# Report accuracy (from hard labels) and ROC AUC (from probabilities).
print('accuracy: {0:.3f}, roc_auc:{1:.3f}'.format(accuracy_score(y_test, lr_preds),
                                                 roc_auc_score(y_test, lr_proba)))
```

#### 더 성능을 높여보자.

재생시간 스케일링

```python
# Inspect the two playback-time columns before scaling them.
df.loc[:, ['동영상재생시간', '실제재생시간']]
```

```python
# Standardize the two playback-time columns.
#
# The scaler is fitted on the training rows only, so that the mean and
# variance of the test rows do not leak into the features. Row positions are
# split here with the same test_size and random_state as the
# train_test_split call on X, Y that follows; with an equal number of rows
# that call selects the same rows, so `train_rows` matches X_train.
time_columns = ['동영상재생시간', '실제재생시간']
train_rows, _ = train_test_split(list(range(len(df))), test_size=0.2, random_state=0)

scaler = StandardScaler()
scaler.fit(df[time_columns].iloc[train_rows])

# Apply the training-set statistics to every row (train and test alike).
data_scaled = scaler.transform(df[time_columns])
```

```python
# Overwrite the raw playback times with their standardized values.
scaled_columns = ['동영상재생시간', '실제재생시간']
df[scaled_columns] = data_scaled
```

```python
# Rebuild the arrays from the updated frame: the label is column 0,
# everything after it is a feature.
label_series = df['정오답']
X = df.iloc[:, 1:].to_numpy()
Y = label_series.to_numpy()
```

```python
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Same 80/20 split and seed as before, so both models are scored on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)
```

```python
# Train a logistic-regression classifier on the scaled features and predict
# on the held-out split.
# When no solver is passed to the constructor, solver='lbfgs' is used.
lr_clf = LogisticRegression()  # solver='lbfgs'
lr_clf.fit(X_train, y_train)
lr_preds = lr_clf.predict(X_test)

# ROC AUC is computed from how the model ranks samples, so it must be given
# the predicted probability of the positive class rather than the
# thresholded 0/1 labels.
lr_proba = lr_clf.predict_proba(X_test)[:, 1]

# Report accuracy (from hard labels) and ROC AUC (from probabilities).
print('accuracy: {0:.3f}, roc_auc:{1:.3f}'.format(accuracy_score(y_test, lr_preds),
                                                 roc_auc_score(y_test, lr_proba)))
```

```python
# Show predictions (row 0) next to the true labels (row 1), one column per test sample.
comparison_rows = [lr_preds, y_test]
pd.DataFrame(data=comparison_rows)
```