In [2]:
import pandas as pd
import numpy as np
from statsmodels.api import Logit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [28]:
df = pd.read_csv("data/iris.csv")

In [29]:
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [30]:
df["Species"].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [37]:
#df["is_setosa"] = [1 if x == "setosa" else 0 for x in df["Species"]]
df["is_setosa"] = (df["Species"]=="setosa")+0

In [38]:
df.loc[:,["Species","is_setosa"]]

Unnamed: 0,Species,is_setosa
0,setosa,1
1,setosa,1
2,setosa,1
3,setosa,1
4,setosa,1
...,...,...
145,virginica,0
146,virginica,0
147,virginica,0
148,virginica,0


In [42]:
model = Logit(endog=df["is_setosa"],
             exog=df.iloc[:,:4]).fit()

PerfectSeparationError: Perfect separation detected, results not available

In [43]:
#위의 에러는 값이 너무 똑같거나 하면 발생하는 에러
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species,is_setosa
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1


In [44]:
model = Logit(endog=df["is_setosa"],
             exog=df.iloc[:,:2]).fit()

Optimization terminated successfully.
         Current function value: 0.036374
         Iterations 11


In [45]:
df.iloc[:,:2].head()

Unnamed: 0,Sepal.Length,Sepal.Width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6


In [46]:
model.params

Sepal.Length    -7.529945
Sepal.Width     13.130734
dtype: float64

In [47]:
model.pvalues

Sepal.Length    0.000828
Sepal.Width     0.000989
dtype: float64

In [52]:
pred = model.predict(df.iloc[:3,:2])
pred

0    0.999477
1    0.923824
2    0.998678
dtype: float64

In [53]:
(pred>0.5) + 0

0    1
1    1
2    1
dtype: int32

In [54]:
#model = LogisticRegression(random_state=123).fit()
model = LogisticRegression(random_state=123)
model.fit(X=df.iloc[:,:2],
         y = df["is_setosa"])

In [55]:
model.coef_

array([[-3.38829757,  3.1645277 ]])

In [56]:
model.intercept_

array([8.32330389])

In [65]:
pred = model.predict_proba(df.iloc[:3,:2])
print(pred)

[[0.10727976 0.89272024]
 [0.22895365 0.77104635]
 [0.07413821 0.92586179]]


In [66]:
pred = pred[:,1]
pred

array([0.89272024, 0.77104635, 0.92586179])

In [74]:
(pred > 0.5) + 0

array([1, 1, 1])

In [76]:
pred = model.predict_proba(df.iloc[:,:2])
pred = pred[:,1]
pred[:10]

array([0.89272024, 0.77104635, 0.92586179, 0.92738323, 0.94126096,
       0.91436651, 0.97058885, 0.89484454, 0.93034007, 0.82210603])

In [77]:
from sklearn.metrics import roc_auc_score

In [78]:
roc_auc_score(y_true=df["is_setosa"],
             y_score=pred)

0.9999999999999999

In [80]:
accuracy_score(y_true=df["is_setosa"],
              y_pred=(pred>0.9)+0)

0.8333333333333334

In [81]:
#나머지 함수들도 유사하다

In [94]:
#문제 1

In [95]:
from statsmodels.api import Logit
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [96]:
df = pd.read_csv("data/diabetes.csv")

In [97]:
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [98]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [141]:
df_sub = df.loc[:,["Outcome","BloodPressure", "Glucose", "BMI", "Insulin"]]

In [142]:
df_sub.head()

Unnamed: 0,Outcome,BloodPressure,Glucose,BMI,Insulin
0,1,72,148,33.6,0
1,0,66,85,26.6,0
2,1,64,183,23.3,0
3,0,66,89,28.1,94
4,1,40,137,43.1,168


In [143]:
train, test = train_test_split(df_sub, train_size=0.8, random_state=123)

#train, test = train_test_split(df, test_size=0.8, random_state=123)
#train = train.loc[:,["Outcome","BloodPressure", "Glucose", "BMI", "Insulin"]]
#test = test.loc[:,["Outcome","BloodPressure", "Glucose", "BMI", "Insulin"]]
                  

In [144]:
train.head()

Unnamed: 0,Outcome,BloodPressure,Glucose,BMI,Insulin
318,0,66,115,38.1,140
313,0,50,113,29.5,85
195,1,84,158,39.4,210
570,0,70,78,32.5,0
226,0,76,101,35.7,0


In [145]:
test.head()

Unnamed: 0,Outcome,BloodPressure,Glucose,BMI,Insulin
236,1,84,181,35.9,192
395,0,58,127,27.7,275
36,0,76,138,33.2,0
210,0,60,81,27.7,0
483,0,82,84,38.2,125


In [146]:
model = Logit(endog = train["Outcome"], exog = train.iloc[:,1:]).fit()

Optimization terminated successfully.
         Current function value: 0.626579
         Iterations 5


In [147]:
pred = model.predict(test.iloc[:,1:])
pred = (pred > 0.5) +0

In [148]:
test["Outcome"]

236    1
395    0
36     0
210    0
483    0
      ..
650    0
579    1
119    0
593    0
310    0
Name: Outcome, Length: 154, dtype: int64

In [149]:
pred

236    0
395    1
36     0
210    0
483    0
      ..
650    0
579    1
119    0
593    0
310    0
Length: 154, dtype: int32

In [150]:
accuracy_score(y_true=test["Outcome"], y_pred=pred)

0.7012987012987013

In [151]:
#문제 2

In [152]:
import pandas as pd
import numpy as np
from statsmodels.api import Logit

In [153]:
df = pd.read_csv("data/diabetes.csv")

In [154]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [186]:
df_ind=df.loc[:, ["Glucose", "BMI", "Age"]]

In [187]:
model0 = Logit(endog = df["Outcome"], exog=df_ind).fit()

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


In [188]:
model0.params

Glucose    0.009368
BMI       -0.035639
Age       -0.012898
dtype: float64

In [189]:
pred0 = model0.predict(df_ind)

In [190]:
pred0 = (pred0 > 0.5) + 0

In [191]:
odd = np.exp(model0.params["Age"])

In [192]:
odd

0.9871844697218117

In [193]:
#문제 3

In [197]:
import pandas as pd
from statsmodels.api import Logit
from sklearn.metrics import roc_auc_score

In [198]:
df = pd.read_csv("data/diabetes.csv")

In [199]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [202]:
model = Logit(endog=df["Outcome"], exog=df.loc[:,["Glucose","BMI","Age"]]).fit()

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


In [205]:
pred = model.predict(df.loc[:,["Glucose","BMI","Age"]])

In [206]:
pred

0      0.387961
1      0.365506
2      0.615678
3      0.392087
4      0.336654
         ...   
763    0.261357
764    0.373590
765    0.453351
766    0.377879
767    0.375465
Length: 768, dtype: float64

In [207]:
roc_auc_score(y_true=df["Outcome"], y_score=pred)

0.5414253731343283