In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

import statsmodels.api as sm

In [None]:
# Load the hsb2 data set; the first CSV column is used as the row index.
df = pd.read_csv('../data/hsb2.csv', index_col=0)

# Binary target: 1 when the writing score is at least 60, otherwise 0.
# A single vectorised comparison replaces the per-row .loc loop: it yields the
# same int64 0/1 column, but does not iterate in Python and does not break
# when an index label occurs more than once.
df['honcomp'] = (df['write'] >= 60).astype('int64')

df

In [None]:
# Predictors = every column except the target and the columns excluded below.
excluded_columns = ['honcomp', 'race', 'ses', 'schtyp', 'prog', 'write', 'math', 'socst']
independent_variables = df.drop(columns=excluded_columns)

# Plain NumPy arrays for scikit-learn.
x = independent_variables.to_numpy()
y = df['honcomp'].to_numpy()

# C is the inverse regularisation strength, so a very large value makes the
# penalty negligible (presumably to approximate an unpenalised fit - confirm).
lr = LogisticRegression(C=1e8).fit(x, y)

# In-sample predictions: the model is evaluated on the rows it was trained on.
y_pred = lr.predict(x)

print('Coefficients = ', lr.coef_)

In [None]:
# Display the in-sample class predictions produced by lr.predict(x).
y_pred

In [None]:
# Display the predicted class probabilities for every row of x
# (scikit-learn returns one column per class).
lr.predict_proba(x)

In [None]:
# Intercept (bias) term of the fitted scikit-learn logistic model.
print('Intercept = ', lr.intercept_)

In [None]:
# Mean accuracy of the fitted model on the data passed in (here: the training rows).
accuracy = lr.score(x, y)
print('Accuracy {}'.format(accuracy))

In [None]:
# Confusion matrix with the class order fixed to [0, 1]
# (rows = true class, columns = predicted class).
confusion = metrics.confusion_matrix(y, y_pred, labels=[0,1])
print('Confusion = \n{}'.format(confusion))

In [None]:
# Text report of the per-class metrics for the in-sample predictions.
report = classification_report(y, y_pred)
print(report)

In [None]:
# Refit the same model with statsmodels to get its regression summary table.
# Note: x and y are rebound here from NumPy arrays to pandas objects, which
# keeps the column names in the summary output.
x, y = independent_variables, df['honcomp']

# An explicit constant column is added so the model includes an intercept.
x2 = sm.add_constant(x)

logit_model = sm.Logit(y, x2)
result = logit_model.fit()
result.summary()

In [None]:
# Score the scikit-learn model on the same NumPy arrays it was fitted on.
# (x / y may have been rebound to pandas objects by the statsmodels cell;
# going back to the source frame keeps this cell independent of cell order
# and avoids scikit-learn's feature-name mismatch warning.)
x_fit = independent_variables.values
y_true = df['honcomp'].values

# Probability of the positive class (column 1 corresponds to label 1).
y_score = lr.predict_proba(x_fit)[:, 1]

# The AUC must be computed from the same continuous scores that the curve is
# drawn from. Using hard 0/1 predictions (lr.predict) collapses the ROC to a
# single operating point and reports an area that does not match the plot.
logit_roc_auc = roc_auc_score(y_true, y_score)
fpr, tpr, thresholds = roc_curve(y_true, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()