# Logistic Regression  
逻辑回归

Import useful packages.  
载入有用的程序包。

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import accuracy_score,log_loss,precision_score,roc_auc_score,confusion_matrix
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import statsmodels.api as sm
from scipy import stats

Read the dataset.  
读入数据

In [2]:
train_set=pd.read_csv("gold_train.csv")
test_set=pd.read_csv("gold_test.csv")
train_col=list(train_set)[1:]

In [3]:
train_X=train_set[train_col]
train_y=train_set['target']

test_X=test_set[train_col]
test_y=test_set['target']

In [4]:
print(train_X.head())
print(train_y.head())


     Return  gold Return_1  gold Return_2  gold Return_3  gold Return_4  \
0  0.012297       0.001726      -0.004428       0.000492      -0.001352   
1  0.012797       0.012297       0.001726      -0.004428       0.000492   
2 -0.005526       0.012797       0.012297       0.001726      -0.004428   
3 -0.001453      -0.005526       0.012797       0.012297       0.001726   
4 -0.006796      -0.001453      -0.005526       0.012797       0.012297   

   gold Return_5  gold Return_6  gold Return_7  gold Return_8  gold Return_9  \
0       0.009804       0.001240       0.000248       0.009884       0.008577   
1      -0.001352       0.009804       0.001240       0.000248       0.009884   
2       0.000492      -0.001352       0.009804       0.001240       0.000248   
3      -0.004428       0.000492      -0.001352       0.009804       0.001240   
4       0.001726      -0.004428       0.000492      -0.001352       0.009804   

   ...  gold Return_41  gold Return_42  gold Return_43  gold Return_

In [5]:
scaler = StandardScaler()
scaler.fit(train_X)

train_X=scaler.transform(train_X)
test_X=scaler.transform(test_X)

Build model.  
建立模型并作出预测。

In [6]:
clf_logisticR= linear_model.LogisticRegression(random_state=123)
clf_logisticR.fit(train_X,train_y)
# get the actual prediction.
pred=clf_logisticR.predict(test_X) 
# get the probility
pred_prob=clf_logisticR.predict_proba(test_X) 
# save the prediction
pred_prob_df=pd.DataFrame(pred_prob,columns=["pred0","pred1"])
pred_prob_df.to_csv("logistic regression.csv",index=False)

Print the metrics, show the performance of the model.  
输出指标，得到模型的表现

In [7]:
confusion_matrix(test_y,pred)

array([[83, 58],
       [60, 49]], dtype=int64)

In [8]:
print("log_loss: ",log_loss(test_y,pred_prob))
print("accuracy: ",accuracy_score(test_y,pred))
print("auc :     ",roc_auc_score(test_y,pred_prob[:,1]))
print("precision:",precision_score(test_y,pred))

log_loss:  0.7251426158986086
accuracy:  0.528
auc :      0.5065391372242827
precision: 0.45794392523364486


In [9]:
pred=(pred_prob[:,1] > 0.5)
print("accuracy at 0.5 threshold: ",accuracy_score(test_y,pred))

pred=(pred_prob[:,1] > 0.65)
print("accuracy at 0.6 threshold: ",accuracy_score(test_y,pred))

accuracy at 0.5 threshold:  0.528
accuracy at 0.6 threshold:  0.548


In [10]:
glm_binom = sm.GLM(train_y, train_X, family=sm.families.Binomial())
res = glm_binom.fit()
print(res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 target   No. Observations:                 1000
Model:                            GLM   Df Residuals:                      949
Model Family:                Binomial   Df Model:                           50
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -669.98
Date:                Mon, 12 Oct 2020   Deviance:                       1340.0
Time:                        12:10:59   Pearson chi2:                 1.00e+03
No. Iterations:                     4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.0291      0.068      0.427      0.6

# Linear Discriminant Analysis  
线性判别分析

In [11]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [12]:
clf_lda = LinearDiscriminantAnalysis()
clf_lda.fit(train_X,train_y)
pred=clf_lda.predict(test_X) 
# get the probility
pred_prob=clf_lda.predict_proba(test_X) 
# save the prediction
pred_lda_prob_df=pd.DataFrame(pred_prob,columns=["pred0","pred1"])
pred_lda_prob_df.to_csv("lda.csv",index=False)

In [13]:
confusion_matrix(test_y,pred)

array([[83, 58],
       [60, 49]], dtype=int64)

In [14]:
pred=(pred_prob[:,1] > 0.5)
print("accuracy at 0.5 threshold: ",accuracy_score(test_y,pred))

pred=(pred_prob[:,1] > 0.6)
print("accuracy at 0.6 threshold: ",accuracy_score(test_y,pred))

accuracy at 0.5 threshold:  0.528
accuracy at 0.6 threshold:  0.536


In [15]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [16]:
clf_qda = QuadraticDiscriminantAnalysis()
clf_qda.fit(train_X,train_y)
pred=clf_qda.predict(test_X) 
# get the probility
pred_prob=clf_qda.predict_proba(test_X) 
# save the prediction
pred_qda_prob_df=pd.DataFrame(pred_prob,columns=["pred0","pred1"])
pred_qda_prob_df.to_csv("qda.csv",index=False)

In [17]:
confusion_matrix(test_y,pred)

array([[80, 61],
       [63, 46]], dtype=int64)

In [18]:
pred=(pred_prob[:,1] > 0.5)
print("accuracy at 0.5 threshold: ",accuracy_score(test_y,pred))

pred=(pred_prob[:,1] > 0.8)
print("accuracy at 0.8 threshold: ",accuracy_score(test_y,pred))

accuracy at 0.5 threshold:  0.504
accuracy at 0.8 threshold:  0.54


# Regularization  
正则化

In [19]:
# Logistic Regression with L2 regularizer and regularization strength

clf_lasso= linear_model.LogisticRegression(penalty='l1',C=0.5,random_state=123)
clf_lasso.fit(train_X,train_y)
# get the actual prediction.
pred=clf_lasso.predict(test_X) 
# get the probility
pred_prob=clf_lasso.predict_proba(test_X) 
# save the prediction
pred_lasso_prob_df=pd.DataFrame(pred_prob,columns=["pred0","pred1"])
pred_lasso_prob_df.to_csv("lasso regression.csv",index=False)

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

In [20]:
confusion_matrix(test_y,pred)

array([[99, 42],
       [73, 36]], dtype=int64)

In [21]:
pred=(pred_prob[:,1] > 0.5)
print("accuracy at 0.5 threshold: ",accuracy_score(test_y,pred))

pred=(pred_prob[:,1] > 0.65)
print("accuracy at 0.65 threshold: ",accuracy_score(test_y,pred))

accuracy at 0.5 threshold:  0.504
accuracy at 0.65 threshold:  0.512


In [23]:
# Logistic Regression and Impact of regularizer /regularizing strength
penalty_choice=['l2']
C_choice=[0.0001,0.001,0.01,0.1,1,10,100,1000]

for p in range(len(penalty_choice)):
    for c in range(len(C_choice)):
        clf_model= linear_model.LogisticRegression(penalty=penalty_choice[p],C=C_choice[c],random_state=123)
        clf_model.fit(train_X,train_y)
        pred_class=clf_model.predict(test_X)
        #print(confusion_matrix(test_y,pred_class))
        pred_prob=clf_model.predict_proba(test_X)
        print("Penalty = ",penalty_choice[p]," and C = %.2f"%(C_choice[c]), " --> accuracy = %.4f"%accuracy_score(test_y,pred_class))

Penalty =  l2  and C = 0.00  --> accuracy = 0.5640
Penalty =  l2  and C = 0.00  --> accuracy = 0.5440
Penalty =  l2  and C = 0.01  --> accuracy = 0.5080
Penalty =  l2  and C = 0.10  --> accuracy = 0.5280
Penalty =  l2  and C = 1.00  --> accuracy = 0.5280
Penalty =  l2  and C = 10.00  --> accuracy = 0.5280
Penalty =  l2  and C = 100.00  --> accuracy = 0.5280
Penalty =  l2  and C = 1000.00  --> accuracy = 0.5280
