## Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import os
path = ''
# 自定义数据的header
pdData = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
# 数据的维度
pdData.shape

In [None]:
# 获取正负样本的数据
positive = pdData[pdData['Admitted'] == 1]
negative = pdData[pdData['Admitted'] == 0]

# 散点图正负样本数据情况
fig, ax = plt.subplots(figsize=(10, 5))
ax.sactter(positive['Exam 1'], positive['Exam 2'], s=30, c='b', marker='o', label='Admitted')
ax.sactter(negative['Exam 1'], negative['Exam 2'], s=30, c='r', marker='x', label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')

### The Logistic Regression
目标：建立分类器（求解出三个参数θ1，2，3）

设定阈值，根据阈值判定录取结果

要完成的模块：
* sigmoid:映射到概率的函数
* model:返回预测结果值
* cost:根据参数计算损失
* gradient:计算每个参数的梯度方向
* descent:进行参数更新
* accuracy:计算精度

## 案例分析

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
data = pd.read_csv('creditcard.csv')
data.head()

count_classes = pd.value_counts(data['Class'], sort=True).sort_index()
count_classes.plot(kind='bar')
plot.title('Fruad class histogram')
plot.xlabel('Class')
plot.ylabel('Frequency')

### 解决样本不均衡问题
* 下采样:从数据量多的样本中抽取部分样本，同样少，有点是recall高但是误杀率也很高
* 过采样:生成策略，同样多

In [None]:
# 为了使每个特征重要程度相当，需要对数据做归一化或正则化
from sklearn.preprocessing import StandarScaler
# fit_transform对数据原特征进行变换得到新的特征
data['normAmount'] = StandarScaler().fit_transform(data['Amount']).reshape(-1, 1)
# 舍弃不需要的特征
data = data.drop(['Time', 'Amount'], axis=1)
data.head()

### 下采样策略

In [None]:
# 数据切分成特征和标签
X = data.iX[:, data.columns != 'Class']
Y = data.iY[:, data.columns == 'Class']

number_records_fraud = len(data[data.Class == 1])
# 获取符合条件的数据索引
fraud_indices = np.array(data[data.Class == 1].index)
normal_indices = data[data.Class == 0].index

# 对数据进行下采样并获取索引值
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=True)
random_normal_indices = np.array(random_normal_indices)

# 合并数据
under_sample_indeces = np.concatenate([fraud_indices, random_normal_indices])
under_sample_data = data.iloc[under_sample_indeces, :]

X_undersample = under_sample_data.iX[:, under_sample_data.columns != 'Class']
Y_undersample = under_sample_data.iY[:, under_sample_data.columns == 'Class']

### 交叉验证
recall = TP / (TP + FN)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import cross_val_score

# random_state可以复现
# 对整个数据集进行切分
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
# 对进行下采样过的数据进行切分
# 得到的测试集比较少而且不具备原有数据的分布情况
x_train_undersample, x_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample, Y_undersample, test_size=0.3, random_state=0)

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score, classification_report
from sklearn.cross_validation import cross_val_score, KFold

ModuleNotFoundError: No module named 'sklearn.cross_validation'

In [None]:
def printing_Kfold_scores(x_train, y_train):
    fold = KFold(len(y_train), 5, shuffle=False)
    # 不同的正则化参数
    c_param_range = [0.01, 0.1, 1, 10, 100]
    
    result_table = pd.DataFrame(index=range(len(c_param_range), 2), columns=['C_parameter', 'Mean recall score'])
    result_table['C_parameter'] = c_param_range
    j = 0
    for c_param in c_param_range:
        reclass_accs = []
        for iteration, indices in enumerate(fold, start=1):
            # L1惩罚项
            lr = LogisticRegression(C=c_param, penalty='ll')
            # 模型构建
            lr.fit(x_train.iloc[indices[0], :], y_train.iloc[indices[1], :].values.ravel())
            # 模型预测
            y_pred_undersample = lr.predict(x_train[indices[1], :].values)
            recall_acc = recall_score(y_train.iloc[indices[1], :].values, y_pred_undersample)
            reclass_accs.append(recall_acc)
            print('Iteration', iteration, ':recall score=', recall_acc)

        result_table.ix[j,'Mean recall score'] = np.mean(recall_accs)
        j + =1
    best_c = result_table.loc[result_table['Mean recall score'].idxmax()]['C_parameter']
    return best_c

### 过采样
SMOTE样本生成策略

In [None]:
import pandas as pd
from imblearn.over_sampling import SOMTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_sample(x_train, y_train)

os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)
best_c = printing_KFold_scores(os_features, os_labels)


### 总结
在数据样本不均衡的情况下，为了衡量recall和accuracy，采用过采样的策略也许能够得到较好的效果