### 分类算法
该算法能根据Red Hat的特征和活动准确识别哪些客户具有最大的潜在商业价值<br>
https://www.kaggle.com/c/predicting-red-hat-business-value

逻辑回归

In [1]:
# read the data
import pandas as pd
import numpy as np
dir = 'data/01_redHat/'
df_people = pd.read_csv(dir+'people.csv')
df_act_train = pd.read_csv(dir+'act_train.csv')

### 理解数据

算法第一步，深入理解所得的数据

In [2]:
df_people.head(2)

Unnamed: 0,people_id,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,...,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,...,False,True,True,False,False,True,True,True,False,36
1,ppl_100002,type 2,group 8688,type 3,2021-01-06,type 28,type 9,type 5,type 3,type 11,...,False,True,True,True,True,True,True,True,False,76


In [3]:
df_act_train.head(2)

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,,,,type 76,0
1,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,,,,type 1,0


### 整理数据<br>
将数据中缺失值很多的列删掉后再合并两表

In [4]:
# 删除act_train表缺失值太多的维度 
x = ['date','char_1','char_2','char_3','char_4','char_5','char_6','char_7','char_8','char_9']

df_act_train.drop(x, axis=1,inplace=True)

df_act_train.iloc[0] # 删除完后
# df_act_train.columns

people_id                 ppl_100
activity_id          act2_1734928
activity_category          type 4
char_10                   type 76
outcome                         0
Name: 0, dtype: object

In [5]:
# 删除people表中缺失值太多数据
df_people.drop(['date'], axis=1, inplace=True)

In [6]:
# join people表和act_train表
df = pd.merge(df_people,df_act_train, on='people_id')

# 删除对模型无用维度：people id, group_1, activity id
df.drop(['people_id', 'group_1', 'activity_id'], axis=1, inplace=True)

df.shape

(2197291, 41)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2197291 entries, 0 to 2197290
Data columns (total 41 columns):
char_1               object
char_2               object
char_3               object
char_4               object
char_5               object
char_6               object
char_7               object
char_8               object
char_9               object
char_10_x            bool
char_11              bool
char_12              bool
char_13              bool
char_14              bool
char_15              bool
char_16              bool
char_17              bool
char_18              bool
char_19              bool
char_20              bool
char_21              bool
char_22              bool
char_23              bool
char_24              bool
char_25              bool
char_26              bool
char_27              bool
char_28              bool
char_29              bool
char_30              bool
char_31              bool
char_32              bool
char_33              bool
char_34    

In [8]:
df.describe() # describe功能似乎只对连续数值有效

Unnamed: 0,char_38,outcome
count,2197291.0,2197291.0
mean,49.98051,0.4439544
std,36.08557,0.4968491
min,0.0,0.0
25%,0.0,0.0
50%,59.0,0.0
75%,82.0,1.0
max,100.0,1.0


### 预处理数据
缺失值填充、连续值归一化、类别值转换<br>
标准化 模块提供一个类StandardScaler

##### 空值的填充

In [9]:
# 缺失值处理
df.isnull().any()

char_1               False
char_2               False
char_3               False
char_4               False
char_5               False
char_6               False
char_7               False
char_8               False
char_9               False
char_10_x            False
char_11              False
char_12              False
char_13              False
char_14              False
char_15              False
char_16              False
char_17              False
char_18              False
char_19              False
char_20              False
char_21              False
char_22              False
char_23              False
char_24              False
char_25              False
char_26              False
char_27              False
char_28              False
char_29              False
char_30              False
char_31              False
char_32              False
char_33              False
char_34              False
char_35              False
char_36              False
char_37              False
c

In [10]:
# 发现char_10_y有缺失值

#df.dropna(how="all") # 将行全为NaN的删除

df['char_10_y']=df['char_10_y'].fillna(method='pad')

df.dtypes

char_1               object
char_2               object
char_3               object
char_4               object
char_5               object
char_6               object
char_7               object
char_8               object
char_9               object
char_10_x              bool
char_11                bool
char_12                bool
char_13                bool
char_14                bool
char_15                bool
char_16                bool
char_17                bool
char_18                bool
char_19                bool
char_20                bool
char_21                bool
char_22                bool
char_23                bool
char_24                bool
char_25                bool
char_26                bool
char_27                bool
char_28                bool
char_29                bool
char_30                bool
char_31                bool
char_32                bool
char_33                bool
char_34                bool
char_35                bool
char_36             

In [11]:
df_test = df.copy() # 对比数据,standardScaler效果查看

#####  MinMaxScaler

In [12]:
df['char_38'].head(2)

0    36
1    36
Name: char_38, dtype: int64

In [13]:
print(df['char_38'].max())
print(df['char_38'].min())

100
0


In [14]:
from sklearn import preprocessing
# 将char_38归一化
# max_min_scaler = lambda x:(x-np.min(x))/(np.max(x)-np.min(x))
# df[['char_38']].apply(max_min_scaler)

minmax = preprocessing.MinMaxScaler() # 引入归一化类

minmax.fit(df[['char_38']])
df['char_38'] = minmax.transform(df[['char_38']])

  return self.partial_fit(X, y)


In [15]:
df['char_38'].head(2)

0    0.36
1    0.36
Name: char_38, dtype: float64

#####  StandardScaler

In [16]:
# 将char_38用standardScaler归一化
scaler = preprocessing.StandardScaler() #引入标准化类

scaler.fit(df_test[['char_38']])

scaler.mean_ # 标准化前的均值

scaler.scale_ # 标准化前的标准差

scaler.var_ # 标准化前的方差

standard_data = pd.DataFrame(scaler.transform(df_test[['char_38']]))

standard_data.mean() # 归一化后的数据均值

standard_data.std() # 归一化后的方差

# 恢复数据操作
# scaler.inverse_transform(df[['char_38']])

  return self.partial_fit(X, y)
  if sys.path[0] == '':


0    1.0
dtype: float64

In [17]:
df_test['char_38'].head(2)

0    36
1    36
Name: char_38, dtype: int64

In [18]:
standard_data.head(2)

Unnamed: 0,0
0,-0.387427
1,-0.387427


In [19]:
standard_data.describe()

Unnamed: 0,0
count,2197291.0
mean,-1.5625340000000002e-17
std,1.0
min,-1.385056
25%,-1.385056
50%,0.2499473
75%,0.8873214
max,1.386136


In [20]:
df_test['char_38'] = standard_data

#####  分类特征处理

In [21]:
df.head(4)

Unnamed: 0,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10_x,...,char_32,char_33,char_34,char_35,char_36,char_37,char_38,activity_category,char_10_y,outcome
0,type 2,type 2,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,...,False,False,True,True,True,False,0.36,type 4,type 76,0
1,type 2,type 2,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,...,False,False,True,True,True,False,0.36,type 2,type 1,0
2,type 2,type 2,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,...,False,False,True,True,True,False,0.36,type 2,type 1,0
3,type 2,type 2,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,...,False,False,True,True,True,False,0.36,type 2,type 1,0


**LabelEncoder**

In [22]:
le = preprocessing.LabelEncoder() # 引入LabelEncoder

# # 将每一列都转换为categorical，除了char_38是连续值
# for i in df.columns:
#     if i != 'char_38':
#         #df[i] = df[i].astype('category')
#         df[i] = le.fit_transform(df[i])# 注意要先label encoder一下，不然会报错：string不能转换
#         df[i] = pd.Categorical(df[i])
# df.dtypes

# char_10_y le 操作报错，暂时先删掉进行后续步骤
#df.drop(['char_10_y'], axis=1, inplace=True)

for i in df.columns:
    if i != 'char_38':
        df[i] = le.fit_transform(df[i])
        print(df[i].unique())

[1 0]
[1 2 0]
[38 20 33 34  5 40  1 39  2  0 11 42 41  3 13  8 19 21  6 10  7  9 22 18
 24 12 15 16 32 23  4 31 28 25 14 17 29 26 30 27 35 37 36]
[20 24 23 17 21 22  1  3 11  0 18 19  6  4  9  7 10  5  2 13 14  8 15 16
 12]
[4 8 3 7 6 5 1 0 2]
[2 1 0 3 5 4 6]
[ 2 20 23 22 24 12 11  8 15  0 21 18  4  1  5 13  6 16  9  3 14 19  7 10
 17]
[1 2 5 7 4 3 6 0]
[1 3 2 5 7 4 8 6 0]
[1 0]
[0 1]
[0 1]
[1 0]
[1 0]
[0 1]
[1 0]
[0 1]
[0 1]
[0 1]
[0 1]
[1 0]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[1 0]
[1 0]
[0 1]
[1 0]
[1 0]
[0 1]
[0 1]
[1 0]
[1 0]
[1 0]
[0 1]
[3 1 2 4 0 6 5]
[5382    0  762 ... 5147 4695 5172]
[0 1]


**One-hot**<br>

如果sklearn版本不够高时，会报错

In [23]:
enc = preprocessing.OneHotEncoder() # 引入onehot编码类

enc.fit(df_test[['char_1','char_2']])

OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

### 切分训练、测试数据集

In [25]:
# 将特征X和标签y分开
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

# 划分训练数据集和测试数据集
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

### 拟合模型——逻辑回归

In [26]:
from sklearn.linear_model import LogisticRegression
# C是正则化的力度，1e4=10000
# solver是用于优化的算法：‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’
logreg = LogisticRegression(C=1e3, solver='liblinear')
logreg.fit(x_train, y_train)

# 注：试了newton-cg, lbfgs都不能收敛，但模型得到的分数都没啥区别？？
# 改变C似乎也没影响模型的分数？？
# C=1e4,score= 0.836

LogisticRegression(C=1000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [27]:
# 获得模型的参数
logreg.get_params()

{'C': 1000.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [28]:
# 为模型打分
logreg.score(x_test, y_test)

0.8364063059400353

### 交叉验证

In [29]:
# 交叉验证
from sklearn.model_selection import cross_val_score
cross_val_score(logreg, x, y, cv=5, scoring='accuracy')

array([0.84547136, 0.8113931 , 0.84752809, 0.85475518, 0.81848736])

In [30]:
# ROC曲线
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import auc
# 预测结果
y_predict = logreg.predict(x_test)#分类
y_predict_prob = logreg.predict_proba(x_test)#概率
print(y_predict)
print(y_predict_prob)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_predict_prob[:,0], pos_label=2)
print(fpr)
print(tpr)

#画图
plt.plot(fpr, tpr, marker = 'o')
plt.show()

[1 0 1 ... 1 0 0]
[[3.50221613e-01 6.49778387e-01]
 [9.99116805e-01 8.83195457e-04]
 [1.40942012e-01 8.59057988e-01]
 ...
 [1.58202464e-01 8.41797536e-01]
 [9.98700819e-01 1.29918076e-03]
 [9.98703677e-01 1.29632323e-03]]




[0.00000000e+00 3.03403581e-06 9.10210744e-06 ... 9.99986347e-01
 9.99990898e-01 1.00000000e+00]
[nan nan nan ... nan nan nan]


<Figure size 640x480 with 1 Axes>

为什么tpr都是nan？？？没有真正例吗？？

In [14]:
# 求AUC
from sklearn.metrics import auc
AUC = auc(fpr, tpr)