### 分类算法
该算法能根据Red Hat的特征和活动准确识别哪些客户具有最大的潜在商业价值<br>
https://www.kaggle.com/c/predicting-red-hat-business-value

逻辑回归

In [1]:
# read the data
import pandas as pd
import numpy as np
dir = 'data/01_redHat/'
df_people = pd.read_csv(dir+'people.csv')
df_act_train = pd.read_csv(dir+'act_train.csv')

### 整理数据<br>
将数据中缺失值很多的列删掉后再合并两表

In [2]:
# 删除act_train表缺失值太多的维度 
x = ['date','char_1','char_2','char_3','char_4','char_5','char_6','char_7','char_8','char_9']
df_act_train.drop(x, axis=1,inplace=True)
#df = pd.merge(df_people,df_act_train, on='people_id')

In [3]:
df_act_train.iloc[0] # 删除完后

people_id                 ppl_100
activity_id          act2_1734928
activity_category          type 4
char_10                   type 76
outcome                         0
Name: 0, dtype: object

In [4]:
df_act_train.columns

Index(['people_id', 'activity_id', 'activity_category', 'char_10', 'outcome'], dtype='object')

In [5]:
# 删除people表中缺失值太多数据
df_people.drop(['date'], axis=1, inplace=True)

In [6]:
# join people表和act_train表
df = pd.merge(df_people,df_act_train, on='people_id')
# 删除对模型无用维度：people id, group_1, activity id
df.drop(['people_id', 'group_1', 'activity_id'], axis=1, inplace=True)

In [7]:
df.shape

(2197291, 41)

### 预处理数据
缺失值填充、连续值归一化、类别值转换

In [8]:
# 缺失值处理
df.isnull().any()

char_1               False
char_2               False
char_3               False
char_4               False
char_5               False
char_6               False
char_7               False
char_8               False
char_9               False
char_10_x            False
char_11              False
char_12              False
char_13              False
char_14              False
char_15              False
char_16              False
char_17              False
char_18              False
char_19              False
char_20              False
char_21              False
char_22              False
char_23              False
char_24              False
char_25              False
char_26              False
char_27              False
char_28              False
char_29              False
char_30              False
char_31              False
char_32              False
char_33              False
char_34              False
char_35              False
char_36              False
char_37              False
c

In [9]:
# 发现char_10_y有缺失值
#df.dropna(how="all") # 将行全为NaN的删除
df.fillna(method='pad',inplace=True)

In [10]:
df.dtypes

char_1               object
char_2               object
char_3               object
char_4               object
char_5               object
char_6               object
char_7               object
char_8               object
char_9               object
char_10_x              bool
char_11                bool
char_12                bool
char_13                bool
char_14                bool
char_15                bool
char_16                bool
char_17                bool
char_18                bool
char_19                bool
char_20                bool
char_21                bool
char_22                bool
char_23                bool
char_24                bool
char_25                bool
char_26                bool
char_27                bool
char_28                bool
char_29                bool
char_30                bool
char_31                bool
char_32                bool
char_33                bool
char_34                bool
char_35                bool
char_36             

In [13]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# 将每一列都转换为categorical，除了char_38是连续值
for i in df.columns:
    if i != 'char_38':
        #df[i] = df[i].astype('category')
        df[i] = le.fit_transform(df[i])# 注意要先label encoder一下，不然会报错：string不能转换
        df[i] = pd.Categorical(df[i])
# 将char_38归一化
max_min_scaler = lambda x:(x-np.min(x))/(np.max(x)-np.min(x))
df[['char_38']].apply(max_min_scaler)

df.dtypes

char_1               category
char_2               category
char_3               category
char_4               category
char_5               category
char_6               category
char_7               category
char_8               category
char_9               category
char_10_x            category
char_11              category
char_12              category
char_13              category
char_14              category
char_15              category
char_16              category
char_17              category
char_18              category
char_19              category
char_20              category
char_21              category
char_22              category
char_23              category
char_24              category
char_25              category
char_26              category
char_27              category
char_28              category
char_29              category
char_30              category
char_31              category
char_32              category
char_33              category
char_34   

### 切分训练、测试数据集

In [15]:
# 将特征X和标签y分开
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

# 划分训练数据集和测试数据集
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

### 拟合模型——逻辑回归

In [16]:
from sklearn.linear_model import LogisticRegression
# C是正则化的力度，1e4=10000
# solver是用于优化的算法：‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’
logreg = LogisticRegression(C=1e4, solver='lbfgs')
logreg.fit(x_train, y_train)

LogisticRegression(C=10000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [17]:
# 获得模型的参数
logreg.get_params()

{'C': 10000.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'ovr',
 'n_jobs': 1,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [18]:
# 为模型打分
logreg.score(x_test, y_test)

0.8212164056384521

In [19]:
# 交叉验证
from sklearn.model_selection import cross_val_score
cross_val_score(logreg, x_train, y_train, cv=3, scoring='accuracy')

array([0.82046881, 0.82124474, 0.82094597])