In [134]:
import pandas as pd
import numpy as np
import sklearn
import pickle
from sklearn import preprocessing
from sklearn import model_selection as ms
from sklearn import linear_model

In [135]:
# Đọc file csv, xóa cột số thứ tự và cột AveMonthSpend
csv = pd.read_csv('CSV.csv').drop(['Unnamed: 0','AveMonthSpend','CustomerID'], axis=1)
#csv = pd.read_csv('CSV.csv').drop(['Unnamed: 0'], axis=1)
csv.head()

Unnamed: 0,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,BikeBuyer,Age
0,Bachelors,Professional,M,M,1,0,0,2,137947,0,2545
1,Bachelors,Professional,M,S,0,1,3,3,101141,1,2545
2,Bachelors,Professional,M,M,1,1,3,3,91945,0,2545
3,Bachelors,Professional,F,S,0,1,0,0,86688,0,2545
4,Bachelors,Professional,F,S,1,4,5,5,92771,1,2545


In [136]:
# Kiểm tra sự cân bằng của cột bike buyer
# Theo số liệu, ta có thể thấy số lượng những người chưa mua xe lớn xấp xỉ gấp đôi 
# số lượng người không mua xe, do đó phải điều chỉnh các siêu tham số để tăng độ chính xác
csv['BikeBuyer'].value_counts()

0    10949
1     5455
Name: BikeBuyer, dtype: int64

In [137]:
# Tạo label
labels = np.array(csv['BikeBuyer'])

In [138]:
# Chuyển các dữ liệu dạng biến phân loại về dạng binary dummy, sử dụng one hot code 
def encode_string(cat_feature, name_col):
    enc = preprocessing.LabelEncoder()
    enc = enc.fit(cat_feature)
    pickle.dump(enc, open('encode' + name_col + '.sav', 'wb'))  
    enc_cat_feature = enc.transform(cat_feature)
    
    ohe = preprocessing.OneHotEncoder(categories='auto')
    return ohe.fit_transform(enc_cat_feature.reshape(-1,1)).toarray()

col_to_encodes = ['Occupation','Gender','MaritalStatus',
                  'HomeOwnerFlag','Age']
features = encode_string(csv['Education'], 'Education')
for col in col_to_encodes:
    tmp = encode_string(csv[col], col)
    features = np.concatenate([features, tmp], axis = 1)
print (features.shape)

(16404, 20)


In [139]:
# Thêm các giá trị số vào features 
features = np.concatenate([features, np.array(csv[['NumberCarsOwned','NumberChildrenAtHome',
                                                   'TotalChildren','YearlyIncome']])], axis = 1)
print (features.shape)

(16404, 24)


In [140]:
# Tách thành 2 tập train và test
indx = range(features.shape[0])
indx = ms.train_test_split(indx, test_size = 4000)
x_train = features[indx[0],:]
y_train = np.ravel(labels[indx[0]])
x_test = features[indx[1],:]
y_test = np.ravel(labels[indx[1]])

In [143]:
# Chuẩn hóa z-score cho các trường
zscore = preprocessing.StandardScaler()
x_train[:,20:] = zscore.fit_transform(x_train[:,20:])
x_test[:,20:] = zscore.fit_transform(x_test[:,20:])

In [149]:
# Áp dụng mô hình logistic 
linear_mode = linear_model.LogisticRegression()
linear_mode.fit(x_train, y_train)
pickle.dump(linear_mode, open('final_classification.sav', 'wb'))  



In [150]:
score_proba = linear_mode.predict_proba(x_test)
score = []
for x in score_proba: 
    if x[0] > 0.6: score += [0]
    else: score += [1]
print (score)

[1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 

In [151]:
from sklearn import metrics as sklm
accuracy = sklm.accuracy_score(y_test, score)
print (accuracy)

0.795


In [152]:
labels = labels.reshape(labels.shape[0],)
scoring = ['precision_macro', 'recall_macro', 'accuracy']
logistic_mod = linear_model.LogisticRegression(C = 1.0, class_weight = {0:0.4, 1:0.6}) 
scores = ms.cross_validate(logistic_mod, features, labels, scoring=scoring,
                        cv=10, return_train_score=False)

def print_format(f,x,y,z):
    print('Fold %2d    %4.3f        %4.3f      %4.3f' % (f, x, y, z))

def print_cv(scores):
    fold = [x + 1 for x in range(len(scores['test_precision_macro']))]
    print('         Precision     Recall       AUC')
    [print_format(f,x,y,z) for f,x,y,z in zip(fold, scores['test_precision_macro'], 
                                          scores['test_recall_macro'],
                                          scores['test_accuracy'])]
    print('-' * 40)
    print('Mean       %4.3f        %4.3f      %4.3f' % 
          (np.mean(scores['test_precision_macro']), np.mean(scores['test_recall_macro']), np.mean(scores['test_accuracy'])))  
    print('Std        %4.3f        %4.3f      %4.3f' % 
          (np.std(scores['test_precision_macro']), np.std(scores['test_recall_macro']), np.std(scores['test_accuracy'])))

print_cv(scores)   



         Precision     Recall       AUC
Fold  1    0.749        0.745      0.777
Fold  2    0.758        0.761      0.785
Fold  3    0.750        0.750      0.778
Fold  4    0.756        0.757      0.784
Fold  5    0.746        0.751      0.774
Fold  6    0.765        0.763      0.791
Fold  7    0.760        0.762      0.787
Fold  8    0.763        0.754      0.789
Fold  9    0.742        0.738      0.771
Fold 10    0.748        0.749      0.777
----------------------------------------
Mean       0.754        0.753      0.781
Std        0.007        0.007      0.007


In [121]:
point = pd.DataFrame(score)
print (point.shape)
point.to_csv('KQ.csv')

(4000, 1)
