In [1]:
import random
import numpy as np
import pandas as pd
import IPython.core.display as di
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor


In [2]:
random.seed(9001)
pd.options.mode.chained_assignment = None
pd.set_option('display.float_format', lambda x: '%.4f' % x)
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)


In [3]:
def importdataset(path):
    df = pd.read_csv(path)
    df1 = df.dropna()
    print ("size of dataset  :", df.shape[0])
    print ("size of features :", df.shape[1])
    print ("size of NaN      :", df.shape[0] - df1.shape[0])
    return df

def processing(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val).reshape(y_val.shape[0], 1)
    y_pred = model.predict(X_val).reshape(y_val.shape[0], 1)

    print (y_val.label.value_counts())
    print ('Misclassified samples: %d' % (y_val != y_pred).sum())
    print ('Accuracy          : %.4f' % ((y_val == y_pred).sum() / y_val.shape[0]))
    print ('Accuracy (sklearn): %.4f' % accuracy_score(y_val, y_pred))
    print ("========== confusion matrix ==========")
    print (confusion_matrix(y_val, y_pred, labels = [0,1,2], sample_weight = None))
    return y_pred


In [4]:
df = importdataset("train_clean.csv")
df.head()

size of dataset  : 2756003
size of features : 11
size of NaN      : 0


Unnamed: 0,lineID,day,pid,adFlag,availability,competitorPrice,price,revenue,diff_price,number,label
0,1,1,6570,0,2,14.6,16.89,0.0,2.29,0.0,0
1,2,1,14922,1,1,8.57,8.75,0.0,0.18,0.0,1
2,3,1,16382,0,1,14.77,16.06,0.0,1.29,0.0,1
3,4,1,1145,1,1,6.59,6.55,6.55,-0.04,1.0,2
4,5,1,3394,0,1,4.39,4.14,4.14,-0.25,1.0,2


In [5]:
item = importdataset("items_clean.csv")
item.head()

size of dataset  : 22035
size of features : 10
size of NaN      : 0


Unnamed: 0,pid,manufacturer,group,content,unit,pharmForm,genericProduct,salesIndex,category,rrp
0,1,1,529,80.0,7,135,0,40,3.0,10.89
1,2,1,529,80.0,7,135,0,40,3.0,10.89
2,3,1,529,10.0,1,45,0,40,3.0,16.45
3,4,1,529,80.0,7,135,0,40,3.0,10.89
4,5,2,74,8.0,7,112,0,40,1.0,22.53


In [6]:
sc = StandardScaler(with_mean=True, with_std=True)
def preprocessing(train, test):
    # X and y
    X = train[col]
    y = train[["label"]]
    X_test = test[col]
    y_test = test[["label"]]
    # StandardScaler
    X = pd.DataFrame(sc.fit_transform(X))
    X_test = pd.DataFrame(sc.fit_transform(X_test))
    # class_weight
    cw0 = y.label.value_counts()[0]/y.shape[0]
    cw1 = y.label.value_counts()[1]/y.shape[0]
    cw2 = y.label.value_counts()[2]/y.shape[0]
    print('Number of train            : %d' % train.shape[0])
    print('Number of test             : %d' % test.shape[0])
    print ("size of training features :", X.shape[1])
    print ("=== Distribution of label in validation ===")
    print ("class weight of label_0   :", cw0)
    print ("class weight of label_1   :", cw1)
    print ("class weight of label_2   :", cw2)
    
    return X, y, X_test, y_test, cw0, cw1, cw2 

def preprocessing_wb(train, test):
    # without basket
    train_ = train[(train.label != 1)]
    # X and y
    X = train_[col]
    y = train_[["label"]]
    X_test = test[col]
    y_test = test[["label"]]
    # StandardScaler
    X = pd.DataFrame(sc.fit_transform(X))
    X_test = pd.DataFrame(sc.fit_transform(X_test))
    # class_weight
    cw0 = y.label.value_counts()[0]/y.shape[0]
    cw2 = y.label.value_counts()[2]/y.shape[0]
    print('Number of train            : %d' % train.shape[0])
    print('Number of test             : %d' % test.shape[0])
    print ("size of training features :", X.shape[1])
    print ("=== Distribution of label in validation ===")
    print ("class weight of label_0   :", cw0)
    print ("class weight of label_2   :", cw2)
    
    return X, y, X_test, y_test, cw0, cw2 


### 利用pid的資訊合併建模

In [7]:
df_ = pd.merge(df, item, how='inner', on=['pid']).sort_values(['lineID'], ascending = True).reset_index(drop = True)

train = df_[(df_.day <= 62)].sort_values(['lineID'], ascending = True).reset_index(drop = True)
test = df_[(df_.day > 62)].sort_values(['lineID'], ascending = True).reset_index(drop = True)

col = ['adFlag','availability','competitorPrice','price','manufacturer','group','content','unit',
       'pharmForm','genericProduct','salesIndex','category','rrp']
X, y, X_test,y_test, cw0, cw1, cw2 = preprocessing(train, test)


Number of train            : 1782748
Number of test             : 973255
size of training features : 13
=== Distribution of label in validation ===
class weight of label_0   : 0.562037371519
class weight of label_1   : 0.17458496658
class weight of label_2   : 0.263377661902


In [8]:
tree = DecisionTreeClassifier(criterion = 'entropy', 
                              max_depth = 5, 
                              random_state = 9001,
                              class_weight = {0:cw0, 1:cw1, 2:cw2})
pred_tree = processing(tree, X, y, X_test, y_test)

0    580856
2    235554
1    156845
Name: label, dtype: int64
Misclassified samples: 392201
Accuracy          : 0.5970
Accuracy (sklearn): 0.5970
[[579882    584    390]
 [156080    236    529]
 [234247    371    936]]


### without basket

In [9]:
X, y, X_test, y_test, cw0, cw2 = preprocessing_wb(train, test)

Number of train            : 1782748
Number of test             : 973255
size of training features : 13
=== Distribution of label in validation ===
class weight of label_0   : 0.680914871625
class weight of label_2   : 0.319085128375


In [10]:
tree = DecisionTreeClassifier(criterion = 'entropy', 
                              max_depth = 5, 
                              random_state = 9001,
                              class_weight = {0:cw0, 2:cw2})
pred_tree = processing(tree, X, y, X_test, y_test)

0    580856
2    235554
1    156845
Name: label, dtype: int64
Misclassified samples: 387902
Accuracy          : 0.6014
Accuracy (sklearn): 0.6014
[[578489      0   2367]
 [153248      0   3597]
 [228690      0   6864]]


(578276+152756+7814)/973255 = 0.7591  
將basket的視為不會購買，正確率提高。

In [11]:
#sub = pd.concat([test.lineID, pd.DataFrame(pred_tree, columns=["label"])], axis=1)
#sub.label.value_counts()

#### prediction of class dataset

In [12]:
#df_class = pd.read_csv("class_clean.csv")
#df_class = pd.merge(df_class, item, how='inner', on=['pid']).sort_values(['lineID'], ascending = True).reset_index(drop = True)

#df_class = df_class[col]
#pred_class = tree.predict(df_class)
#pred_class = pd.DataFrame(pred_class, columns=["pred"])
#pred_class.pred.value_counts()

### 利用我建立的pid變數去建模

In [13]:
pid = pd.read_csv("pid_info_train.csv")
pid.head()

Unnamed: 0,pid,day_max,day_min,day_mean,pid_count,ad_mean,avail_mean,competitor_max,competitor_min,competitor_mean,...,price_mean,diff_max,diff_min,diff_mean,rev_mean,number_max,number_min,number_mean,pid_sum,pid_prefer
0,1,59,10,36.5,8,0.0,3.0,8.79,8.22,8.41,...,9.1025,1.86,-0.94,0.6925,0.91,1.0,0.0,0.125,1.0,3
1,2,41,12,31.0,3,0.0,3.0,10.08,8.42,9.5267,...,9.5267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,3,41,1,13.0,4,0.0,3.0,15.21,15.21,15.21,...,15.21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,4,60,2,32.6667,9,0.0,1.6667,8.71,8.22,8.4889,...,9.9967,1.86,1.09,1.5078,3.36,1.0,0.0,0.3333,3.0,7
4,5,59,8,38.2632,19,0.0,2.0,18.06,17.59,17.8947,...,18.2242,3.24,-3.32,0.3295,0.0,0.0,0.0,0.0,0.0,0


In [14]:
df_ = pd.merge(df, pid, how='inner', on=['pid']).sort_values(['lineID'], ascending = True).reset_index(drop = True)

train = df_[(df_.day <= 62)].sort_values(['lineID'], ascending = True).reset_index(drop = True)
test = df_[(df_.day > 62)].sort_values(['lineID'], ascending = True).reset_index(drop = True)

col = ['day', 'pid', 'adFlag', 'availability', 'competitorPrice', 'price', 'diff_price',
       'day_max', 'day_min', 'day_mean', 'pid_count', 'ad_mean', 'avail_mean',
       'competitor_max', 'competitor_min', 'competitor_mean', 'price_max', 'price_min',
       'price_mean', 'diff_max', 'diff_min', 'diff_mean', 'rev_mean', 'number_max', 'number_min',
       'number_mean', 'pid_sum', 'pid_prefer']
X, y, X_test, y_test, cw0, cw1, cw2 = preprocessing(train, test)


Number of train            : 1782748
Number of test             : 972377
size of training features : 28
=== Distribution of label in validation ===
class weight of label_0   : 0.562037371519
class weight of label_1   : 0.17458496658
class weight of label_2   : 0.263377661902


In [15]:
tree = DecisionTreeClassifier(criterion = 'entropy', 
                              max_depth = 5, 
                              random_state = 9001,
                              class_weight = {0:cw0, 1:cw1, 2:cw2})
pred_tree = processing(tree, X, y, X_test, y_test)

0    580269
2    235402
1    156706
Name: label, dtype: int64
Misclassified samples: 371742
Accuracy          : 0.6177
Accuracy (sklearn): 0.6177
[[556856   2089  21324]
 [125075  17089  14542]
 [207974    738  26690]]


In [16]:
# without basket
tree = DecisionTreeClassifier(criterion = 'entropy', 
                              max_depth = 5, 
                              random_state = 9001,
                              class_weight = {0:cw0, 2:cw2})
X, y, X_test, y_test, cw0, cw2 = preprocessing_wb(train, test)
pred_tree = processing(tree, X, y, X_test, y_test)

Number of train            : 1782748
Number of test             : 972377
size of training features : 28
=== Distribution of label in validation ===
class weight of label_0   : 0.680914871625
class weight of label_2   : 0.319085128375
0    580269
2    235402
1    156706
Name: label, dtype: int64
Misclassified samples: 389892
Accuracy          : 0.5990
Accuracy (sklearn): 0.5990
[[555384      0  24885]
 [141932      0  14774]
 [208301      0  27101]]


(555384+141932+27101)/972377 = 74.50  
從正確率來看的話雖然差不多，但是在分群的比例來看卻合理許多。

In [None]:
df_ = pd.merge(df, pid, how='inner', on=['pid']).sort_values(['lineID'], ascending = True).reset_index(drop = True)
df_ = pd.merge(df_, item, how='inner', on=['pid']).sort_values(['lineID'], ascending = True).reset_index(drop = True)

train = df_[(df_.day <= 62)].sort_values(['lineID'], ascending = True).reset_index(drop = True)
test = df_[(df_.day > 62)].sort_values(['lineID'], ascending = True).reset_index(drop = True)

col = ['day', 'pid', 'adFlag', 'availability', 'competitorPrice', 'price', 'diff_price',
       'day_max', 'day_min', 'day_mean', 'pid_count', 'ad_mean', 'avail_mean',
       'competitor_max', 'competitor_min', 'competitor_mean', 'price_max', 'price_min',
       'price_mean', 'diff_max', 'diff_min', 'diff_mean', 'rev_mean', 'number_max', 'number_min',
       'number_mean', 'pid_sum', 'pid_prefer','manufacturer','group','content','unit',
       'pharmForm','genericProduct','salesIndex','category','rrp']
X, y, X_test, y_test, cw0, cw1, cw2 = preprocessing(train, test)
