# import packages

In [46]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals.six import StringIO   
from sklearn.tree import export_graphviz
import pydotplus

# Data preprocessing:

read data

source link:https://www.kaggle.com/mehdidag/black-friday

In [47]:
df = pd.read_csv('BlackFriday.csv', encoding = "ISO-8859-1")
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


Drop relatively useless attributes

In [48]:
df = df.drop('User_ID', axis = 1)
df = df.drop('Product_ID', axis = 1)

Deal with NaN attributes

In [49]:
df.isnull().sum()

Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            166986
Product_Category_3            373299
Purchase                           0
dtype: int64

just fill 0 into NaN

In [50]:
p2 = df['Product_Category_2']
df['Product_Category_2'].fillna('0',inplace = True)

In [51]:
p3 = df['Product_Category_3']
df['Product_Category_3'].fillna('0',inplace = True)

In [52]:
df.isnull().sum()

Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64

In [53]:
df.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,F,0-17,10,A,2,0,3,0,0,8370
1,F,0-17,10,A,2,0,1,6,14,15200
2,F,0-17,10,A,2,0,12,0,0,1422
3,F,0-17,10,A,2,0,12,14,0,1057
4,M,55+,16,C,4+,0,8,0,0,7969


Deal with the data: Gender

F = 1, M = 0

In [54]:
gender_map = {'M' : 0, 'F' :1}
df['Gender'] = df['Gender'].map(gender_map)
df.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1,0-17,10,A,2,0,3,0,0,8370
1,1,0-17,10,A,2,0,1,6,14,15200
2,1,0-17,10,A,2,0,12,0,0,1422
3,1,0-17,10,A,2,0,12,14,0,1057
4,0,55+,16,C,4+,0,8,0,0,7969


In [55]:
df.Gender.value_counts()

0    405380
1    132197
Name: Gender, dtype: int64

Deal with the data: City_Category

In [56]:
city_map = {'A' : 1, 'B' :2, 'C' :3}
df['City_Category'] = df['City_Category'].map(city_map)
df.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1,0-17,10,1,2,0,3,0,0,8370
1,1,0-17,10,1,2,0,1,6,14,15200
2,1,0-17,10,1,2,0,12,0,0,1422
3,1,0-17,10,1,2,0,12,14,0,1057
4,0,55+,16,3,4+,0,8,0,0,7969


Deal with the data: Age

In [57]:
age_map = {'0-17':1,'18-25':2,'26-35':3,'36-45':4,'46-50':5,'51-55':6,'55+':7}
df['Age'] = df['Age'].map(age_map)
df.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1,1,10,1,2,0,3,0,0,8370
1,1,1,10,1,2,0,1,6,14,15200
2,1,1,10,1,2,0,12,0,0,1422
3,1,1,10,1,2,0,12,14,0,1057
4,0,7,16,3,4+,0,8,0,0,7969


Deal with the data: Product_Category_2

In [58]:
df[['Product_Category_2','Product_Category_3']]= df[['Product_Category_2','Product_Category_3']].apply(pd.to_numeric)

Deal with the data: Stay_In_Current_City_Years

In [59]:
year_map = {'0':0,'1':1,'2':2,'3':3,'4+':4}
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].map(year_map)
df.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1,1,10,1,2,0,3,0.0,0.0,8370
1,1,1,10,1,2,0,1,6.0,14.0,15200
2,1,1,10,1,2,0,12,0.0,0.0,1422
3,1,1,10,1,2,0,12,14.0,0.0,1057
4,0,7,16,3,4,0,8,0.0,0.0,7969


In [60]:
pd.to_numeric(df.Stay_In_Current_City_Years, errors='coerce')

0         2
1         2
2         2
3         2
4         4
5         3
6         2
7         2
8         2
9         1
10        1
11        1
12        1
13        1
14        1
15        1
16        1
17        1
18        1
19        4
20        4
21        4
22        4
23        4
24        4
25        0
26        0
27        0
28        0
29        4
         ..
537547    1
537548    1
537549    1
537550    3
537551    3
537552    3
537553    3
537554    3
537555    3
537556    1
537557    1
537558    1
537559    1
537560    1
537561    1
537562    1
537563    1
537564    1
537565    1
537566    1
537567    1
537568    1
537569    1
537570    1
537571    1
537572    1
537573    1
537574    1
537575    1
537576    1
Name: Stay_In_Current_City_Years, Length: 537577, dtype: int64

# Seperate data into training set and testing(validation) set

In [61]:
df_train = df[:50000]
df_train.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1,1,10,1,2,0,3,0.0,0.0,8370
1,1,1,10,1,2,0,1,6.0,14.0,15200
2,1,1,10,1,2,0,12,0.0,0.0,1422
3,1,1,10,1,2,0,12,14.0,0.0,1057
4,0,7,16,3,4,0,8,0.0,0.0,7969


In [62]:
df_test = df[50000:]
df_test.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
50000,0,3,19,3,1,0,11,0.0,0.0,3226
50001,0,3,19,3,1,0,8,17.0,0.0,9763
50002,0,3,19,3,1,0,11,16.0,0.0,7587
50003,0,2,0,1,4,0,5,0.0,0.0,8681
50004,1,5,0,1,0,1,7,0.0,0.0,20495


seperate attributes and answer

In [63]:
y = df_train['Gender'].values
df_train = df_train.drop('Gender', 1)

y

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

# Build The Decision Tree

In [64]:
dtree=DecisionTreeClassifier(max_depth=7)
dtree.fit(df_train,y)

dot_data = StringIO()
export_graphviz(dtree, 
                out_file=dot_data,  
                filled=True, 
                feature_names=list(df_train),
                class_names=['female','male'],
                special_characters=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_pdf("tree.pdf")

True

In [65]:
dtree.feature_importances_

array([0.17916334, 0.46733832, 0.11007384, 0.12440386, 0.05734934,
       0.0173674 , 0.03154971, 0.        , 0.01275418])

In [66]:
y_test = df_test['Gender'].values
X_test = df_test.drop('Gender', 1)

y_predict = dtree.predict(X_test)

y_predict

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [67]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.7708197884641811

# other

In [68]:
df1 = pd.read_csv('BlackFriday.csv', encoding = "ISO-8859-1")
df1.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [69]:
df1 = df1.drop('User_ID', axis = 1)
df1 = df1.drop('Product_ID', axis = 1)

In [70]:
df1.isnull().sum()

Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            166986
Product_Category_3            373299
Purchase                           0
dtype: int64

In [71]:
p2 = df1['Product_Category_2']
df1['Product_Category_2'].fillna('0',inplace = True)

In [72]:
p3 = df1['Product_Category_3']
df1['Product_Category_3'].fillna('0',inplace = True)

In [73]:
df1.isnull().sum()

Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64

In [74]:
gender_map = {'M' : 0, 'F' :1}
df1['Gender'] = df1['Gender'].map(gender_map)
df1.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1,0-17,10,A,2,0,3,0,0,8370
1,1,0-17,10,A,2,0,1,6,14,15200
2,1,0-17,10,A,2,0,12,0,0,1422
3,1,0-17,10,A,2,0,12,14,0,1057
4,0,55+,16,C,4+,0,8,0,0,7969


In [75]:
city_map = {'A' : 1, 'B' :2, 'C' :3}
df1['City_Category'] = df1['City_Category'].map(city_map)
df1.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1,0-17,10,1,2,0,3,0,0,8370
1,1,0-17,10,1,2,0,1,6,14,15200
2,1,0-17,10,1,2,0,12,0,0,1422
3,1,0-17,10,1,2,0,12,14,0,1057
4,0,55+,16,3,4+,0,8,0,0,7969


In [76]:
age_map = {'0-17':1,'18-25':2,'26-35':3,'36-45':4,'46-50':5,'51-55':6,'55+':7}
df1['Age'] = df1['Age'].map(age_map)
df1.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1,1,10,1,2,0,3,0,0,8370
1,1,1,10,1,2,0,1,6,14,15200
2,1,1,10,1,2,0,12,0,0,1422
3,1,1,10,1,2,0,12,14,0,1057
4,0,7,16,3,4+,0,8,0,0,7969


In [77]:
df1[['Product_Category_2','Product_Category_3']]= df1[['Product_Category_2','Product_Category_3']].apply(pd.to_numeric)

In [78]:
def mapping(x):
    if x>0:
        return 1
    else:
        return 0
df1['label_1']=df1['Product_Category_1'].apply(mapping)
df1['label_2']=df1['Product_Category_2'].apply(mapping)
df1['label_3']=df1['Product_Category_3'].apply(mapping)
df1.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,label_1,label_2,label_3
0,1,1,10,1,2,0,3,0.0,0.0,8370,1,0,0
1,1,1,10,1,2,0,1,6.0,14.0,15200,1,1,1
2,1,1,10,1,2,0,12,0.0,0.0,1422,1,0,0
3,1,1,10,1,2,0,12,14.0,0.0,1057,1,1,0
4,0,7,16,3,4+,0,8,0.0,0.0,7969,1,0,0


In [79]:
df1 = df1.drop('Product_Category_1', axis = 1)
df1 = df1.drop('Product_Category_2', axis = 1)
df1 = df1.drop('Product_Category_3', axis = 1)

In [80]:
df1['sum']=df1['label_1']+df1['label_2']+df1['label_3']

In [81]:
df1 = df1.drop('label_1', axis = 1)
df1 = df1.drop('label_2', axis = 1)
df1 = df1.drop('label_3', axis = 1)

In [82]:
year_map = {'0':0,'1':1,'2':2,'3':3,'4+':4}
df1['Stay_In_Current_City_Years'] = df1['Stay_In_Current_City_Years'].map(year_map)
df1.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Purchase,sum
0,1,1,10,1,2,0,8370,1
1,1,1,10,1,2,0,15200,3
2,1,1,10,1,2,0,1422,1
3,1,1,10,1,2,0,1057,2
4,0,7,16,3,4,0,7969,1


In [83]:
pd.to_numeric(df1.Stay_In_Current_City_Years, errors='coerce')

0         2
1         2
2         2
3         2
4         4
5         3
6         2
7         2
8         2
9         1
10        1
11        1
12        1
13        1
14        1
15        1
16        1
17        1
18        1
19        4
20        4
21        4
22        4
23        4
24        4
25        0
26        0
27        0
28        0
29        4
         ..
537547    1
537548    1
537549    1
537550    3
537551    3
537552    3
537553    3
537554    3
537555    3
537556    1
537557    1
537558    1
537559    1
537560    1
537561    1
537562    1
537563    1
537564    1
537565    1
537566    1
537567    1
537568    1
537569    1
537570    1
537571    1
537572    1
537573    1
537574    1
537575    1
537576    1
Name: Stay_In_Current_City_Years, Length: 537577, dtype: int64

In [84]:
df1_train = df1[:150000]
df1_train.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Purchase,sum
0,1,1,10,1,2,0,8370,1
1,1,1,10,1,2,0,15200,3
2,1,1,10,1,2,0,1422,1
3,1,1,10,1,2,0,1057,2
4,0,7,16,3,4,0,7969,1


In [85]:
df1_test = df1[150000:]
df1_test.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Purchase,sum
150000,0,3,1,2,1,1,15653,1
150001,0,3,1,2,1,1,8174,2
150002,0,3,1,2,1,1,9934,2
150003,0,3,1,2,1,1,8025,3
150004,0,3,1,2,1,1,15320,3


In [86]:
y = df1_train['sum'].values
df1_train = df1_train.drop('sum', 1)

y

array([1, 3, 1, ..., 3, 3, 3], dtype=int64)

In [87]:
dtree=DecisionTreeClassifier(max_depth=7)
dtree.fit(df1_train,y)

dot_data = StringIO()
export_graphviz(dtree, 
                out_file=dot_data,  
                filled=True, 
                feature_names=list(df1_train),
                class_names=['one kind of product','two kind of products','three kind of products'],
                special_characters=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_pdf("tree1.pdf")

True

In [88]:
dtree.feature_importances_

array([3.97869498e-04, 3.86394712e-03, 2.45310144e-03, 9.83133213e-05,
       8.27700210e-04, 2.63128813e-04, 9.92095940e-01])

In [89]:
y_test = df1_test['sum'].values
X_test = df1_test.drop('sum', 1)

y_predict = dtree.predict(X_test)

y_predict

array([3, 3, 1, ..., 1, 2, 2], dtype=int64)

In [90]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.5131806066923476