In [45]:
import numpy as np; np.random.seed(1040941203) # For reproducibility (+82-10-4094-1203)
import pandas as pd

In [46]:
"""
    Step 3. Construct a classifier
"""
# Load manipulated data set
train_df = pd.read_pickle("train_set.csv")
test_df  = pd.read_pickle("test_set.csv")

In [47]:
# Function to make labels in the data frame into a list (i.e. 0 8 => [0, 8])
def labels_to_list(labels): return list(map(int, labels.split()))

# Process train & test set into an array format
X_train = np.array([x for x in train_df['features']])
Y_train = np.array([labels_to_list(y) for y in train_df['labels']])
X_test = np.array([x for x in test_df['features']])

# Check shape of array-format train & test set
print("X_train: ", X_train.shape)
print("Y_train: ", Y_train.shape)
print("X_test: ", X_test.shape)

X_train:  (1996, 2048)
Y_train:  (1996,)
X_test:  (10000, 2048)


In [48]:
# Load packages for splitting train & validation set, XGBoost classifier, 1-of-K encoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier

# Package for estimating a time taken
import time; t=time.time()

In [49]:
# Convert list of labels to follow 1-of-K coding scheme
one_of_K_encoder = MultiLabelBinarizer()
Y_train_ = one_of_K_encoder.fit_transform(Y_train)

# Split train set into 8:2 (train : validation)
random_state = np.random.RandomState(1040941203)
X_train_, X_test_, Y_train_, Y_test_ = train_test_split(X_train, Y_train_, test_size=0.2, random_state=random_state)

# Train the XGBoost classifier
classifier = OneVsRestClassifier(XGBClassifier(num_class=9, gamma=0.024, learning_rate=0.3, max_depth=6, nthread=4, n_estimators=1000, objective="multi:softmax"))
classifier.fit(X_train_, Y_train_)

# Predict labels using the trained model
Y_predict = classifier.predict(X_test_)

# Show spent time
print("Time passed: ", "{0:.3f}".format(time.time() - t), "sec")

Time passed:  4373.469 sec


In [50]:
# Show some predicted values
print("Samples of predicted labels (in 1-of-K coding scheme):\n", Y_predict[1:3])
print("\nSamples of corresponding predicted labels:\n", one_of_K_encoder.inverse_transform(Y_predict[1:3]))

Samples of predicted labels (in binary matrix):
 [[0 1 1 0 0 1 1 0 0]
 [0 0 0 0 0 0 1 0 1]]

Samples of predicted labels:
 [(1, 2, 5, 6), (6, 8)]


In [51]:
# Construct a data frame to show ratio of each label in a predicted set
stat = pd.DataFrame(columns = ['label ' + str(i) for i in range(9)] + ['total_biz'], index = ['biz_count', 'biz_percentage'])

stat.loc['biz_count'] = np.append(np.sum(Y_predict, axis=0), len(Y_predict))
stat.loc['biz_percentage'] = stat.loc['biz_count'] * 100 / len(Y_predict) 

pd.options.display.float_format = '{:.0f}%'.format

# Show the statistics
stat

Unnamed: 0,label 0,label 1,label 2,label 3,label 4,label 5,label 6,label 7,label 8,total_biz
biz_count,103,202,221,207,114,264,287,121,236,400
biz_percentage,26%,50%,55%,52%,28%,66%,72%,30%,59%,100%


In [52]:
from sklearn.metrics import f1_score # For measuring F1 score metrics

# Show global F1 score & on-label F1 score
print("Overall F1 score: ", f1_score(Y_test_, Y_predict, average='micro')) 
print("F1 score of each label : ", f1_score(Y_test, y_ppredict, average=None))

F1 score:  0.8349900596421471
Individual Class F1 score:  [0.66666667 0.83538084 0.88735632 0.66666667 0.77118644 0.88593156
 0.95017794 0.79518072 0.875     ]


In [53]:
t = time.time()

# Convert list of labels to follow 1-of-K coding scheme
one_of_K_encoder = MultiLabelBinarizer()
Y_train_ = one_of_K_encoder.fit_transform(Y_train)

# Train the SVM classifier again with a full train set
random_state = np.random.RandomState(0)
classifier = OneVsRestClassifier(XGBClassifier(num_class=9, gamma=0.024, learning_rate=0.3, max_depth=6, nthread=4, n_estimators=1000, objective="multi:softmax"))
classifier.fit(X_train, Y_train_)

Y_predict = classifier.predict(X_test)
Y_predict_label = one_of_K_encoder.inverse_transform(Y_predict)

print("Time passed: ", "{0:.1f}".format(time.time() - t), "sec")

Time passed:  5864.1 sec


In [54]:
# Construct a data frame to show ratio of each label in a predicted set
stat = pd.DataFrame(columns=['label ' + str(i) for i in range(9)] + ['total_biz'], index = ['biz_count', 'biz_percentage'])

stat.loc['biz_count'] = np.append(np.sum(Y_predict, axis=0), len(Y_predict))
stat.loc['biz_percentage'] = stat.loc["biz_count"] * 100 / len(Y_predict)

pd.options.display.float_format = '{:.0f}%'.format

stat

Unnamed: 0,label 0,label 1,label 2,label 3,label 4,label 5,label 6,label 7,label 8,total_biz
biz_count,760,7663,8446,7482,1540,9084,9344,2097,6547,10000
biz_percentage,8%,77%,84%,75%,15%,91%,93%,21%,65%,100%


In [55]:
# Construct a data frame for submission (matching predicted label with business id in a test set)
final_df = pd.DataFrame(columns=['business_id','labels'])

for i in range(len(test_df)):
    biz = test_df.loc[i]['business_id']
    label = Y_predict_label[i]
    label = str(label)[1:-1].replace(",", " ")
    
    final_df.loc[i] = [str(biz), label]

# Write a submission file
with open("submission_Seokju_Hahn_XGB.csv",'w') as file:
    final_df.to_csv(file, index=False)