## We will use one-hot encoding as a baseline method

In [186]:
import pandas as pd
import numpy as np
import json
# example of a multi-label classification task
from sklearn.datasets import make_multilabel_classification

### Part 1: Preprocessing the train and test data
1) add a label to the train data
2) merge the 'coauthor' and 'target' columns of the test data into a single 'author' column
3) one-hot encode venue, keywords, year and author

In [231]:
# Load the raw records twice: as DataFrames (orient='index' turns the top-level
# JSON keys into the row index) and as plain Python objects.
train_df = pd.read_json('../data/train.json', orient='index')
test_df = pd.read_json('../data/test.json', orient='index')

# Context managers make sure the file handles are closed after parsing.
with open('../data/train.json') as train_file:
    train_json = json.load(train_file)
# Fixed: this previously re-read train.json, so test_json held training data.
with open('../data/test.json') as test_file:
    test_json = json.load(test_file)

In [400]:
# Display the test DataFrame (the output below shows it with list-valued
# venue/year columns and the merged 'author' column).
test_df

Unnamed: 0,venue,keywords,year,author
0,[470],"[260, 6, 390, 136, 7, 11, 17, 285, 288, 162, 4...",[2017],[988]
1,[94],"[260, 454, 137, 14, 400, 274, 339, 213, 280, 2...",[2019],"[1001, 2123]"
2,[31],"[390, 198, 7, 461, 462, 14, 404, 277, 24, 473,...",[2014],[1578]
3,[6],"[195, 6, 390, 10, 459, 464, 338, 146, 276, 466...",[2010],"[1347, 2072]"
4,[162],"[64, 1, 260, 457, 73, 147, 282, 27, 156, 43, 3...",[2016],"[1107, 995]"
...,...,...,...,...
1995,[14],"[194, 260, 69, 73, 14, 462, 334, 17, 336, 280,...",[2015],[1876]
1996,[5],"[64, 260, 261, 135, 7, 75, 332, 334, 15, 463, ...",[2016],[1976]
1997,[58],"[451, 136, 459, 15, 146, 276, 342, 285, 222, 2...",[2004],"[646, 1131]"
1998,[6],"[128, 64, 322, 260, 261, 388, 391, 455, 265, 1...",[2016],"[1684, 1040, 1713, 2124]"


In [None]:
def preprocessing(df, missing_venue=470):
    """Normalise the ``venue`` column of *df* to integer ids, in place.

    Empty-string venues are replaced with *missing_venue* and the whole
    column is then cast to ``int``.

    Args:
        df: DataFrame with a ``venue`` column; it is modified in place.
        missing_venue: Id substituted for empty-string venues (default 470,
            the value used for the train and test frames in this notebook).

    Returns:
        The same ``df`` object, so the call can be chained.
    """
    df['venue'] = df['venue'].replace('', missing_venue).astype(int)
    return df

In [232]:
# All original training rows are positives (label 1); the label-0 rows are
# generated further down from Hamming-distance negative sampling.
train_df['label'] = 1
train_df.head(3)

# Empty-string venues are mapped to id 470 so the column can be cast to int.
filled_train_venue = train_df['venue'].replace('', 470)
train_df['venue'] = filled_train_venue.astype(int)

In [233]:
# merge 'coauthor' and 'target' columns for test data

def add_column(coauthor, target):
    """Return a new list made of *coauthor* followed by *target*.

    ``coauthor`` is concatenated with a one-element list, so it is left
    unmodified and a fresh list is returned.
    """
    target_as_list = [target]
    return coauthor + target_as_list
    
# Build each test paper's full author list: its co-authors plus the target
# author. Zipping the two columns avoids a slow row-wise DataFrame.apply.
test_df['author'] = [
    add_column(coauthor, target)
    for coauthor, target in zip(test_df['coauthor'], test_df['target'])
]
# .copy() makes the column subset an independent frame, so the column
# assignments below do not write into a slice of the original
# (which raises SettingWithCopyWarning and may silently not stick).
test_df = test_df[['venue','keywords','year','author']].copy()
test_df.head(3)

# Empty-string venues are mapped to id 470 so the column can be cast to int.
test_df['venue'] = test_df.venue.replace('', 470).astype(int)

In [234]:
def to_list(column):
    """Wrap a single value in a one-element list."""
    wrapped = [column]
    return wrapped

# MultiLabelBinarizer expects an iterable of labels per row, so the scalar
# venue and year values are wrapped in one-element lists.
# Series.map transforms just the one column instead of materialising every
# row as a Series the way DataFrame.apply(axis=1) does.
train_df['venue'] = train_df['venue'].map(to_list)
test_df['venue'] = test_df['venue'].map(to_list)
train_df['year'] = train_df['year'].map(to_list)
test_df['year'] = test_df['year'].map(to_list)

In [235]:
# one hot encode the venue, keywords, year and author columns

from sklearn.preprocessing import MultiLabelBinarizer

# One binarizer per field, each with an explicit class list so the width of
# every one-hot block is fixed up front.
mlb_venue = MultiLabelBinarizer(classes=list(range(471)), sparse_output=True)
mlb_keywords = MultiLabelBinarizer(classes=list(range(500)), sparse_output=True)
mlb_year = MultiLabelBinarizer(classes=list(range(2000, 2020)), sparse_output=True)
mlb_author = MultiLabelBinarizer(classes=list(range(2302)), sparse_output=True)

# Fit on the training frame and densify each sparse result.
venue_train = mlb_venue.fit_transform(train_df.venue).toarray()
keywords_train = mlb_keywords.fit_transform(train_df.keywords).toarray()
year_train = mlb_year.fit_transform(train_df.year).toarray()
author_train = mlb_author.fit_transform(train_df.author).toarray()

for encoded_block in (venue_train, keywords_train, year_train, author_train):
    print(encoded_block.shape)

(26108, 471)
(26108, 500)
(26108, 20)
(26108, 2302)


In [236]:
# Concatenate the four one-hot blocks column-wise into one feature matrix.
train_blocks = (venue_train, keywords_train, year_train, author_train)
X_train = np.hstack(train_blocks).astype('int')
y_train = train_df['label']

for part in (X_train, y_train):
    print(part.shape)

(26108, 3293)
(26108,)


In [296]:
from scipy.spatial.distance import hamming  # kept in case other cells still reference it
import random

# Negative mining on keywords: for every sampled anchor row, pick the 3 rows
# whose keyword vectors differ from it in the most positions (largest Hamming
# distance). Each pick is stored as [anchor_row, negative_row].
n_train_rows = keywords_train.shape[0]
# NOTE: the sample is not seeded, so the chosen anchors differ between runs.
random_index = random.sample(range(n_train_rows), min(10000, n_train_rows))
neg_sample_index = []
for i in random_index:
    # Count differing positions against all rows in one vectorised comparison.
    # Ranking by count is the same as ranking by scipy's hamming(), which is
    # this count divided by the (constant) vector length.
    mismatches = np.count_nonzero(keywords_train != keywords_train[i], axis=1)
    # Stable sort on the negated counts: largest distance first, ties broken
    # by the lowest row position. Fixed: the original deleted each pick from a
    # Python list, which shifted all later positions, so the 2nd and 3rd
    # picks could refer to the wrong rows.
    farthest = np.argsort(-mismatches, kind='stable')[:3]
    neg_sample_index.extend([i, int(neg_index)] for neg_index in farthest)
    

In [355]:
# Split the [anchor_row, negative_row] pairs into two parallel position lists.
ia_list = [pair[0] for pair in neg_sample_index]
ib_list = [pair[1] for pair in neg_sample_index]

# A negative example keeps the anchor paper's venue/keywords/year but takes
# the author list of the paired row, and is labelled 0.
anchor_rows = train_df.iloc[ia_list]
neg_result_df = anchor_rows.copy().reset_index(drop=True)
donor_authors = train_df.iloc[ib_list].author
neg_result_df['author'] = donor_authors.reset_index(drop=True)
neg_result_df['label'] = 0

In [357]:
# Save the Hamming-distance negative samples to CSV (index column omitted)
neg_result_df.to_csv('negative_sample_hm.csv',index=False)

In [361]:
# merge negative sample and train set
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. Index labels are kept as-is, exactly like append did.
new_train_df = pd.concat([train_df, neg_result_df])
new_train_df.head()

Unnamed: 0,venue,keywords,year,author,label
0,[470],"[64, 1, 322, 134, 136, 396, 270, 144, 476, 481...",[2017],"[1605, 759]",1
1,[0],"[258, 260, 389, 261, 390, 396, 400, 17, 146, 2...",[2013],[2182],1
2,[1],"[320, 454, 266, 462, 17, 339, 404, 342, 407, 2...",[2007],[2176],1
3,[2],"[260, 132, 333, 15, 400, 272, 146, 401, 278, 3...",[2013],[1107],1
4,[3],"[64, 385, 449, 450, 71, 73, 268, 80, 216, 25, ...",[2009],[1414],1


In [364]:
# one-hot new train_set
# Only transform() is called here: the binarizers fitted on train_df are reused.

new_venue_train = mlb_venue.transform(new_train_df.venue).toarray()
new_keywords_train = mlb_keywords.transform(new_train_df.keywords).toarray()
new_year_train = mlb_year.transform(new_train_df.year).toarray()
new_author_train = mlb_author.transform(new_train_df.author).toarray()

new_blocks = (new_venue_train, new_keywords_train, new_year_train, new_author_train)
for encoded_block in new_blocks:
    print(encoded_block.shape)

# Feature matrix = the four one-hot blocks side by side; target = 0/1 label.
new_X_train = np.hstack(new_blocks).astype('int')
new_y_train = new_train_df['label']

for part in (new_X_train, new_y_train):
    print(part.shape)

(56108, 471)
(56108, 500)
(56108, 20)
(56108, 2302)
(56108, 3293)
(56108,)


In [395]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the augmented training set for validation; the fixed
# random_state keeps the split the same across runs.
split_parts = train_test_split(
    new_X_train,
    new_y_train,
    test_size=0.2,
    random_state=42,
)
X_train_a, X_test_a, y_train_a, y_test_a = split_parts

In [365]:
# Encode the test frame with the binarizers fitted on the training data so
# that the test columns line up with the training feature matrix.
venue_test = mlb_venue.transform(test_df['venue']).toarray()
keywords_test = mlb_keywords.transform(test_df['keywords']).toarray()
year_test = mlb_year.transform(test_df['year']).toarray()
author_test = mlb_author.transform(test_df['author']).toarray()

test_blocks = (venue_test, keywords_test, year_test, author_test)
X_test = np.hstack(test_blocks).astype('int')

print(X_test.shape)

(2000, 3293)


In [399]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Baseline classifier; max_iter is raised to 10000 iterations.
clf = LogisticRegression(max_iter=10000, random_state=42)
clf.fit(X_train_a, y_train_a)

# Column 1 of predict_proba is the score for the second class (label 1 here,
# given the labels are 0 and 1); ROC AUC is computed on the held-out split.
holdout_scores = clf.predict_proba(X_test_a)[:, 1]
roc_auc_score(y_test_a, holdout_scores)

0.9986059566286253

In [386]:
# Class probabilities for every test row (one column per class)
result_prob = clf.predict_proba(X_test)

In [372]:
# Collect the positive-class probability for every test row.
# Fixed: the original loop read from an undefined name `result` and used each
# feature row of X_test as an index, so it could not run as written.
prob = [row[1] for row in result_prob]
c = len(prob)  # kept: the original loop left the processed-row count in `c`

In [394]:
# Positive-class probability (second column of predict_proba) per test row.
predicted = [item[1] for item in result_prob]
# Ids are derived from the number of predictions rather than a hard-coded
# 2000, so the frame still builds if the test set size changes.
ids = list(range(len(predicted)))
submission_df = pd.DataFrame({'Id':ids, 'predicted':predicted})

submission_df.head()

Unnamed: 0,Id,predicted
0,0,0.997977
1,1,0.994896
2,2,0.999017
3,3,0.998593
4,4,0.997874


In [None]:
# Write the submission (columns Id, predicted) to CSV without the index column
submission_df.to_csv('submission_tom.csv', index=False)