In [1]:
import pandas as pd
import numpy as np
import gensim
import Cython
import json
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [2]:
df_train = json.load(open(r"C:\Users\22794\Desktop\SML A2\SMLproject2\data\backup\train.json"))

In [3]:
labels = []
data = []
d2v = gensim.models.doc2vec.Doc2Vec.load(r'doc2vec.model')
n2v = gensim.models.Word2Vec.load(r'node2vec_1024.model')

for i, record in enumerate(df_train):        
    proauthors = [i for i in record['authors'] if i < 100]
    label = np.zeros(100)
    if len(proauthors) > 0:
        label[proauthors] = 1.
    else: 
        label[-1] = 1.
    labels.append(label)
    
    text = [str(i) for i in record['title']]
    text.extend([str(i) for i in record['abstract']])
    text = d2v.infer_vector(text)
    
    coauthors = [i for i in record['authors'] if i >= 100]
    coauthor_vec = []
    if len(coauthors) > 0:
        for co in coauthors:
            coauthor_vec.append(n2v.wv[str(co)])
        coauthor_vec = np.mean(np.array(coauthor_vec), axis=0)
    else:
        coauthor_vec = np.zeros(1024)
    
    venue_vec = np.zeros(465)
    venue_vec[[record['venue']] if record['venue'] != '' else []] = 1.
    
    data.append(np.concatenate([text, coauthor_vec, venue_vec], axis=0))
    
labels = np.array(labels)
data = np.array(data)

In [4]:
ids = np.where(labels[:, -1]==0)
data_t = data[ids]
labels_t = labels[ids]

ids = np.where(labels[:, -1]==1)
data_f = data[ids]
labels_f = labels[ids]

data_t, labels_t = resample(data_t, labels_t, replace=True, n_samples=int(labels_f.shape[0]), random_state=51)

data = np.concatenate([data_t, data_f])
labels = np.concatenate([labels_t, labels_f])

In [5]:
data_train, data_valid, labels_train, labels_valid = train_test_split(data, labels, test_size=0.1, random_state=51)

In [6]:
data_train.shape

(33103, 1745)

In [7]:
labels_train.shape

(33103, 100)

### Multi-label Support Vector Machine

In [8]:
from sklearn.svm import LinearSVC, SVC, NuSVC
from sklearn.multioutput import MultiOutputClassifier

svm = SVC()
multilabel_classifier = MultiOutputClassifier(svm, n_jobs=-1)
multilabel_classifier = multilabel_classifier.fit(data_train, labels_train)

In [9]:
import pickle
filename = 'svm.sav'
pickle.dump(svm, open(filename, 'wb'))

### SGD

In [7]:
from sklearn.linear_model import SGDClassifier
from sklearn.multioutput import MultiOutputClassifier

sgd = SGDClassifier(max_iter=1000, tol=1e-3)
multilabel_classifier = MultiOutputClassifier(sgd, n_jobs=-1)
multilabel_classifier = multilabel_classifier.fit(data_train, labels_train)

### Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

lr = LogisticRegression(multi_class='multinomial', random_state=51)
multilabel_classifier = MultiOutputClassifier(lr, n_jobs=-1)
multilabel_classifier = multilabel_classifier.fit(data_train, labels_train)

In [10]:
train_predictions = multilabel_classifier.predict(data_train)

f1_score(labels_train, train_predictions, average='samples')

0.893830511692941

In [11]:
predictions = multilabel_classifier.predict(data_valid)

f1_score(labels_valid, predictions, average='samples')

0.859310889346225

In [12]:
df_test = pd.read_json(r"C:\Users\22794\Desktop\SML A2\SMLproject2\data\downsamp\test.json")

pred_ids = []
for i in df_test['identifier']:
    pred_ids.append(i)
pred_ids = np.array(pred_ids)

data_test = []
for _, row in df_test.iterrows():
    text = [str(i) for i in row['title']]
    text.extend([str(i) for i in row['abstract']])
    text = d2v.infer_vector(text)
    
    coauthors = [i for i in row['coauthors'] if i >= 100]
    coauthor_vec = []
    if len(coauthors) > 0:
        for co in coauthors:
            coauthor_vec.append(n2v.wv[str(co)])
        coauthor_vec = np.mean(np.array(coauthor_vec), axis=0)
    else:
        coauthor_vec = np.zeros(1024)
    
    venue_vec = np.zeros(465)
    venue_vec[[record['venue']] if row['venue'] != '' else []] = 1.
    
    data_test.append(np.concatenate([text, coauthor_vec, venue_vec], axis=0))
    
data_test = np.array(data_test)

In [13]:
test_pred = multilabel_classifier.predict(data_test)

In [14]:
test_labels = []
for each in test_pred:
    if len(np.where(each == 1.)[0].tolist()) == 0:
        test_labels.append("-1")
    elif len(" ".join([str(int(i)) for i in np.where(each == 1.)[0].tolist()]).split(" ")) == 1 and " ".join([str(int(i)) for i in np.where(each == 1.)[0].tolist()]).split(" ")[-1] == "99":
        test_labels.append("-1")
    elif len(" ".join([str(int(i)) for i in np.where(each == 1.)[0].tolist()]).split(" ")) == 1 and " ".join([str(int(i)) for i in np.where(each == 1.)[0].tolist()]).split(" ")[-1] != "99":
        test_labels.append(" ".join([str(int(i)) for i in np.where(each == 1.)[0].tolist()]))
    elif len(" ".join([str(int(i)) for i in np.where(each == 1.)[0].tolist()]).split(" ")) != 1 and " ".join([str(int(i)) for i in np.where(each == 1.)[0].tolist()]).split(" ")[-1] != "99":
        test_labels.append(" ".join([str(int(i)) for i in np.where(each == 1.)[0].tolist()]))
    elif len(" ".join([str(int(i)) for i in np.where(each == 1.)[0].tolist()]).split(" ")) != 1 and " ".join([str(int(i)) for i in np.where(each == 1.)[0].tolist()]).split(" ")[-1] == "99":
        test_labels.append(" ".join([str(int(i)) for i in np.where(each == 1.)[0].tolist()[:-1]]))

In [15]:
test_labels

['92',
 '-1',
 '31',
 '23',
 '-1',
 '-1',
 '37',
 '13',
 '-1',
 '-1',
 '-1',
 '42',
 '-1',
 '-1',
 '-1',
 '82',
 '84',
 '-1',
 '37',
 '-1',
 '83',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '35',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '52 65',
 '84',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '54',
 '26',
 '-1',
 '6 73',
 '-1',
 '42',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '51',
 '-1',
 '29',
 '-1',
 '-1',
 '-1',
 '-1',
 '31 95',
 '-1',
 '10',
 '9 30',
 '-1',
 '59',
 '16',
 '85 96',
 '-1',
 '-1',
 '-1',
 '80',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '20',
 '10',
 '-1',
 '41',
 '37',
 '-1',
 '-1',
 '-1',
 '52 65',
 '37',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '67',
 '73',
 '-1',
 '-1',
 '-1',
 '-1',
 '6',
 '77',
 '65',
 '75',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '27',
 '25',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '3',
 '96',
 '-1',
 '-1',
 '89',
 '-1',
 '25',
 '-1',
 '92',
 '-1',
 '-1',
 '-1',
 '25',
 '-1',
 '-1',

In [16]:
output = pd.DataFrame({'ID': range(800), 'Predict':test_labels})
output.to_csv('pred.csv', sep=',', index=False, encoding='utf-8')