In [25]:
import pandas as pd
import numpy as np
import math
from sklearn.svm import LinearSVC,SVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import ComplementNB,MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from skmultilearn.problem_transform import BinaryRelevance,ClassifierChain,LabelPowerset

In [26]:
traindf = pd.read_csv('./train.csv')
traindf['abstract'] = traindf['abstract'].apply(lambda abstract: abstract.strip('[]').split(', '))
traindf['title'] = traindf['title'].apply(lambda title: title.strip('[]').split(', '))
traindf['authors'] = traindf['authors'].apply(lambda authors: authors.strip('[]').split(', '))

Combine title and abstract into a string for each article:

In [27]:
traindf['combined_text'] = (traindf.title+traindf.abstract).apply(lambda text: ' '.join(text))
    

Target Labels: (101 labels)

In [28]:
def targetLabels(authorData):
    labels = np.zeros((len(authorData),101))
    for i, article in enumerate(authorData):
        prolific = False
        for j, author in enumerate(article):
            author = int(author)
            if author < 100:
                prolific = True
                labels[i][author] = 1
        if not prolific:
            labels[i][100] = 1
    return labels

def createTargetColumns(traindf):
    targets = targetLabels(traindf['authors'])
    for label in range(101):
        traindf[str(label)] = [targets[article, label] for article in range(traindf.shape[0])]
    return traindf

In [29]:
trainLabels = targetLabels(traindf['authors'])

Identifying prolific articles:

In [30]:
train_prolific_ids = np.array([article_id for article_id in range(trainLabels.shape[0]) 
                               if np.sum(trainLabels[article_id][:100]) > 0])
print(train_prolific_ids)

[    0     1     3 ... 25778 25781 25788]


Identifying non-prolific articles:

In [31]:
train_nonprolific_ids = np.array([article_id for article_id in range(trainLabels.shape[0]) 
                               if np.sum(trainLabels[article_id][:100]) == 0])
print(train_nonprolific_ids)

[    2     5     6 ... 25790 25791 25792]


Create training sample with 25% non-prolific articles: (includes all prolific articles)

In [36]:
rnd = np.random.RandomState(10)

nonprolific_prop = 0.17
n_prolific = train_prolific_ids.shape[0]
n_articles = traindf.shape[0]
n_nonprolific_samples = math.floor(nonprolific_prop/(1-nonprolific_prop)*n_prolific)

train_nonprolific_sample_ids = rnd.choice(train_nonprolific_ids, size = n_nonprolific_samples, replace = False)

print(len(train_nonprolific_sample_ids))

1527


In [37]:
training_sample_ids = np.concatenate((train_prolific_ids, train_nonprolific_sample_ids), axis=0)

training_sample = traindf.iloc[training_sample_ids]

Sample labels:

In [38]:
sampleLabels = targetLabels(training_sample['authors'])

Split data into train and test sets:

In [39]:
trainData, testData, labelsTrain, labelsTest = train_test_split(training_sample, sampleLabels, 
                                                                        test_size=0.25, random_state=42)

Extracting coauthor and venue features:

In [40]:
def coauthorsFeatures(authorData):
    coauthors = np.zeros((len(authorData), 21246),dtype=np.int8)
    for i, article in enumerate(authorData):
        for j, author in enumerate(article):
            try:
                author = int(author)
                if author > 99:
                    coauthors[i][author] = 1
            except:
                pass
    return coauthors

def venueFeatures(venueData):
    venues = np.zeros((len(venueData), 465), dtype=np.int8)
    for i, venue in enumerate(venueData):
        if not np.isnan(venue):
            venues[i][int(venue)] = 1
    return venues

In [41]:
trainCoauthors = coauthorsFeatures(trainData['authors'])
trainVenues = venueFeatures(trainData['venue'])

testCoauthors = coauthorsFeatures(testData['authors'])
testVenues = venueFeatures(testData['venue'])

Implementing Tf-Idf and obtaining features for text in trainData articles:

In [42]:
tfidf = TfidfVectorizer()
trainTextfeatures = tfidf.fit_transform(trainData['combined_text']).toarray()

testTextfeatures = tfidf.transform(testData['combined_text']).toarray()

train_abstract_len = np.zeros(len(trainData))
test_abstract_len = np.zeros(len(testData))

train_title_len = np.zeros(len(trainData))
test_title_len = np.zeros(len(testData))

train_combined = np.zeros(len(trainData))
test_combined = np.zeros(len(testData))

count = 0

for i in trainData['abstract'].index:
    train_abstract_len[count] = len(trainData['abstract'][i])
    train_title_len[count] = len(trainData['title'][i])
    train_combined[count] = (train_title_len[count] + train_abstract_len[count])
    count += 1

count = 0
for i in testData['abstract'].index:
    test_abstract_len[count] = len(testData['abstract'][i])
    test_title_len[count] = len(testData['title'][i])
    test_combined[count] = (test_title_len[count] + test_abstract_len[count])
    count += 1

Concatenate features:

In [43]:
trainFeatures = np.concatenate((trainCoauthors, trainVenues, trainTextfeatures), axis=1)

testFeatures = np.concatenate((testCoauthors, testVenues, testTextfeatures), axis=1)

## Binary Relevance with Multinomial NB:

In [44]:
model = BinaryRelevance(MultinomialNB())
model.fit(trainFeatures,labelsTrain)
test_preds = model.predict(testFeatures)
acc = accuracy_score(labelsTest, test_preds)
f1 = f1_score(labelsTest, test_preds, average="samples")
result = {"accuracy:": acc, "f1_score": f1}
print(result)

{'accuracy:': 0.007120605251446373, 'f1_score': 0.008386321188084668}


## Binary Relevance with Complement NB:

In [45]:
model = BinaryRelevance(ComplementNB())
model.fit(trainFeatures,labelsTrain)
test_preds = model.predict(testFeatures)
acc = accuracy_score(labelsTest, test_preds)
f1 = f1_score(labelsTest, test_preds, average="samples")
result = {"accuracy:": acc, "f1_score": f1}
print(result)

{'accuracy:': 0.027147307521139297, 'f1_score': 0.029005420305402724}


## Classifier Chain with Multinomial NB:

In [162]:
model = ClassifierChain(MultinomialNB())
model.fit(trainFeatures,labelsTrain)
test_preds = model.predict(testFeatures)
acc = accuracy_score(labelsTest, test_preds)
f1 = f1_score(labelsTest, test_preds, average="samples")
result = {"accuracy:": acc, "f1_score": f1}
print(result)

{'accuracy:': 0.057096903900281465, 'f1_score': 0.05768663718000267}


## Classifier Chain with Complement NB:

In [19]:
model = ClassifierChain(ComplementNB())
model.fit(trainFeatures,labelsTrain)
test_preds = model.predict(testFeatures)
acc = accuracy_score(labelsTest, test_preds)
f1 = f1_score(labelsTest, test_preds, average="samples")
result = {"accuracy:": acc, "f1_score": f1}
print(result)

{'accuracy:': 0.11137917169280258, 'f1_score': 0.11318126987608292}


## Label Powerset with Multinomial NB:

In [20]:
model = LabelPowerset(MultinomialNB())
model.fit(trainFeatures,labelsTrain)
test_preds = model.predict(testFeatures)
acc = accuracy_score(labelsTest, test_preds)
f1 = f1_score(labelsTest, test_preds, average="samples")
result = {"accuracy:": acc, "f1_score": f1}
print(result)

{'accuracy:': 0.24567752312022517, 'f1_score': 0.24567752312022517}


## Label Powerset with Complement NB:

In [21]:
model = LabelPowerset(ComplementNB())
model.fit(trainFeatures,labelsTrain)
test_preds = model.predict(testFeatures)
acc = accuracy_score(labelsTest, test_preds)
f1 = f1_score(labelsTest, test_preds, average="samples")
result = {"accuracy:": acc, "f1_score": f1}
print(result)

{'accuracy:': 0.6328910333735425, 'f1_score': 0.6806479407203171}


## Predict test set using Label Powerset with Complement NB:

### Model using full training data sample

Features and Tf-Idf for training sample: (Sample labels were calculated earlier: 'sampleLabels')

In [46]:
tfidf = TfidfVectorizer()
sampleTextfeatures = tfidf.fit_transform(training_sample['combined_text']).toarray()

sampleCoauthors = coauthorsFeatures(training_sample['authors'])
sampleVenues = venueFeatures(training_sample['venue'])

abstract_len = np.zeros(len(training_sample))
title_len = np.zeros(len(training_sample))
count = 0

for i in training_sample['abstract'].index:
    abstract_len[count] = len(training_sample['abstract'][i])
    title_len[count] = len(training_sample['title'][i])
    count += 1

abstract_len = pd.Categorical(abstract_len, categories=range(3000))
title_len = pd.Categorical(title_len, categories=range(110))

abstract_len = pd.get_dummies(abstract_len)
title_len = pd.get_dummies(title_len)



#sampleFeatures = np.concatenate((sampleCoauthors, sampleVenues, sampleTextfeatures,abstract_len,title_len), axis=1)
sampleFeatures = np.concatenate((sampleCoauthors, sampleVenues, sampleTextfeatures), axis=1)

Fitting model:

In [47]:
fullmodel = LabelPowerset(LinearSVC())

fullmodel.fit(sampleFeatures,sampleLabels)

LabelPowerset(classifier=LinearSVC(), require_dense=[True, True])

Training metrics:

In [48]:
fullmodel_predictions = fullmodel.predict(sampleFeatures)
acc = accuracy_score(sampleLabels, fullmodel_predictions)
f1 = f1_score(sampleLabels, fullmodel_predictions, average="samples")
result = {"accuracy:": acc, "f1_score": f1}
print(result)

{'accuracy:': 1.0, 'f1_score': 1.0}


Test Data:

In [20]:
testdf = pd.read_csv('test.csv')
testdf['abstract'] = testdf['abstract'].apply(lambda abstract: abstract.strip('[]').split(', '))
testdf['title'] = testdf['title'].apply(lambda title: title.strip('[]').split(', '))
testdf['coauthors'] = testdf['coauthors'].apply(lambda authors: authors.strip('[]').split(', '))

Test Features:

In [21]:
testdf['combined_text'] = (testdf.title+testdf.abstract).apply(lambda text: ' '.join(text))
testTextfeatures = tfidf.transform(testdf['combined_text']).toarray()

testCoauthors = coauthorsFeatures(testdf['coauthors'])
testVenues = venueFeatures(testdf['venue'])

test_abstract_len = np.zeros(len(testdf))
test_title_len = np.zeros(len(testdf))

for i in range(len(testdf)):
    test_abstract_len[i] = len(testdf.loc[i]['abstract'])
    test_title_len[i] = len(testdf.loc[i]['title'])

    
test_abstract_len = pd.Categorical(test_abstract_len, categories=range(3000))
test_title_len = pd.Categorical(test_title_len, categories=range(110))

test_abstract_len = pd.get_dummies(test_abstract_len)
test_title_len = pd.get_dummies(test_title_len)

#testFeatures = np.concatenate((testCoauthors, testVenues, testTextfeatures,test_abstract_len,test_title_len), axis=1)
testFeatures = np.concatenate((testCoauthors, testVenues, testTextfeatures), axis=1)

## Test predictions:

In [22]:
test_predictions = fullmodel.predict(testFeatures)

Create submission:

In [23]:
def createSubmission(predictions, testdf, filename):
    predDf = pd.DataFrame(data={'Id': [i for i in testdf['identifier']], 'Predict': [None for _ in range(testdf.shape[0])]})
    for i in range(testdf.shape[0]):
        if predictions[i,100] == 1:
            predDf['Predict'][i] = '-1'
            continue
        pred_str = ''
        for j in range(100):
            if predictions[i,j] == 1:
                pred_str = ' '.join((pred_str, str(j))).strip()
        if len(pred_str) == 0:
            pred_str = '-1'
        predDf['Predict'][i] = pred_str
    predDf.to_csv(filename, sep=',', index=False)
    print(f'Saved predictions to {filename}')
    return predDf                

In [24]:
createSubmission(test_predictions, testdf, 'Submission_LinearSVC.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predDf['Predict'][i] = pred_str
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predDf['Predict'][i] = '-1'


Saved predictions to Submission_SGD.csv


Unnamed: 0,Id,Predict
0,0,92
1,1,-1
2,2,31
3,3,23
4,4,32
...,...,...
795,795,54
796,796,97
797,797,13
798,798,71
