In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

import re

In [2]:
train_data = pd.read_csv(r"C:\Users\TEJKIRAN\Desktop\DataAnalytics_files\tag_prediction\Train.csv") 
test_data = pd.read_csv(r"C:/Users/TEJKIRAN/Desktop/DataAnalytics_files/tag_prediction/Test.csv")

In [3]:
print("Train size:", train_data.shape)
print("Test size:", test_data.shape)

Train size: (14004, 31)
Test size: (6002, 6)


In [4]:
test_data.head()

Unnamed: 0,id,ABSTRACT,Computer Science,Mathematics,Physics,Statistics
0,9409,fundamental frequency (f0) approximation from ...,0,0,0,1
1,17934,"this large-scale study, consisting of 24.5 mil...",1,0,0,1
2,16071,we present a stability analysis of the plane c...,0,0,1,0
3,16870,we construct finite time blow-up solutions to ...,0,1,0,0
4,10496,planetary nebulae (pne) constitute an importan...,0,0,1,0


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14004 entries, 0 to 14003
Data columns (total 31 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   id                                            14004 non-null  int64 
 1   ABSTRACT                                      14004 non-null  object
 2   Computer Science                              14004 non-null  int64 
 3   Mathematics                                   14004 non-null  int64 
 4   Physics                                       14004 non-null  int64 
 5   Statistics                                    14004 non-null  int64 
 6   Analysis of PDEs                              14004 non-null  int64 
 7   Applications                                  14004 non-null  int64 
 8   Artificial Intelligence                       14004 non-null  int64 
 9   Astrophysics of Galaxies                      14004 non-null  int64 
 10

In [6]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6002 entries, 0 to 6001
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                6002 non-null   int64 
 1   ABSTRACT          6002 non-null   object
 2   Computer Science  6002 non-null   int64 
 3   Mathematics       6002 non-null   int64 
 4   Physics           6002 non-null   int64 
 5   Statistics        6002 non-null   int64 
dtypes: int64(5), object(1)
memory usage: 281.5+ KB


In [7]:
train_data.shape[0]

14004

In [8]:
topic_cols = ['Computer Science', 'Mathematics', 'Physics', 'Statistics']
target_cols = ['Analysis of PDEs', 'Applications','Artificial Intelligence', 'Astrophysics of Galaxies','Computation and Language', 'Computer Vision and Pattern Recognition','Cosmology and Nongalactic Astrophysics','Data Structures and Algorithms', 'Differential Geometry','Earth and Planetary Astrophysics', 'Fluid Dynamics','Information Theory', 'Instrumentation and Methods for Astrophysics','Machine Learning', 'Materials Science', 'Methodology', 'Number Theory','Optimization and Control', 'Representation Theory', 'Robotics','Social and Information Networks', 'Statistics Theory','Strongly Correlated Electrons', 'Superconductivity','Systems and Control']

In [9]:
my_list = []
for i in range(train_data.shape[0]):
    my_list.append(sum(train_data.iloc[i, 6:]))
pd.Series(my_list).value_counts()

1    9757
2    3744
3     465
4      38
dtype: int64

<b>De-contraction → Removing special chars → Removing stopwords →Stemming</b>

In [10]:
def decontracted(phrase):
  #specific
  phrase = re.sub(r"won't", "will not", phrase)
  phrase = re.sub(r"can't", "cannot", phrase)
  # general
  phrase = re.sub(r"n't", " not", phrase)
  phrase = re.sub(r"'re", " are", phrase)
  phrase = re.sub(r"'s", " is", phrase)
  phrase = re.sub(r"'d", " would", phrase)
  phrase = re.sub(r"'ll","will", phrase)
  phrase = re.sub(r"'t", " not", phrase)
  phrase = re.sub(r"'ve", " have", phrase)
  phrase = re.sub(r"'m", " am", phrase)
  phrase = re.sub(r"'em" ," them" ,phrase)
  return phrase

from wordcloud import WordCloud, STOPWORDS
stopwords = set(list(STOPWORDS))

def stemming(sentence):
  token_words = word_tokenize(sentence)
  stem_sentence = []
  for word in token_words:
    stemmer = PorterStemmer()
    stem_sentence.append(stemmer.stem(word))
    stem_sentence.append(" ")
  return "".join(stem_sentence)

def text_preprocessing(text):
  preprocessed_abstract = []
  for sentence in text:
    sent = decontracted(sentence) 
    sent = re.sub('[^A-Za-z0–9]+', ' ', sent)
    sent = ' '.join(e.lower() for e in sent.split() if e.lower() not in stopwords) 
    sent = stemming(sent)
    preprocessed_abstract.append(sent.strip())
  return preprocessed_abstract

In [11]:
train_data['preprocessed_abstract'] = text_preprocessing(train_data['ABSTRACT'].values)
train_data[['ABSTRACT', 'preprocessed_abstract']].head()

Unnamed: 0,ABSTRACT,preprocessed_abstract
0,a ever-growing datasets inside observational a...,grow dataset insid observ astronomi challeng s...
1,we propose the framework considering optimal $...,propos framework consid optim t match exclud p...
2,nanostructures with open shell transition meta...,nanostructur open shell transit metal molecula...
3,stars are self-gravitating fluids inside which...,star self gravit fluid insid pressur buoyanc r...
4,deep neural perception and control networks ar...,deep neural percept control network like key c...


In [12]:
X = train_data[['Computer Science', 'Mathematics', 'Physics', 'Statistics', 'preprocessed_abstract']]
y = train_data[target_cols]

from sklearn.model_selection import train_test_split
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size = 0.25, random_state = 21)
print(X_train.shape, y_train.shape)
print(X_cv.shape, y_cv.shape)

(10503, 5) (10503, 25)
(3501, 5) (3501, 25)


There are 10503 datapoints in our training set and ~3501 datapoints in the validation set.

In [13]:
vocab = list(train_data['preprocessed_abstract']) 
vocab

['grow dataset insid observ astronomi challeng scientist insid mani aspect includ effici interact data explor visual mani tool develop confront challeng usual focu display actual imag focu visual pattern within catalog insid predefin way insid paper introduc vizic python visual librari build connect imag catalog interact map sky region vizic visual catalog data custom background canva help shape size orient object insid catalog display object insid map highli interact customiz compar insid imag object filter color properti redshift magnitud sub select help lasso tool consid analysi help standard python function jupyt notebook furthermor vizic allow custom overlay append dynam top sky map initi implement sever overlay name voronoi delaunay minimum span tree healpix grid layer help consid visual larg scale structur overlay gener ad remov interact one line code catalog data store insid non relat databas interfac develop insid javascript python work within jupyt notebook allow creat custom

In [14]:
vectorizer = TfidfVectorizer(min_df = 5, max_df = 0.5, sublinear_tf = True, ngram_range = (1, 1))
vectorizer.fit(vocab)

TfidfVectorizer(max_df=0.5, min_df=5, sublinear_tf=True)

In [15]:
X_train_tfidf = vectorizer.transform(X_train['preprocessed_abstract'])

X_cv_tfidf = vectorizer.transform(X_cv['preprocessed_abstract'])

In [16]:
print(X_train_tfidf.shape, y_train.shape)
print(X_cv_tfidf.shape, y_cv.shape)

(10503, 7347) (10503, 25)
(3501, 7347) (3501, 25)


In [17]:
from scipy.sparse import hstack
X_train_data_tfidf = hstack((X_train[topic_cols], X_train_tfidf))
X_cv_data_tfidf = hstack((X_cv[topic_cols], X_cv_tfidf))

As we know, this is a multi-label classification problem and each document may have one or more predefined tags simultaneously. We already saw that several datapoints have 2 or 3 tags.

Most traditional machine learning algorithms are developed for single-label classification problems. Therefore a lot of approaches in the literature transform the multi-label problem into multiple single-label problems so that the existing single-label algorithms can be used.

One such technique is One-vs-the-rest (OvR) multiclass/multilabel classifier, also known as one-vs-all. In OneVsRest Classifier, we fit one classifier per class and it is the most commonly used strategy for multiclass/multi-label classification and is a fair default choice. For each classifier, the class is fitted against all the other classes. In addition to its computational efficiency, one advantage of this approach is its interpretability. Since each class is represented by one and one classifier only, it is possible to gain knowledge about the class by inspecting its corresponding classifier.

In [18]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
C_range = [0.01, 0.1, 1, 10, 100] 
for i in C_range:
  clf = OneVsRestClassifier(LogisticRegression(C = i, solver =  'sag'))
  clf.fit(X_train_data_tfidf, y_train)
  y_pred_train = clf.predict(X_train_data_tfidf)
  y_pred_cv = clf.predict(X_cv_data_tfidf)
  f1_score_train = f1_score(y_train, y_pred_train, average = 'micro')
  f1_score_cv = f1_score(y_cv, y_pred_cv, average = 'micro')
  print("C:", i, "Train Score:",f1_score_train, "CV Score:", f1_score_cv)
print("- "*50)

C: 0.01 Train Score: 0.2433689807262401 CV Score: 0.24125495852866932
C: 0.1 Train Score: 0.3252527573529412 CV Score: 0.31795575373006346




C: 1 Train Score: 0.7718917112299465 CV Score: 0.678254051228437




C: 10 Train Score: 0.9699750695523359 CV Score: 0.7475172333216498




C: 100 Train Score: 0.999857782834388 CV Score: 0.7412682265174636
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 


We can see that the highest validation score is obtained at C = 10. But the training score here is also very high, which was kind of expected.

In [19]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
C_range = [10, 20, 40, 70, 100] 
for i in C_range:
  clf = OneVsRestClassifier(LogisticRegression(C = i, solver =  'sag'))
  clf.fit(X_train_data_tfidf, y_train)
  y_pred_train = clf.predict(X_train_data_tfidf)
  y_pred_cv = clf.predict(X_cv_data_tfidf)
  f1_score_train = f1_score(y_train, y_pred_train, average = 'micro')
  f1_score_cv = f1_score(y_cv, y_pred_cv, average = 'micro')
  print("C:", i, "Train Score:",f1_score_train, "CV Score:", f1_score_cv)
print("- "*50)



C: 10 Train Score: 0.9699750695523359 CV Score: 0.7474299065420562




C: 20 Train Score: 0.9905115217236212 CV Score: 0.7483841181902124




C: 40 Train Score: 0.9979011774750097 CV Score: 0.7464530892448512




C: 70 Train Score: 0.9995377777777779 CV Score: 0.7424620267513036




C: 100 Train Score: 0.999857782834388 CV Score: 0.7406486608656345
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 


The model with C = 20 gives the best score on the validation set. So, going further, we take C = 20.

If you notice, here we have used the default L2 penalty for regularization as the model with L2 gave me the best result among L1, L2, and elastic-net mixing.

In [26]:
def get_best_thresholds(true, pred):
    thresholds = [i/100 for i in range(100)]
    best_thresholds = []
    for idx in range(25):
        f1_scores = [f1_score(true[:, idx], (pred[:, idx] > thresh) * 1) for thresh in thresholds]
        best_thresh = thresholds[np.argmax(f1_scores)]
        best_thresholds.append(best_thresh)
    return best_thresholds

In [28]:
clf = OneVsRestClassifier(LogisticRegression(C = 20, solver = 'sag'))
clf.fit(X_train_data_tfidf, y_train)
y_pred_train_proba = clf.predict_proba(X_train_data_tfidf)
y_pred_cv_proba = clf.predict_proba(X_cv_data_tfidf)
best_thresholds = get_best_thresholds(y_cv.values, y_pred_cv_proba)
print(best_thresholds)



[0.4, 0.29, 0.21, 0.33, 0.15, 0.28, 0.31, 0.25, 0.21, 0.37, 0.25, 0.16, 0.27, 0.44, 0.28, 0.17, 0.44, 0.32, 0.29, 0.27, 0.13, 0.61, 0.3, 0.39, 0.34]


In [35]:
y_pred_cv = np.empty_like(y_pred_cv_proba) 
for i, thresh in enumerate(best_thresholds):
    y_pred_cv[:, i] = (y_pred_cv_proba[:, i] > thresh) * 1
print(f1_score(y_cv, y_pred_cv, average = 'micro'))

0.7721700040866367


Thus, I have managed to obtain a significantly better score using the variable thresholds.

So far I have performed hyper-parameter tuning on the validation set and managed to obtain the optimal hyperparameter (C = 20). Also, I tweaked the thresholds and obtained the right set of thresholds for which the F1 score is maximum.

<b>Making a prediction on the test data using OneVsRest Classifier</b>

In [38]:
X_tr = train_data[['Computer Science', 'Mathematics', 'Physics', 'Statistics', 'preprocessed_abstract']]
y_tr = train_data[target_cols]
X_te = test_data[['Computer Science', 'Mathematics', 'Physics', 'Statistics', 'ABSTRACT']]


# text data encoding
vectorizer.fit(vocab)
X_tr_tfidf = vectorizer.transform(X_tr['preprocessed_abstract'])
X_te_tfidf = vectorizer.transform(X_te['ABSTRACT'])

# stacking 
X_tr_data_tfidf = hstack((X_tr[topic_cols], X_tr_tfidf))
X_te_data_tfidf = hstack((X_te[topic_cols], X_te_tfidf))

In [39]:
# modeling and making prediction with best thresholds
clf = OneVsRestClassifier(LogisticRegression(C = 20)) 
clf.fit(X_tr_data_tfidf, y_tr)
y_pred_tr_proba = clf.predict_proba(X_tr_data_tfidf)
y_pred_te_proba = clf.predict_proba(X_te_data_tfidf)
y_pred_te = np.empty_like(y_pred_te_proba)
for i, thresh in enumerate(best_thresholds):
    y_pred_te[:, i] = (y_pred_te_proba[:, i] > thresh) * 1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [41]:
y_pred_te[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0.])

In [42]:
X_te['ABSTRACT'][0]

'fundamental frequency (f0) approximation from polyphonic music includes a tasks of multiple-f0, melody, vocal, and bass line estimation. historically these problems have been approached separately, and only recently, with the help of learning-based approaches. we present the multitask deep learning architecture that jointly estimates outputs considering various tasks including multiple-f0, melody, vocal and bass line estimation, and was trained with the help of the large, semi-automatically annotated dataset. we show that a multitask model outperforms its single-task counterparts, and explore a effect of various design decisions inside our approach, and show that it performs better or at least competitively when compared against strong baseline methods.'

In [44]:
ss = pd.read_csv(r"C:\Users\TEJKIRAN\Desktop\DataAnalytics_files\tag_prediction\SampleSubmission.csv")
ss[target_cols] = y_pred_te
ss.to_csv(r"C:\Users\TEJKIRAN\Desktop\DataAnalytics_files\tag_prediction\LR_tfidf10k_L2_C20.csv", index = False)