In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords

In [2]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [3]:
df = data["ABSTRACT"]
df

0          Predictive models allow subject-specific inf...
1          Rotation invariance and translation invarian...
2          We introduce and develop the notion of spher...
3          The stochastic Landau--Lifshitz--Gilbert (LL...
4          Fourier-transform infra-red (FTIR) spectra o...
                               ...                        
20967      Machine learning is finding increasingly bro...
20968      Polycrystalline diamond coatings have been g...
20969      We present a new approach for identifying si...
20970      The sum of Log-normal variates is encountere...
20971      Recently, optional stopping has been a subje...
Name: ABSTRACT, Length: 20972, dtype: object

In [4]:
#preprocessing data
def preprocess(text):
    token = [nltk.word_tokenize(word) for word in text]
    stop = set(stopwords.words("english"))
    token_1 = []
    for words in token:
        l = []
        for w in words:
            if w not in stop:
                l.append(w)
        token_1.append(l)
    punct = set(string.punctuation)
    token_2 = []
    for words in token_1:
        l = []
        for w in words:
            if(w not in punct):
                l.append(w)
        token_2.append(l)
    preprocess_ = []
    for i in token_2:
        preprocess_.append(" ".join(i))
    return preprocess_

In [5]:
preprocess_data = preprocess(df)
preprocess_data[1]

'Rotation invariance translation invariance great values image recognition tasks In paper bring new architecture convolutional neural network CNN named cyclic convolutional layer achieve rotation invariance 2-D symbol recognition We also get position orientation 2-D symbol network achieve detection purpose multiple non-overlap target Last least architecture achieve one-shot learning cases using invariance'

In [6]:
#vectorization
tfidf = TfidfVectorizer(stop_words = "english",max_df = .80,max_features=5000,use_idf = False,norm = None)
tf_vectors = tfidf.fit_transform(preprocess_data)

In [7]:
#modal training
lda_model = LatentDirichletAllocation(n_components=6, random_state=42)

In [8]:
lda_model.fit(tf_vectors)

In [9]:
lda_model.components_

array([[  0.16721459,   0.16670807,   0.16753603, ..., 105.21298685,
          0.16668272,   0.16703793],
       [ 74.07006683,   0.16721701,   1.43885306, ...,   0.16673108,
          0.16670005,   2.83625099],
       [ 30.91039945,   0.16677516,   0.16850936, ...,   0.16676447,
          0.16667221,   0.1670614 ],
       [  0.16797616,   0.16692162,   0.16734054, ...,   0.16710724,
          0.1668471 ,   5.30623839],
       [ 15.87938477,  69.16529373,  27.02480069, ...,  25.11948403,
         50.16613079, 104.42729261],
       [ 73.80495821,   0.16708441,  34.03296032, ...,   0.16692633,
          0.16696712,  46.09611868]])

In [10]:
lda_model.components_.shape

(6, 5000)

In [11]:
text = ["involving both unsupervised and supervised\nsettings.oatings have been g...We present a new approach for identifying si...The sum of Log-normal variates is encountere As a result, we achieved state-of-the-art results on all three of\nthese tasks. Our code and trained"]
cleaned_data = preprocess(text)
cleaned_data

['involving unsupervised supervised settings.oatings g ... We present new approach identifying si ... The sum Log-normal variates encountere As result achieved state-of-the-art results three tasks Our code trained']

In [12]:
vector = tfidf.transform(cleaned_data)

In [18]:
topic_distribution = lda_model.transform(vector)
predicted_topic = np.argmax(topic_distribution)
predicted_topic
        
    

2

In [22]:
top_topic_words = [tfidf.get_feature_names_out()[i] for i in lda_model.components_[predicted_topic].argsort()[-10:][::-1]]
textual_info = "The top words in the predicted topic are: " + ", ".join(top_topic_words)
textual_info

'The top words in the predicted topic are: learning, algorithm, method, network, based, data, problem, networks, neural, model'

In [24]:
text_1 = ["c relation. Using superior measures (i.e., with lower measurement error than existing measures) reduces attenuation bias, which leads to more precise estimates of the parameters describing an economic relationship. Novel measures enable new analyses with previously unmeasurable economic aspects. In the main analysis, most studies that construct ML-based measures apply traditional econometric methods such as linear regression with OLS.Table 2 presents a selection of studies that use ML to construct superior or novel measures. In the following, we present them in three categories: (1) measures of sentiment, (2) measures of corporate executives’ characteristics, and (3) measures of firm characteristics"]
test_data = preprocess(text_1)
test_data

['c relation Using superior measures i.e. lower measurement error existing measures reduces attenuation bias leads precise estimates parameters describing economic relationship Novel measures enable new analyses previously unmeasurable economic aspects In main analysis studies construct ML-based measures apply traditional econometric methods linear regression OLS.Table 2 presents selection studies use ML construct superior novel measures In following present three categories 1 measures sentiment 2 measures corporate executives ’ characteristics 3 measures firm characteristics']

In [25]:
vector_test = tfidf.transform(test_data)

In [26]:
topic_distribution_test = lda_model.transform(vector_test)
predicted_topic_test = np.argmax(topic_distribution_test)
predicted_topic_test
        

1

In [28]:
top_topic_words_test = [tfidf.get_feature_names_out()[i] for i in lda_model.components_[predicted_topic_test].argsort()[-10:][::-1]]
textual_info_test = "The top words in the predicted topic are: " + ", ".join(top_topic_words_test)
textual_info_test

'The top words in the predicted topic are: data, model, based, paper, models, approach, using, information, analysis, used'