**IMPORTING LIBRARIES**

In [1]:
import pandas as pd
import numpy as np


**IMPORTING DATA**

In [2]:
# Load the papers dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('papers.csv')
# Print (rows, columns) as a quick sanity check on the load.
print(df.shape)
# Last expression of the cell: renders a preview of the first rows.
df.head()

(7241, 7)


Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


**PREPROCESSING DATA**

In [3]:
import re 
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Fetch the NLTK resources used later in this notebook: the stopword corpus
# (read via `stopwords.words("english")`) and the 'punkt_tab' tokenizer data
# (presumably what `nltk.word_tokenize` needs -- confirm against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\13zer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\13zer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# This cell rebinds the name `stopwords` from the imported corpus reader to a
# plain list of words. Read the list through the fully qualified
# `nltk.corpus.stopwords` path so the cell stays re-runnable: calling
# `stopwords.words(...)` on the already-rebound list would raise AttributeError.
stopwords = nltk.corpus.stopwords.words("english")
print(stopwords)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [None]:
def preprocessing(txt):
    """Normalize raw text into a space-joined string of stemmed keywords.

    Steps, in order: replace every character that is not an ASCII letter with
    a space, lowercase, tokenize with ``nltk.word_tokenize``, drop tokens of
    three characters or fewer, drop tokens found in the module-level
    ``stopwords`` collection, and Porter-stem the remaining tokens.

    Parameters
    ----------
    txt : str
        Raw document text.

    Returns
    -------
    str
        The surviving stems joined by single spaces (empty string if none).
    """
    # Build the lookup set once per call so each membership test is O(1),
    # instead of re-creating the set for every token.
    stop_set = set(stopwords)
    ps = PorterStemmer()

    txt = re.sub('[^a-zA-Z]', ' ', txt)
    txt = txt.lower()
    tokens = nltk.word_tokenize(txt)
    # The length check runs first, so short stopwords never reach the set lookup.
    stems = [
        ps.stem(word)
        for word in tokens
        if len(word) > 3 and word not in stop_set
    ]
    return ' '.join(stems)


In [6]:
# Run the text-normalization pipeline over every paper body; the function is
# passed directly to `apply`, which calls it once per element.
doc = df['paper_text'].apply(preprocessing)
# Last expression of the cell: display the resulting Series.
doc

0       self organ associ databas applic hisashi suzuk...
1       mean field theori layer visual cortex applic a...
2       store covari associ long term potenti depress ...
3       bayesian queri construct neural network model ...
4       neural network ensembl cross valid activ learn...
                              ...                        
7236    singl transistor learn synaps paul hasler chri...
7237    bia varianc combin least squar estim ronni mei...
7238    real time cluster cmo neural engin serrano got...
7239    learn direct global motion class psychophys mo...
7240    correl interpol network real time express anal...
Name: paper_text, Length: 7241, dtype: object

**USING COUNTVECTORIZER TO GET WORD OCCURRENCE COUNTS AS VECTORS**

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

# max_df=0.95      -> ignore terms that appear in more than 95% of the documents.
# max_features=5000 -> cap the vocabulary at 5000 terms.
# ngram_range=(1,2) -> extract unigrams and bigrams (not trigrams).
count_vectorizer = CountVectorizer(max_df=0.95, max_features=5000 ,ngram_range = (1,2))
# Learn the vocabulary from the preprocessed documents and build the
# document-term count matrix in one step.
word_count_vector = count_vectorizer.fit_transform(doc)
word_count_vector


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 4620512 stored elements and shape (7241, 5000)>

**RANKING WORDS BY IMPORTANCE (TF-IDF)**

In [38]:
from sklearn.feature_extraction.text import TfidfTransformer

# use_idf=True enables inverse-document-frequency reweighting, so terms that
# occur in fewer documents receive a higher weight.
# smooth_idf=True smooths the document frequencies to prevent division by zero.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True) 
# Fit on the count matrix produced by the CountVectorizer cell; `tfidf` holds
# whatever `fit` returns (in scikit-learn, the fitted transformer itself).
tfidf = tfidf_transformer.fit(word_count_vector)
tfidf


**GET FEATURE NAMES**

In [40]:
# Terms (n-grams) of the fitted vocabulary, as returned by the vectorizer.
feature_names=count_vectorizer.get_feature_names_out()

**EXPORTING MODELS**

In [41]:
import pickle

# Persist the fitted artifacts. Each file is opened in a `with` block so the
# handle is flushed and closed deterministically, even if `pickle.dump` raises.
with open("tfidf.pkl", "wb") as fh:
    pickle.dump(tfidf, fh)
with open("count_vectorizer.pkl", "wb") as fh:
    pickle.dump(count_vectorizer, fh)
with open("feature_names.pkl", "wb") as fh:
    pickle.dump(feature_names, fh)