# Data Set link : https://www.kaggle.com/datasets/benhamner/nips-papers

In [6]:
import pandas as pd

In [7]:
# Load the NIPS papers table from disk and preview its first rows.
papers_csv = "DataSet/new_papers.csv"
df = pd.read_csv(papers_csv)
df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [8]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5000, 7)

In [9]:
# Keep only the first MAX_PAPERS rows to reduce the data size.
MAX_PAPERS = 5000
df = df.iloc[:MAX_PAPERS]

# NOTE: this writes to the same path the notebook loads its data from, so the
# file on disk is replaced by the truncated frame.
df.to_csv("DataSet/new_papers.csv", index=False)

# Placed last so the cell actually displays the resulting shape; a bare
# expression in the middle of a cell produces no output.
df.shape

In [10]:
# Inspect the raw text of the paper stored under row label 0.
df.loc[0, 'paper_text']

'767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABASE\nAND ITS APPLICATIONS\nHisashi Suzuki and Suguru Arimoto\nOsaka University, Toyonaka, Osaka 560, Japan\nABSTRACT\nAn efficient method of self-organizing associative databases is proposed together with\napplications to robot eyesight systems. The proposed databases can associate any input\nwith some output. In the first half part of discussion, an algorithm of self-organization is\nproposed. From an aspect of hardware, it produces a new style of neural network. In the\nlatter half part, an applicability to handwritten letter recognition and that to an autonomous\nmobile robot system are demonstrated.\n\nINTRODUCTION\nLet a mapping f : X -+ Y be given. Here, X is a finite or infinite set, and Y is another\nfinite or infinite set. A learning machine observes any set of pairs (x, y) sampled randomly\nfrom X x Y. (X x Y means the Cartesian product of X and Y.) And, it computes some\nestimate j : X -+ Y of f to make small, the estimation erro

## Processing and Extracting the data from paper text
### Steps Include
### 1. Lower Case
### 2. Remove HTML
### 3. Remove Special Characters
### 4. Tokenization
### 5. Remove Stopwords
### 6. Remove words shorter than 3 letters
### 7. Stemming (Porter stemmer)

In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


In [12]:
# Start from NLTK's English stop-word list, held as a set.
english_stop_words = stopwords.words('english')
Stop_Words = set(english_stop_words)

In [13]:
# A few extra words that occur in the documents and act like stop words.
new_words = ["fig", "figure", "sample","using", "image", "show", "result", "large", "also", "one", "two","three","four","five","six","seven","eight","nine"]

# set(...) makes this cell safe to re-run: after the first run Stop_Words is
# already a list, and lists have no .union(). The result stays a list, as before.
Stop_Words = list(set(Stop_Words).union(new_words))

In [14]:
def processing_text(txt, min_length=3):
    """Clean one raw document and return its stemmed tokens.

    Steps: lower-case, strip HTML tags, replace every non-letter character
    with a space, tokenize, drop stop words, drop words shorter than
    ``min_length`` letters, and reduce each remaining word to its Porter stem.

    Parameters
    ----------
    txt : str
        Raw document text. Any non-string value (e.g. NaN) yields an
        empty list instead of raising.
    min_length : int, default 3
        Minimum number of letters a word must have to be kept.

    Returns
    -------
    list of str
        Stemmed tokens in document order.
    """
    if not isinstance(txt, str):
        return []

    txt = txt.lower()
    txt = re.sub(r'<.*?>', ' ', txt)      # remove HTML tags such as <p> and </p>
    txt = re.sub(r'[^a-zA-Z]', ' ', txt)  # digits, punctuation and symbols -> space
    tokens = nltk.word_tokenize(txt)      # split the cleaned text into words

    # Build a set once per call so each membership test is O(1) even when the
    # module-level Stop_Words is a list.
    stop_words = set(Stop_Words)
    tokens = [word for word in tokens
              if word not in stop_words and len(word) >= min_length]

    stemming = PorterStemmer()
    return [stemming.stem(word) for word in tokens]  # loving -> love


processing_text("hello dear, how are you <p> I am fine loving</p>")

['hello', 'dear', 'fine', 'love']

In [15]:
# Run the text-cleaning function over every paper's full text.
docs = df['paper_text'].apply(processing_text)

KeyboardInterrupt: 

## Using TF-IDF
### TF-IDF stands for term frequency–inverse document frequency. A word's importance increases in proportion to the number of times it appears in a document, but is offset by the number of documents in the corpus that contain it.

### CountVectorizer
### For this task, use `CountVectorizer` from scikit-learn

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# n-grams are runs of consecutive words: unigram "love", bigram "i love",
# trigram "i love to". ngram_range=(1, 3) counts all three sizes.
# NOTE(review): an integer max_df is an absolute document count (terms found in
# more than 95 documents are ignored); use 0.95 if a 95% share was intended.
cv = CountVectorizer(max_df=95, max_features=5000, ngram_range=(1, 3))

# processing_text returns a list of tokens per document, but CountVectorizer
# expects one string per document, so re-join the tokens with spaces first.
joined_docs = docs.apply(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
)
word_count_vectors = cv.fit_transform(joined_docs)

NameError: name 'docs' is not defined

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Instantiate the TfidfTransformer class (the original called the not-yet-defined
# variable name), then fit that instance on the n-gram count matrix.
Tfidf_Transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
Tfidf_Transformer = Tfidf_Transformer.fit(word_count_vectors)  # fit() returns the fitted instance