## Creating the data for training

This notebook illustrates the process of creating a dataset from the arXiv.csv file

In [1]:
import pandas as pd
import re
import time

In [2]:
class dictionary(dict):
    """
    Extends python dictionary in order to have
    index --> word
    but also
    word --> index
    """
    def __init__(self):
        super(dictionary, self).__init__()
        self.index = {}
        self.size = 0
    
    def __setitem__(self, key, value):
        super(dictionary, self).__setitem__(key, value)
        self.index[value] = key
        self.size += 1
    
    def __delitem__(self, key):
        value = super().pop(key)
        ignore = self.index.pop(value)
        self.size -=1

In [3]:
def process_corpus(corpus, context_size, dictionary, fixed_dictionary=False):
    list_of_points = []
    for document in corpus:
        list_of_points += process_document(document, context_size, dictionary, fixed_dictionary)
    return list_of_points


def process_document(document, context_size, dictionary, fixed_dictionary=False):
    """
    Given a dictionary, extract the tuples of words of length equal to
    context_size. Each word is represented by a unique integer number.
    If fixed_dictionary is True, only take consecutive tuples of words 
    being (all of them) in the dictionary.
    Example: 
        document = "This is a new document"
        context_size = 4
        dictionary = {
            0: "this",
            1: "is",
            2: "a",
            3: "new",
            4: "document"
        }
        
        return
            [(0, 1, 2, 3), (1, 2, 3, 4)]
    """
    text = document.lower()
    p = re.compile("[a-z]+")
    tokens = p.findall(text)
    list_of_points = []
    for i in range(len(tokens) - context_size + 1):
        data_point = [0 for l in range(context_size)]
        add_new_data_point = True
        for j in range(context_size):
            k = i+j
            if tokens[k] not in dictionary.index:
                if fixed_dictionary:
                    # only takes series of words in the dictionary
                    add_new_data_point = False
                    break
                else:
                    new_Ix = dictionary.size
                    dictionary[new_Ix] = tokens[k]
            data_point[j] = dictionary.index[tokens[k]]
        if add_new_data_point:
            list_of_points.append(tuple(data_point))
    return list_of_points
        


In [4]:
# Define some important values
CONTEXT_SIZE = 4
DICT_SIZE = 17000

### Read the arXiv dataset

In [5]:
#data = pd.read_csv("./arxiv_articles.csv", sep="|")
data = pd.read_csv("./arxiv_articles_sample.csv", sep="|")

In [6]:
data.head()

Unnamed: 0,id,title,authors,arxiv_primary_category,summary,published,updated,general_category
0,http://arxiv.org/abs/1502.02721v1,In-situ measurements of the radiation stabilit...,P. A. Gerakines;R. L. Hudson;M. H. Moore;J. -L...,astro-ph.IM,We present new kinetics data on the radiolytic...,2015-02-09T23:03:10Z,2015-02-09T23:03:10Z,astro-ph
1,http://arxiv.org/abs/1503.01540v1,Finding meteorite impacts in Aboriginal oral t...,Duane W. Hamacher,physics.hist-ph,Aboriginal stories dating back many thousands ...,2015-03-05T05:21:09Z,2015-03-05T05:21:09Z,physics
2,http://arxiv.org/abs/1909.09824v1,Desperate times call for desperate measures: g...,Sokbae Lee;Yuan Liao;Myung Hwan Seo;Youngki Shin,econ.GN,We investigate state-dependent effects of fisc...,2019-09-21T13:42:17Z,2019-09-21T13:42:17Z,econ
3,http://arxiv.org/abs/1809.07292v2,Online control of the false discovery rate in ...,David S. Robertson;James M. S. Wason,stat.ME,Modern biomedical research frequently involves...,2018-09-19T16:37:46Z,2018-09-26T17:08:01Z,stat
4,http://arxiv.org/abs/1902.10021v1,Gig Economy: A Dynamic Principal-Agent Model,Zsolt Bihary;Péter Kerényi,econ.GN,"The gig economy, where employees take short-te...",2019-02-26T16:05:06Z,2019-02-26T16:05:06Z,econ


In [7]:
s = time.time()
mydict = dictionary()
dataset = process_corpus(data['summary'], CONTEXT_SIZE, mydict)
t = time.time() - s
print("Done in {} seconds".format(int(t)))

Done in 0 seconds


In [8]:
mydict.size

3218

In [9]:
len(dataset)

13528

In [10]:
dataset[:10]

[(0, 1, 2, 3),
 (1, 2, 3, 4),
 (2, 3, 4, 5),
 (3, 4, 5, 6),
 (4, 5, 6, 7),
 (5, 6, 7, 8),
 (6, 7, 8, 9),
 (7, 8, 9, 10),
 (8, 9, 10, 11),
 (9, 10, 11, 12)]

In [11]:
data_df = pd.DataFrame(dataset)

In [12]:
data_df.head()

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,1,2,3,4
2,2,3,4,5
3,3,4,5,6
4,4,5,6,7


In [13]:
# Count the number of occurrences for each word
word_counts = data_df.iloc[:, 0].value_counts()
word_counts

6       924
9       607
22      355
76      320
65      313
       ... 
2058      1
11        1
2066      1
1517      1
2011      1
Name: 0, Length: 3179, dtype: int64

In [14]:
w1 = word_counts.keys()[0]
w2 = word_counts.keys()[1]
w3 = word_counts.keys()[2]
w4 = word_counts.keys()[3]
print("These are the four most frequent words: {}".format((mydict[w1], mydict[w2], mydict[w3], mydict[w4])))

These are the four most frequent words: ('the', 'of', 'and', 'a')


In [15]:
# Now, we want to keep only a subset of all the words
# we define a fixed size for the dictionary and we 
# keep the words, starting from the most frequent ones

words2keep = word_counts.keys()[:DICT_SIZE]

In [16]:
words2keep

Int64Index([   6,    9,   22,   76,   65,   13,  102,    0,   49,  130,
            ...
            2018, 2026, 2034, 2050, 1549, 2058,   11, 2066, 1517, 2011],
           dtype='int64', length=3179)

In [17]:
# Now, we create a new dictionary with the
# words selected in the previous step
new_dictionary = dictionary()
for i in range(len(words2keep)):
    new_dictionary[i] = mydict[words2keep[i]]

In [18]:
# With the new dictionary, build the new training dataset

# Creating the training dataset using series of 4 words 
# appearing in the text
s = time.time()
new_dataset = process_corpus(data['summary'], CONTEXT_SIZE, new_dictionary, fixed_dictionary=True)
t = time.time() - s
print("Done in {} seconds".format(int(t)))

Done in 0 seconds


In [19]:
len(new_dataset)

13463

In [20]:
new_dictionary.size

3179

In [21]:
new_dataset[:10]

[(7, 86, 40, 1175),
 (86, 40, 1175, 19),
 (40, 1175, 19, 11),
 (1175, 19, 11, 0),
 (19, 11, 0, 1670),
 (11, 0, 1670, 2032),
 (0, 1670, 2032, 1),
 (1670, 2032, 1, 223),
 (2032, 1, 223, 3175),
 (1, 223, 3175, 1037)]

### Illustrate how the data is encoded

In [22]:
data['summary'][0]

'We present new kinetics data on the radiolytic destruction of amino acids measured in situ with infrared spectroscopy. Samples were irradiated at 15, 100, and 140 K with 0.8-MeV protons, and amino-acid decay was followed at each temperature with and without H$_2$O present. Observed radiation products included CO$_2$ and amines, consistent with amino-acid decarboxylation. The half-lives of glycine, alanine, and phenylalanine were estimated for various extraterrestrial environments. Infrared spectral changes demonstrated the conversion from the non-zwitterion structure NH$_2$-CH$_2$(R)-COOH at 15 K to the zwitterion structure $^+$NH$_3$-CH$_2$(R)-COO$^-$ at 140 K for each amino acid studied.'

In [23]:
first_words = new_dataset[0]
print([new_dictionary[i] for i in first_words])

['we', 'present', 'new', 'kinetics']


In [24]:
new_dictionary

{0: 'the',
 1: 'of',
 2: 'and',
 3: 'a',
 4: 'to',
 5: 'in',
 6: 'is',
 7: 'we',
 8: 'for',
 9: 'that',
 10: 'this',
 11: 'on',
 12: 'are',
 13: 'with',
 14: 'by',
 15: 'from',
 16: 'as',
 17: 'be',
 18: 'which',
 19: 'data',
 20: 'an',
 21: 'model',
 22: 'can',
 23: 'method',
 24: 'it',
 25: 'our',
 26: 'at',
 27: 'based',
 28: 'results',
 29: 'two',
 30: 'paper',
 31: 'also',
 32: 'have',
 33: 'these',
 34: 'using',
 35: 's',
 36: 'show',
 37: 'methods',
 38: 'time',
 39: 'models',
 40: 'new',
 41: 'not',
 42: 'such',
 43: 'mass',
 44: 'when',
 45: 'high',
 46: 'all',
 47: 'between',
 48: 'than',
 49: 'c',
 50: 'more',
 51: 'selection',
 52: 'both',
 53: 'class',
 54: 'm',
 55: 'one',
 56: 'problem',
 57: 'case',
 58: 'most',
 59: 'large',
 60: 'where',
 61: 'well',
 62: 'first',
 63: 'other',
 64: 'but',
 65: 'through',
 66: 'estimation',
 67: 'distribution',
 68: 'non',
 69: 'has',
 70: 'proposed',
 71: 'learning',
 72: 'algorithms',
 73: 'number',
 74: 'studies',
 75: 'will',
 76: