In [1]:
import itertools
import json
from multiprocessing import Pool
import pickle
import re
import string
from typing import List, Dict, Any, Union

from gensim.models import Word2Vec
import gensim.corpora.textcorpus
import gensim.parsing.preprocessing
from nltk.corpus import stopwords

# Type alias for the parsed dataset: a list of records, each mapping a field
# name to an int, a string or a list of strings.
Dataset = List[Dict[str, Union[int, str, List[str]]]]

In [2]:
def load_dataset(dataset_file: str='../data/dataset.json') -> Dataset:
    """Read the JSON dataset from disk and return its parsed content.

    Args:
        dataset_file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 (the standard JSON encoding). Without it,
    # open() uses the platform locale and can garble or reject non-ASCII text.
    with open(dataset_file, encoding='utf-8') as datafile:
        return json.load(datafile)

def split_sentences(text: str) -> List[str]:
    """Break ``text`` into pieces at every period.

    Each '.' counts as a boundary, so abbreviations such as "e.g." are split
    as well, and text that ends with a period yields a trailing empty string.
    Leading and trailing whitespace is removed from every piece.
    """
    return list(map(str.strip, text.split('.')))


def extract_titles(data: Dataset) -> List[List[str]]:
    """Return, for every record, its lower-cased title split into sentences."""
    titles = []
    for record in data:
        lowered_title = record['title'].lower()
        titles.append(split_sentences(lowered_title))
    return titles


def extract_abstracts(data: Dataset) -> List[List[str]]:
    """Return, for every record, its lower-cased description split into sentences."""
    def sentences_of(record):
        # The abstract text is stored under the 'description' key.
        return split_sentences(record['description'].lower())

    return list(map(sentences_of, data))

In [3]:
# Load the records from the default dataset path
data = load_dataset()

In [4]:
# Show the first record to see which fields the dataset provides
data[0]

{'description': 'The production process of almost all modern steels involves austenitization formation of the austenite phase upon continuous heating. Many of the microstructural features and properties that are obtained upon subsequent cooling are to a large extend determined by the evolution of the microstructure and chemical inhomogeneities during austenitization. In spite of its importance, austenitization so far has received much less attention than the transformations on cooling; however, the interest is continuously increasing, especially for the development of new types of steels (Dual-Phase steel, TRansformation-Induced Plasticity steel etc.). The aim of the thesis is to develop knowledge and to gain better understanding of the formation of the austenite microstructure in steel during heating, e.g. austenite nucleation kinetics, austenite growth modes and morphologies, redistribution of carbon between the phases during the transformatio',
 'abstract_word_count': 130,
 'title':

In [5]:
# Pool the per-record sentence lists: all abstracts first, then all titles
all_ = [*extract_abstracts(data), *extract_titles(data)]

In [6]:
# At this point each element is the list of sentences of one abstract or title
all_[0]

['the production process of almost all modern steels involves austenitization formation of the austenite phase upon continuous heating',
 'many of the microstructural features and properties that are obtained upon subsequent cooling are to a large extend determined by the evolution of the microstructure and chemical inhomogeneities during austenitization',
 'in spite of its importance, austenitization so far has received much less attention than the transformations on cooling; however, the interest is continuously increasing, especially for the development of new types of steels (dual-phase steel, transformation-induced plasticity steel etc',
 ')',
 'the aim of the thesis is to develop knowledge and to gain better understanding of the formation of the austenite microstructure in steel during heating, e',
 'g',
 'austenite nucleation kinetics, austenite growth modes and morphologies, redistribution of carbon between the phases during the transformatio']

In [7]:
# Merge the per-record lists into one flat list of sentences
all_ = [sentence for record_sentences in all_ for sentence in record_sentences]

In [8]:
# After flattening, each element is a single sentence string
all_[0]

'the production process of almost all modern steels involves austenitization formation of the austenite phase upon continuous heating'

In [9]:
# Replace every non-ASCII character with a single space
non_ascii_pattern = re.compile(r'[^\x00-\x7f]')
all_ = [non_ascii_pattern.sub(r' ', sentence) for sentence in all_]

In [10]:
# Spot-check one sample sentence before digits and punctuation are stripped
all_[10]

'reported results show that the investigated bainite-austenite steel can be used for constructing add-on armour and that the armour fulfils requirements of protection level 2 of stanag 4569'

In [11]:
# Delete every run of digits; the whitespace around a removed number stays
numbers_pattern = re.compile(r'\d+')
all_ = [
    numbers_pattern.sub(r'', text)
    for text in all_
]

In [12]:
# Collapse runs of whitespace, e.g. the gaps left behind by removed numbers
all_ = list(map(gensim.corpora.textcorpus.strip_multiple_whitespaces, all_))

In [13]:
# Strip punctuation from every sentence (a hyphenated word becomes two words)
all_ = list(map(gensim.parsing.preprocessing.strip_punctuation2, all_))

In [14]:
# Same sample sentence after digit, whitespace and punctuation clean-up
all_[10]

'reported results show that the investigated bainite austenite steel can be used for constructing add on armour and that the armour fulfils requirements of protection level of stanag '

In [15]:
# NLTK's English stop-word list, held in a set for fast membership tests
stops = set(stopwords.words("english"))

# Tokenise each sentence on whitespace and keep only the non-stop-word tokens
all_ = [
    [token for token in sentence.split() if token not in stops]
    for sentence in all_
]

In [16]:
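# NOTE(review): short-token filtering is currently disabled; this cell is
# commented out and has no effect. Re-enable it or delete the cell.
# all_ = [
#         gensim.corpora.textcorpus.remove_short(sentence, minsize=2)
#         for sentence in all_
#     ]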

In [17]:
# Same sample again: now a list of tokens with English stop words removed
all_[10]

['reported',
 'results',
 'show',
 'investigated',
 'bainite',
 'austenite',
 'steel',
 'used',
 'constructing',
 'add',
 'armour',
 'armour',
 'fulfils',
 'requirements',
 'protection',
 'level',
 'stanag']

In [18]:
# Drop sentences that have no tokens left after cleaning
all_ = [tokens for tokens in all_ if tokens]

In [19]:
# Serialise the token lists to disk with pickle (binary mode is required)
with open('../data/preprocessed_dataset.pickle', 'wb') as pickle_file:
    pickle.dump(all_, pickle_file)