# Mounting google drive

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load modules

In [0]:
import numpy as np
import pandas as pd

import re
import nltk
import unicodedata

# Load and explore data

In [0]:
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/text-generation/text-generation-word/nips.csv')
df = df[df['Abstract'] != "Abstract Missing"]
df = df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4933 entries, 0 to 4932
Data columns (total 3 columns):
Year        4933 non-null int64
Title       4933 non-null object
Abstract    4933 non-null object
dtypes: int64(1), object(2)
memory usage: 115.7+ KB


In [0]:
text = df['Abstract']
text.head()

0    Up-propagation is an algorithm for inverting ...
1    We have constructed an inexpensive video based...
2    Non-negative matrix factorization (NMF) has pr...
3    Spike-triggered averaging techniques are effec...
4    We consider continuous state, continuous actio...
Name: Abstract, dtype: object

In [0]:
corpus_init = ' '.join(list(text))
words_init = corpus_init.split()
n_words_init = len(words_init)
unique_words_init = sorted(list(set(words_init)))
n_unique_words_init = len(unique_words_init)
print("Total number of words before text preprocessing:", n_words_init)
print("Total number of unique words before text preprocessing: ", n_unique_words_init)
print(unique_words_init[:100])
print(unique_words_init[100:200])
print(unique_words_init[200:300])

Total number of words before text preprocessing: 732614
Total number of unique words before text preprocessing:  41882
['"', '"ALBO"', '"Air', '"Air\'\'', '"Answerer', '"DIRECT"', '"DUOL"', '"Expansion-Constrained', '"Generalized', '"Ghost', '"Graph', '"GuessWhat?!".', '"Hedge"', '"Hey', '"I', '"Ising\'\'', '"MNIST', '"Object', '"PixelGAN', '"Self-Expressiveness', '"Short-Dot"', '"TILT\'\'', '"Ugliness-in-Averageness"', '"What', '"additional', '"anti-Bayesian"', '"autotags")', '"averagers,"', '"batch"', '"best', '"body\'\'', '"body\'\',', '"building', '"bus', '"calibration', '"catalyst"', '"chill"', '"comparison', '"condition', '"context",', '"convergence', '"cooperative', '"correctness"', '"date"', '"deep', '"degrees', '"deltas",', '"denoise"', '"describing"', '"determinantal', '"disagreement', '"disappearance"', '"discriminating"', '"discriminative', '"early', '"edge', '"efficient"', '"em', '"equalized', '"exemplars"', '"expected', '"extended', '"external"', '"fair."', '"few-shot\'\'

# Text Preprocessing

In [0]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        if word != ',' or word != '.':
          new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
          new_words.append(new_word)
    return new_words

In [0]:
def clean_text(text):
  # noise removal
  text = re.sub(r'\bhttps?://\w+.+[^ ]\b', 'link', text) #replace url with "link"
  text = re.sub(r'[a-zA-Z0-9]*\.?github\.?[a-zA-Z0-9]*','github',text) #replace github links with "github"
  text = re.sub(r'\~\\cite\{[^}]*\}','cite',text) # remove cite in the format of "~\cite{DeSaOR16}"
  text = re.sub(r'\[[^]]*\]', '', text) # remove between square brackets
  text = re.sub(r'\([^)]*\)', '', text) # remove between parentheses
  text = re.sub(r'\{[^)]*\}', '', text) # remove between curly brackets
  
  # normalization
  text = text.lower() # convert to lowercase text
  
  text = re.sub(r'\-',' ', text) # seperate words like 'video-related'
  text = re.sub(r'\/',' ', text) # seperate words like 'descriptors/tags'
  text = re.sub(r'[^a-zA-Z0-9\s\.\,]', '', text) # remove punctuation 
  text = re.sub(r'seq2seq', 'seqtoseq', text)
  
  text = re.sub(r'[-+]?\d*\.?\d+', 'NUMBER', text) # replace numbers with "NUMBER"
  
  text = re.sub(r'\.{2,}', '', text) # remove '..','...'
  text = re.sub(r'\.', ' . ', text) # seperate '.' from text
  text = re.sub(r'\,' , ' , ', text) # seperate ',' from text
 
  text = ' '.join(remove_non_ascii(text.split())) # remove non-ascii words
  text = re.sub(r'[\w]*NUMBER[\w]*', 'NUMBER', text) # replace anyword containing "NUMBER" with "NUMBER"

  # convert from British English to American English
  text = re.sub(r'modelled', 'modeled', text) 
  text = re.sub(r'modelling', 'modeling', text)
  text = re.sub(r'parallelisation', 'parallelization', text) 
  text = re.sub(r'parallelising', 'parallelizing', text)
  text = re.sub(r'analysed', 'analyzed', text) 
  text = re.sub(r'generalised', 'generalized', text) 
  text = re.sub(r'maximisation', 'maximization', text)
  text = re.sub(r'recogniser', 'recognizer', text)
  text = re.sub(r'optimised', 'optimized', text)
  text = re.sub(r'analyse', 'analyze', text)
  text = re.sub(r'generalisation', 'generalization', text)
  text = re.sub(r'generalised', 'generalized', text)
  text = re.sub(r'factorisation', 'factorization', text)
  text = re.sub(r'behaviour', 'behavior', text)
  text = re.sub(r'interpretted', 'interpreted', text)
  text = re.sub(r'neighbouring', 'neighboring', text)
  text = re.sub(r'neighbour', 'neighbor', text)
  text = re.sub(r'neighbours', 'neighbors', text)
  text = re.sub(r'dependant', 'dependent', text)
  text = re.sub(r'localisation', 'localization', text)
  text = re.sub(r'amortised', 'amortized', text)
  text = re.sub(r'amortisation', 'amortization', text)
  text = re.sub(r'neutralising', 'neutralizing', text)
  text = re.sub(r'prioritised', 'prioritized', text)
  text = re.sub(r'characterised', 'characterized', text)
  text = re.sub(r'characterise', 'characterize', text)
  text = re.sub(r'centeralised', 'centeralized',text)
  text = re.sub(r'initialisation', 'initialization', text)
  text = re.sub(r'initialised', 'initialized', text)
  text = re.sub(r'regularisation', 'regularization', text)
  text = re.sub(r'regularised', 'regularized', text)
  text = re.sub(r'optimisation', 'optimization', text)
  text = re.sub(r'optimise', 'optimize', text)
  text = re.sub(r'minimisation', 'minimization', text)
  text = re.sub(r'generalises', 'generalizes', text)
  text = re.sub(r'parameterised', 'parameterized', text)
  text = re.sub(r'parameterises', 'parameterizes', text)
  text = re.sub(r'reparameterisation', 'reparameterization', text)
  text = re.sub(r'optimising', 'optimizing', text)
  text = re.sub(r'favourable', 'favorable', text)
  text = re.sub(r'hypothesised', 'hypothesized', text)
  text = re.sub(r'summarise', 'summarize', text)
  text = re.sub(r'standardised', 'standardized', text)
  text = re.sub(r'randomisation', 'randomization', text)
  text = re.sub(r'synchronisation', 'synchronization', text)
  text = re.sub(r'travelling', 'traveling', text)

  return text

In [0]:
text = text.apply(clean_text)

In [0]:
text_list = list(text)
corpus = ' '.join(text_list)
words = corpus.split()
n_words = len(words)
unique_words = sorted(list(set(words)))
n_unique_words = len(unique_words)
print("Total number of words:", n_words)
print("Total number of unique words: ", n_unique_words)
print(unique_words[:100])
print(unique_words[100:200])
print(unique_words[200:300])
print(unique_words[300:400])
print(unique_words[400:500])
print(unique_words[500:600])
print(unique_words[600:700])

Total number of words: 784009
Total number of unique words:  15032
[',', '.', 'NUMBER', 'a', 'aa', 'aaai', 'aalen', 'aaronson', 'ab', 'abandons', 'abbe', 'abc', 'abdominal', 'aberrant', 'abf', 'abilities', 'ability', 'ablation', 'able', 'abnormal', 'abnormalities', 'abnormality', 'abound', 'abounds', 'about', 'above', 'abovethreshold', 'abp', 'abrupt', 'abscissa', 'absence', 'absent', 'absolute', 'absolutely', 'absorb', 'absorbed', 'absorbing', 'absorption', 'abstain', 'abstaining', 'abstention', 'abstract', 'abstracted', 'abstraction', 'abstractions', 'abstractive', 'abstracts', 'abundance', 'abundancy', 'abundant', 'abuse', 'ac', 'academic', 'academics', 'accelerate', 'accelerated', 'accelerates', 'accelerating', 'acceleration', 'accelerators', 'accelerometers', 'accentuated', 'accept', 'acceptability', 'acceptable', 'acceptably', 'acceptance', 'accepted', 'accepts', 'access', 'accessed', 'accesses', 'accessibility', 'accessible', 'accessing', 'accident', 'accidental', 'accidents', '

# Save clean text to file

In [0]:
# save clean text to .txt for model training
text_str = '\n'.join(text_list)
with open("nips_clean.txt", 'w') as file:
  file.write(text_str)