In [4]:
# ALL IMPORTS

# -----------------------------------
# SpaCy IMPORTS
!pip install -U spacy

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
!python -m spacy download en_core_web_lg

# Check how many stopwords
#len(STOP_WORDS)

# Create a spaCy nlp object
nlp = spacy.load('en_core_web_lg')

# Check if a word is a stopword:
#nlp.vocab['thy'].is_stop

# Extend spaCy's English stop-word collection with archaic pronouns and
# contraction fragments that are not part of the default list.
new_stop_words = ['thy', 'ye', 'thee', 'thou', 'll', 've']
STOP_WORDS.update(new_stop_words)

# -----------------------------------
# TRANSFORMERS IMPORTS
#!pip install transformers datasets
!pip install --no-cache-dir transformers datasets sentencepiece

from transformers import (pipeline, 
                          AutoModel, 
                          AutoTokenizer, 
                          AutoModelForSeq2SeqLM, 
                          AutoModelForCausalLM, 
                          DataCollatorForLanguageModeling, 
                          TrainingArguments, 
                          Trainer,
)

from datasets import load_dataset

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2022-11-02 06:38:33.558179: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[K     |████████████████████████████████| 587.7 MB 16 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import os
!ls

nietzsche.txt  sample_data


# Topic Extractor + Summarization

## Load data / text preprocessing

In [6]:
# Load list of paragraphs
import os

def txt2paragraph(filepath):
    """Yield the paragraphs of a plain-text file, one string per paragraph.

    A paragraph is a run of consecutive non-blank lines; any number of
    whitespace-only lines separates two paragraphs. The lines of a paragraph
    are stripped and concatenated, each one preceded by a single space, so
    every yielded string starts with a space (callers strip it).

    Parameters:
        filepath: path of the text file to read (decoded as UTF-8).

    Yields:
        str: each non-empty paragraph, in file order. Nothing is yielded for
        an empty file or for trailing blank lines.
    """
    # Explicit encoding: the text contains typographic quotes and dashes, and
    # the platform default encoding is not UTF-8 everywhere.
    with open(filepath, encoding='utf-8') as f:
        lines = f.readlines()

    paragraph = ''
    for line in lines:
        if line.isspace():  # blank line: close the current paragraph, if any
            if paragraph:
                yield paragraph
                paragraph = ''
        else:
            paragraph += ' ' + line.strip()

    # Flush the last paragraph only when there is one; otherwise a file that
    # ends with blank lines (or is empty) would produce a spurious '' item.
    if paragraph:
        yield paragraph

# Collect every paragraph of the source text, trimmed of surrounding whitespace
raw_paragraphs = [par.strip() for par in txt2paragraph('nietzsche.txt')]

# Basic preprocessing
import re
def paragraphPreprocess(raw_paragraphs: list):
  """
  Clean a list of raw paragraphs and merge fragments that continue into the
  next paragraph.

  Input: raw_paragraphs - list of paragraph strings
  Returns: list of cleaned paragraph strings

  Rules applied to each paragraph, in order:
  - it is skipped when it is shorter than 200 characters and no fragment is
    pending, or when it contains "Footnote", "NOTE" or "Nietzsche";
  - leading non-alphabetic characters are removed;
  - dashes become spaces; curly closing quotes/apostrophes, bracketed or
    parenthesised text and the characters _ ' { } ( ) … = " are removed;
  - a paragraph ending in ':', ';', ',' or a letter is treated as unfinished
    and is prepended to the next kept paragraph.
  An unfinished fragment still pending at the end of the input is emitted as
  a final paragraph instead of being lost.
  """
  paragraphs = []
  prev = ''  # unfinished fragment waiting to be joined with the next paragraph
  for par in raw_paragraphs:
    # Exclude short stand-alone paragraphs and footnote / editorial material
    if (len(par) < 200 and not prev) or "Footnote" in par or 'NOTE' in par or 'Nietzsche' in par:
      continue

    # Remove non-alpha characters at the beginning of the paragraph
    # (left unchanged when the paragraph has no letter at all)
    start = next((i for i, c in enumerate(par) if c.isalpha()), 0)
    par = par[start:]

    par = par.replace('—', ' ').replace('-', ' ')
    par = par.replace('”', '').replace("’", '')
    # Remove text between square brackets or parentheses (raw string, so the
    # backslashes reach the regex engine without invalid-escape warnings)
    par = re.sub(r"[\(\[].*?[\)\]]", "", par)
    par = re.sub('[_\'{}()…="]', '', par)

    par = (prev + ' ' + par).strip()
    if not par:
      # Cleaning removed everything (this can only happen with no pending
      # fragment); indexing par[-1] below would raise IndexError.
      continue

    if par[-1] in (':', ';', ',') or par[-1].isalpha():
      prev = par  # unfinished: carry over to the next paragraph
    else:
      paragraphs.append(par)
      prev = ''

  # Do not silently drop a trailing fragment that never found a continuation
  if prev:
    paragraphs.append(prev)
  return paragraphs

# Clean the raw paragraphs and merge unfinished fragments (see paragraphPreprocess)
paragraphs = paragraphPreprocess(raw_paragraphs)

# Create a Pandas DataFrame out of our list of paragraphs:
# one row per cleaned paragraph, in a single column named 'paragraph'
import pandas as pd
df = pd.DataFrame(paragraphs, columns =['paragraph'])


# ----------------------------------------------------
### Tokenization and further preprocessing with SpaCy

# Split text into a list of SENTENCES with SpaCy
def split_in_sentences(text):
    """Return the sentences of `text` as a list of whitespace-stripped strings.

    Sentence boundaries come from the `sents` of the document produced by the
    module-level spaCy pipeline `nlp`.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(str(span).strip())
    return sentences

# Further preprocessing with SpaCy
import string
def clean_text(text):
    '''
    Normalise a text for topic modelling.

    The text is lowercased, then three patterns are deleted in this order:
    text in square brackets, ASCII punctuation characters, and any word
    that contains a digit. Whitespace is left untouched.
    '''
    patterns = (
        r'\[.*?\]',
        r'[%s]' % re.escape(string.punctuation),
        r'\w*\d\w*',
    )
    cleaned = text.lower()
    for pattern in patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# One-column frame ('paragraph') holding the clean_text-normalised paragraphs
df_clean = df['paragraph'].apply(clean_text).to_frame()

def lemmatizer(text):
    """Return `text` with each spaCy token replaced by its lemma.

    The text is parsed with the module-level pipeline `nlp`; the lemmas of
    all tokens are joined with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))
    
# Lemmatize the cleaned paragraphs and store them next to the original text
df["paragraph_lemmatize"] = df_clean['paragraph'].apply(lemmatizer)
#df.head()

## Topic extraction: Non-negative Matrix Factorization

In [7]:
# Create a DOCUMENT TERM MATRIX
from sklearn.feature_extraction.text import TfidfVectorizer
# `stop_words` must be 'english', a list or None: recent scikit-learn releases
# validate the parameter type and reject a set, so the stop-word set is
# converted to a (sorted, hence reproducible) list.
tfidf = TfidfVectorizer(max_df=0.95, min_df=3, stop_words=sorted(STOP_WORDS))

dtm = tfidf.fit_transform(df['paragraph_lemmatize'])

# Create an instance of NMF with n_comp components
from sklearn.decomposition import NMF
n_comp = 8
nmf_model = NMF(n_components=n_comp,random_state=42)
nmf_model.fit(dtm)

# Print the most common words for each topic
# (the vocabulary is fetched once instead of once per printed word)
feature_names = tfidf.get_feature_names_out()
for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP WORDS FOR TOPIC #{index}')
    # argsort is ascending, so the last 20 indices are the 20 largest weights
    print([feature_names[i] for i in topic.argsort()[-20:]])
    print('\n')

# ASSIGN Topic to paragraphs and COUNT paragraphs/topic
# Each paragraph gets the topic with the highest NMF score.
topic_results = nmf_model.transform(dtm)
df['topic'] = topic_results.argmax(axis=1)

df.groupby(['topic']).size()

#df.head(10)
#df[df['topic'] == topic].head() # Filter df by topic



THE TOP WORDS FOR TOPIC #0
['general', 'state', 'stand', 'essential', 'divine', 'order', 'come', 'class', 'thing', 'second', 'way', 'etc', 'worth', 'specie', 'belief', 'high', 'morality', 'life', 'man', 'power']


THE TOP WORDS FOR TOPIC #1
['religion', 'grow', 'find', 'turn', 'high', 'increase', 'new', 'lose', 'live', 'vain', 'thing', 'morality', 'lead', 'hitherto', 'goal', 'valuation', 'moral', 'long', 'nihilism', 'value']


THE TOP WORDS FOR TOPIC #2
['freedom', 'ideal', 'appearance', 'belief', 'reality', 'god', 'understand', 'remain', 'false', 'truth', 'existence', 'corruption', 'reason', 'serve', 'mean', 'real', 'true', 'evil', 'purpose', 'world']


THE TOP WORDS FOR TOPIC #3
['create', 'culture', 'decadence', 'gloominess', 'weakness', 'theory', 'main', 'sign', 'attain', 'way', 'vice', 'ought', 'problem', 'right', 'speak', 'symptom', 'illness', 'people', 'question', 'pessimism']


THE TOP WORDS FOR TOPIC #4
['eye', 'spirit', 'war', 'progress', 'rich', 'thing', 'fact', 'time', 'vic



topic
0    21
1    29
2    14
3    10
4    25
5    45
6    21
7    15
dtype: int64

## Text to Paragraph

In [8]:
def textToParagraph(text):
  """
  Input: text - a string of text
  Output: Full Paragraph from the DataFrame that best matches with input text

  Description:
  text -> predict Topic -> compute similarity ONLY with paragraphs in this Topic
       -> return paragraph with highest similarity score

  The input is cleaned and lemmatized like the corpus, then scored by the
  fitted `tfidf` + `nmf_model`; the topic with the highest score is selected.
  Similarity is spaCy's `Doc.similarity` between the processed input and the
  pre-computed `paragraph_lemmatize` column of `df` (the same text that
  lemmatizer(clean_text(paragraph)) produces). If no paragraph carries the
  predicted topic, all paragraphs are searched instead of failing.
  """

  # PREDICT Topic
  text = lemmatizer(clean_text(text))
  X = tfidf.transform([text]) # transform the TF-IDF
  nmf_features = nmf_model.transform(X) # get the nmf_features (score) vector
  topic = nmf_features.argmax()

  # Restrict the search to the predicted topic; fall back to the whole
  # corpus when that topic has no paragraph (idxmax would fail on no scores)
  candidates = df[df['topic'] == topic]
  if candidates.empty:
    candidates = df

  # Parse the query once; the original re-parsed it for every paragraph
  query_doc = nlp(text)
  scores = candidates['paragraph_lemmatize'].apply(
      lambda lemmas: query_doc.similarity(nlp(lemmas))
  )

  # Return most similar paragraph (label-based lookup of the best row)
  best_index = scores.idxmax()
  return df.loc[best_index, 'paragraph']

# Summarization

In [9]:
# Load summarization pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_1_name = "facebook/bart-large-cnn"
tokenizer_1 = AutoTokenizer.from_pretrained(model_1_name)
model_1 = AutoModelForSeq2SeqLM.from_pretrained(model_1_name)

# Pass the already-loaded model object: giving the pipeline the model *name*
# makes it load the checkpoint a second time while `model_1` sits unused.
summarizer = pipeline("summarization", model=model_1, tokenizer=tokenizer_1)

# TRUNCATE string to TWO SENTENCES
def firstTwoSentences(s: str):
  """
  Return the first two sentences of a string, ignoring an unfinished tail.

  Input: string s
  Returns: first two sentences in s, joined by a single space

  Everything after the last '.', '?' or '!' is dropped first, so a summary
  cut off mid-sentence does not contribute a partial sentence. A string
  without any of these characters is left untouched before splitting.
  """
  # Position of the LAST terminator of any kind (rfind returns -1 if absent).
  # The previous version took max() over positions in the reversed string,
  # which selects the earliest of the three candidates and cut too much.
  last_stop = max(s.rfind(mark) for mark in ".?!")

  if last_stop >= 0:
    s = s[: last_stop + 1]

  # Return the first two sentences
  return ' '.join(split_in_sentences(s)[:2])

# SUMMARY snippet
def summary(paragraph: str):
  """
  Summarize a paragraph and keep at most the first two sentences.

  Input: paragraph - a string
  Output: summary - a string. The summary of the paragraph.
  Parameters: min_length is 10% of the paragraph's token count (at least 1)
  and max_length is 5 * min_length, i.e. about 50% of the token count.
  """
  n_tokens = len(tokenizer_1(paragraph)['input_ids'])
  # Never let the bounds collapse to 0 for paragraphs shorter than 10 tokens
  min_length = max(1, n_tokens // 10)
  max_length = 5 * min_length
  # truncation=True: presumably cuts over-long paragraphs to the tokenizer's
  # maximum input length instead of failing inside the model - confirm
  result = summarizer(paragraph, min_length=min_length, max_length=max_length, truncation=True)
  return firstTwoSentences(result[0]['summary_text'])

Downloading:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [10]:
# Print text by sentences
def printBySentence(text: str):
  """Print `text` one sentence per line (sentences from split_in_sentences).

  Nothing is printed when the text contains no sentence. Returns None.
  """
  sentences = split_in_sentences(text)
  if sentences:
    print('\n'.join(sentences))
  return

# TEST SUMMARIZER with a few paragraphs
import random
for step in range(3):
  # randrange excludes its upper bound; randint(0, len(...)) is inclusive and
  # could pick len(...) itself, one past the last row.
  t = random.randrange(len(df['paragraph']))
  paragraph = df['paragraph'].iloc[t]  # positional lookup of the drawn row
  print("PARAGRAPH:")
  printBySentence(paragraph)

  print("\nSUMMARY:")
  printBySentence(summary(paragraph))
  print(100*'-')

PARAGRAPH:
Artificial modification of ones own nature in order to make it resemble a mirror; one is interested, but only epidermally: this is systematic coolness, equilibrium, a steady low temperature, just beneath the thin surface on which warmth, movement, storm, and undulations play.

SUMMARY:
A mirror is an artificial modification of ones own nature in order to make it resemble a mirror.
----------------------------------------------------------------------------------------------------
PARAGRAPH:
Artificial modification of ones own nature in order to make it resemble a mirror; one is interested, but only epidermally: this is systematic coolness, equilibrium, a steady low temperature, just beneath the thin surface on which warmth, movement, storm, and undulations play.

SUMMARY:
A mirror is an artificial modification of ones own nature in order to make it resemble a mirror.
----------------------------------------------------------------------------------------------------
PARAGRAP

# Text to (relevant) paragraph

In [11]:
# TEST: retrieve the best-matching paragraph for each question and print its summary

testQuestion = [
    'How much power should people have in european politics?',
    'Can religion save the evil in the world?',
    'What is the meaning of Love, to live in a more peaceful world?',
    'Is Nihilism an alternative to hope?',
]

for text in testQuestion:
  print(100*'-')
  print("Input: ", text, '\n') 
  paragraph = textToParagraph(text)
  printBySentence(summary(paragraph))

----------------------------------------------------------------------------------------------------
Input:  How much power should people have in european politics? 

Henrik Ibsen never dared to ring himself free from moral illusionism which says freedom, and will not admit, even to itself, what freedom is.
----------------------------------------------------------------------------------------------------
Input:  Can religion save the evil in the world? 

Kant seems to have needed the hypothesis of intelligible freedom, in order to relieve the ens perfectum of
----------------------------------------------------------------------------------------------------
Input:  What is the meaning of Love, to live in a more peaceful world? 

Nihilism will have to manifest itself as a psychological condition, first when we have sought in all that has happened a purpose which is not there.
Nihilism is therefore the coming into consciousness of the long waste of strength, the pain of futility, unce

# Question Generator

In [17]:
# Load the TOKENIZER
# Tokenizer of the base checkpoint named below; the model itself is loaded
# separately from local weights.
model_name = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as padding token
# (presumably because this tokenizer defines no pad token of its own - confirm)
tokenizer.pad_token = tokenizer.eos_token

In [23]:
# GENERATE QUESTION WITH FINE-TUNED MODEL

# Load the fine-tuned model from local (a file pytorch_model.bin must be in the current directory)
# NOTE(review): the earlier `!ls` output lists only nietzsche.txt and sample_data,
# so the weights (and presumably a config.json) must be copied into the working
# directory before this cell runs - confirm where they come from.
model = AutoModelForCausalLM.from_pretrained("./")

# Snippet to TRUNCATE strings to QUESTION mark
def questionTruncate(s: str):
  """
  Trim generated text down to a clean question or statement.

  Input: string s
  Returns: s cut right after its FIRST "?" if it has one; otherwise cut right
  after its LAST "."; otherwise s unchanged.
  """
  first_question_mark = s.find("?")
  if first_question_mark != -1:
    return s[: first_question_mark + 1]

  last_full_stop = s.rfind(".")
  if last_full_stop != -1:
    return s[: last_full_stop + 1]

  return s

# Test
#for s in ['Multiple? more than one?', 'First? Then no question.', 'No punctuation', 'No question.']:
#  print(questionTruncate(s))


# QUESTION GENERATOR

def questionGenerator(text: str):
  """
  Generate a continuation of `text` with the causal language model `model`.

  Input: text - the prompt; the tokenizer's end-of-sequence token is appended
         to it before encoding
  Returns: the newly generated text only (prompt tokens removed, special
           tokens skipped). Generation is sampled, so the result differs
           from call to call.
  """
  # ENCODE input and add 'end-of-string' token
  encoded = tokenizer(text + tokenizer.eos_token, return_tensors="pt")
  input_ids = encoded["input_ids"]
  # Number of prompt tokens. len(input_ids) is the batch size (1), not the
  # sequence length, so it must not be used to size the generation budget.
  prompt_length = input_ids.shape[-1]

  # GENERATE (up to 80 tokens after the prompt)
  chat_history_ids = model.generate(
      input_ids,
      # explicit mask: pad and eos share one id here, so it cannot be inferred
      attention_mask=encoded["attention_mask"],
      max_length=prompt_length + 80,
      do_sample=True,
      top_p=0.91,
      top_k=10,
      temperature=0.75,
      # early_stopping was dropped: it only applies to beam search
      pad_token_id=tokenizer.eos_token_id
  )

  # DECODE to string, keeping only the tokens generated after the prompt
  output = tokenizer.decode(chat_history_ids[:, prompt_length:][0], skip_special_tokens=True)
  return output

In [24]:
# Mount Google Drive at /content/drive (requires the google.colab package).
# NOTE(review): this cell ran after the model was loaded from "./", so it is
# unclear from here whether the mount is needed for the weights - confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
# Test: feed a few prompts to the question generator and print each raw reply
# (no truncation applied), followed by a separator line of 100 dashes.
for text in ['What does Nihilism mean?', 'What is Pessimism', 'What is the purpose?', 'Prejudice against science?']:
  print(f"Text: {text} \n\nBot: {questionGenerator(text)} \n", 100*'-')

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Text: What does Nihilism mean? 

Bot: What does the term for the word for the highest value of a philosopher and philosopher?In the modern context, what does the word for a philosopher and philosopher mean?The word for the highest value of a philosopher.In the modern context, what is the meaning of the word for the highest value of a philosopher?The term for the highest value of a philosopher is 
 ----------------------------------------------------------------------------------------------------


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Text: What is Pessimism 

Bot: I am the old woman who has just returned home from a long journey to the moon. I am a little disappointed that my friends and I have to do this for you: why?...What?What?Because, I mean, is it a question of morality?...But what is the meaning of that term?Because the meaning of this question is to make one wish 
 ----------------------------------------------------------------------------------------------------


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Text: What is the purpose? 

Bot: The question: What is the meaning of the word, and what is the meaning of the word, for the sake of the word?The question: What is the meaning of the word?The question: What is the meaning of the word?The word: What is the meaning of the word?The question: What is the meaning of the word?The meaning of the 
 ----------------------------------------------------------------------------------------------------
Text: Prejudice against science? 

Bot: The following is a guest post by Richard Dawkins, a philosopher, who is now a philosopher and philosopher. I am sure that you will have noticed that the most important point of view is that of the Christian, the Christian, the Christian is not Christian. Why?Because, in this respect, we are talking about a man. What is the meaning of this word? 
 ----------------------------------------------------------------------------------------------------


# PhilosopherBot

In [27]:
"""
MAIN LOGIC IS AS FOLLOWS:
input -> topic -> relevant paragraph -> summary of that paragraph
-> use last sentence of the summary to generate a question
-> print(summary, question)
"""

def bot(text):
  """
  Answer `text` with a summarized passage followed by a generated question.

  Input: text - the user's input string
  Returns: None; the reply is printed.

  Steps: find the best-matching paragraph (textToParagraph), summarize it
  (summary) and print the summary; then generate text from the summary's
  last sentence (questionGenerator), trim it (questionTruncate) and print it
  one sentence per line. If the summary contains no sentence, the user's
  input is used as the seed for the question instead.
  """
  paragraph = textToParagraph(text)
  summarized_paragraph = summary(paragraph)

  print(summarized_paragraph)

  # An empty summary has no last sentence; indexing [-1] would raise
  sentences = split_in_sentences(summarized_paragraph)
  seed = sentences[-1] if sentences else text

  question = questionTruncate(questionGenerator(seed))
  for sent in split_in_sentences(question):
    print(sent)
  return

In [28]:
# Build a chain of 5 inputs: the seed sentence, then each time the raw text
# the question generator produced from the previous input.
testQuestion = []
text = 'Tell me about God and the meaning of Life'
for step in range(5):
  testQuestion.append(text)
  text = questionGenerator(text)  # sampled output becomes the next input

# Run the full bot on every input of the chain
for text in testQuestion:
  print("Input text:\n" + text)
  print("Bot: \n")
  bot(text)
  print(100 * '-')

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Input text:
Tell me about God and the meaning of Life
Bot: 



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Sensitiveness is infinitely more acute, the abundance of different impressions is greater than ever. The cosmopolitanism of articles of diet, of literature, newspapers, forms, tastes, and even landscapes.
What is the meaning of the word, the word for the most dangerous and dangerous word?
----------------------------------------------------------------------------------------------------
Input text:
The first thing I want to do is to give you an idea of the value of a story. Why should I care if I am not a philosopher?...I mean, the term is the word for a philosopher, and it is not a word for an artist. The word for a philosopher is, however, a word for the philosopher.The word
Bot: 



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


The courage of all one really knows comes but late in life. The energy and thoroughness with which I marched forward as a Nihilist deceived me concerning this fundamental principle.
The idea that there is a God-worship is a paradox: what is it that makes the world of the world?
----------------------------------------------------------------------------------------------------
Input text:
A new book is coming out of the book
Bot: 



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


The gregarious instinct, then,  now a sovereign power, is something totally different from the instinct of an aristocratic society.
We are now ready to give you a new look at the history of the German Empire: what did the German Empire do to the German people?
----------------------------------------------------------------------------------------------------
Input text:
Anxiety, a feeling that, as we know it, is not always a good time for me to give you a short explanation of the meaning of a word for it. What is the meaning of this expression, the meaning of which is a German word?It is a very personal question, as well as a question of meaning in German. To be
Bot: 



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


European Pessimism is still in its infancy, says the writer. It has not yet attained to that prodigious and yearning fixity of sight.
In this case, the German term is German: the word for the German word for the German word for a man and woman, the German word for the German word for a woman and man.
----------------------------------------------------------------------------------------------------
Input text:
This is the first time I have ever heard
Bot: 



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


We despise ourselves only because we are unable at every moment of our lives to quell that absurd emotion which is called Idealism.
What is the significance of this word?
----------------------------------------------------------------------------------------------------


In [29]:
# Run the full bot on a fixed set of questions
testQuestion = [
    'How much power should people have in european politics?',
    'Can religion save the evil in the world?',
    'What is the meaning of Love, to live in a more peaceful world?',
    'Is Nihilism an alternative to hope?',
    'If children grow up in this World, how can we teach them how to live more sustainably?',
]

for text in testQuestion:
  print(100 * '-')
  print(text + '\n')
  bot(text)


----------------------------------------------------------------------------------------------------
How much power should people have in european politics?



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Henrik Ibsen never dared to ring himself free from moral illusionism which says freedom, and will not admit, even to itself, what freedom is.
The question, why should I care?
----------------------------------------------------------------------------------------------------
Can religion save the evil in the world?



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Kant seems to have needed the hypothesis of intelligible freedom, in order to relieve the ens perfectum of
We have a long time to go back to the days when the world was full of evil, evil and evil, but I think it is time for the world to go back to its old days.
But what is it that makes it so beautiful?
----------------------------------------------------------------------------------------------------
What is the meaning of Love, to live in a more peaceful world?



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Nihilism will have to manifest itself as a psychological condition, first when we have sought in all that has happened a purpose which is not there. Nihilism is therefore the coming into consciousness of the long waste of strength, the pain of futility, uncertainty.
The question: Why are you so passionate about the idea of a new world?
----------------------------------------------------------------------------------------------------
Is Nihilism an alternative to hope?



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


The Nihilist supposes that the sight of such a desolate, useless Being is unsatisfying to the philosopher, and fills him with desolation and despair. This aspect of the case is opposed to our subtle sensibilities as a philosopher.
This is the first time that the world has been transformed into a new world of mythological and mythological religions.
What is the meaning of the word myth?
----------------------------------------------------------------------------------------------------
If children grow up in this World, how can we teach them how to live more sustainably?



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


We must ask ourselves whence we derived our belief in these three categories. Let us see if it is possible to refuse to believe in them.
The first question is: What is the meaning of the word, and how can we know that the meaning of the word itself is in the same order as the meaning of the word itself?
