In [4]:
# ALL IMPORTS

# -----------------------------------
# 1
# SpaCy IMPORTS
# -----------------------------------
!pip install -U spacy

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
!python -m spacy download en_core_web_lg

# Check how many stopwords
#len(STOP_WORDS)

# Create a spaCy nlp object
nlp = spacy.load('en_core_web_lg')

# Check if a word is a stopword:
#nlp.vocab['thy'].is_stop

# Add custom stopwords
new_stop_words = ['thy', 'ye', 'thee', 'thou', 'll', 've']
for w in new_stop_words :
  STOP_WORDS.add(w)

# -----------------------------------
# 2
# TRANSFORMERS IMPORTS
#!pip install transformers datasets
!pip install --no-cache-dir transformers datasets sentencepiece

from transformers import (pipeline, 
                          AutoModel, 
                          AutoTokenizer, 
                          AutoModelForSeq2SeqLM, 
                          AutoModelForCausalLM, 
                          DataCollatorForLanguageModeling, 
                          TrainingArguments, 
                          Trainer,
)

from datasets import load_dataset

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2022-11-16 11:25:10.738536: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[K     |████████████████████████████████| 587.7 MB 16 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import os
!ls

nietzsche.txt  sample_data


# Topic Extractor + Summarization

## Load data / text preprocessing

In [6]:
# -----------------------------------
# 3
# Load list of paragraphs
import os

def txt2paragraph(filepath):
    with open(filepath) as f:
        lines = f.readlines()

    paragraph = ''
    for line in lines:
        if line.isspace():  # is it an empty line?
            if paragraph:
                yield paragraph
                paragraph = ''
            else:
                continue
        else:
            paragraph += ' ' + line.strip()
    yield paragraph

# -----------------------------------
# 4

# Build the list of raw paragraphs
raw_paragraphs = []
for par in txt2paragraph('nietzsche.txt'):
  par = par.strip()
  raw_paragraphs.append(par)

# -----------------------------------
# 5
# Basic preprocessing
import re
def paragraphPreprocess(raw_paragraphs: list):
  paragraphs = []
  prev = ''
  for par in raw_paragraphs:
    # First we exclude short paragraphs and Footnotes
    if (len(par) < 200 and not prev) or "Footnote" in par or 'NOTE' in par or 'Nietzsche' in par: 
      continue 
    # Next remove non-alpha characters at the beginning of each paragraph
    else:
      for c in par:
        if c.isalpha():
          i = par.find(c)
          par = par[i:]
          break
      par = re.sub('[—]', ' ', par)
      par = par.replace('-', ' ')
      par = par.replace('”', '')
      par = par.replace("’", '')
      # Remove text between square brackets: 
      # "[\(\[].*?[\)\]]"  is a REGEX for finding
      # the pattern for brackets containing some content
      par = re.sub("[\(\[].*?[\)\]]","", par)
      par = re.sub('[_\'{}()…="]', '', par)
      par = prev + ' ' + par
      par = par.strip()
      if par[-1] in [':', ';', ','] or par[-1].isalpha():
        prev = par
        continue
      else:
        paragraphs.append(par)
        prev = ''
  return paragraphs

paragraphs = paragraphPreprocess(raw_paragraphs)

# -----------------------------------
# 6

# Create a Pandas DataFrame out of our list of paragraphs
import pandas as pd
df = pd.DataFrame(paragraphs, columns =['paragraph'])


# ----------------------------------------------------
# 7
### Tokenization and further preprocessing with SpaCy

# Split text into a list of SENTENCES with SpaCy
def split_in_sentences(text):
    doc = nlp(text)
    return [str(sent).strip() for sent in doc.sents]

# -----------------------------------
# 8

# Further preprocessing with SpaCy
import string
def clean_text(text):
    '''
    Make text lowercase, remove text in square brackets, 
    remove punctuation and remove words containing numbers.
    '''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

df_clean = pd.DataFrame(df.paragraph.apply(lambda x: clean_text(x)))

# -----------------------------------
# 9

def lemmatizer(text):        
    sent = []
    doc = nlp(text)
    for word in doc:
        sent.append(word.lemma_)
    return " ".join(sent)
    
df["paragraph_lemmatize"] =  df_clean.apply(lambda x: lemmatizer(x['paragraph']), axis=1)
#df.head()

## Topic extraction: Non-negative Matrix Factorization

In [7]:
# -----------------------------------
# 10
# Create a DOCUMENT TERM MATRIX
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df=0.95, min_df=3, stop_words=STOP_WORDS)

dtm = tfidf.fit_transform(df['paragraph_lemmatize'])


# -----------------------------------
# 11
# Create an instance of NMF with n_comp components
from sklearn.decomposition import NMF
n_comp = 8
nmf_model = NMF(n_components=n_comp,random_state=42)
nmf_model.fit(dtm)

# Print the most common words for each topic
for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names_out()[i] for i in topic.argsort()[-20:]])
    print('\n')

# -----------------------------------
# 12


# ASSIGN Topic to paragraphs and COUNT paragraphs/topic

topic_results = nmf_model.transform(dtm)
df['topic'] = topic_results.argmax(axis=1)

df.groupby(['topic']).size()

#df.head(10)
#df[df['topic'] == topic].head() # Filter df by topic



THE TOP WORDS FOR TOPIC #0
['pain', 'animal', 'high', 'time', 'feel', 'long', 'cause', 'woman', 'power', 'nature', 'self', 'soul', 'strong', 'bad', 'thing', 'know', 'great', 'virtue', 'good', 'man']


THE TOP WORDS FOR TOPIC #1
['eye', 'silent', 'stand', 'night', 'cry', 'look', 'long', 'laugh', 'speak', 'day', 'mountain', 'cave', 'unto', 'hath', 'like', 'hear', 'heart', 'come', 'spake', 'zarathustra']


THE TOP WORDS FOR TOPIC #2
['word', 'day', 'culture', 'europe', 'wagners', 'artist', 'great', 'book', 'like', 'let', 'germany', 'people', 'spirit', 'art', 'taste', 'musician', 'germans', 'music', 'german', 'wagner']


THE TOP WORDS FOR TOPIC #3
['appearance', 'form', 'hero', 'æsthetic', 'euripide', 'artistic', 'picture', 'dream', 'nature', 'phenomenon', 'chorus', 'myth', 'tragic', 'greek', 'world', 'music', 'apollonian', 'tragedy', 'art', 'dionysian']


THE TOP WORDS FOR TOPIC #4
['jewish', 'punishment', 'faith', 'gospel', 'act', 'hate', 'child', 'concept', 'evil', 'thing', 'holy', 'sha

topic
0    604
1    316
2    215
3    209
4    198
5    354
6    112
7    400
dtype: int64

## Text to Paragraph

In [8]:
# -----------------------------------
# 13
def textToParagraph(text):
  """
  Input: text - a string of text
  Output: Full Paragraph from the DataFrame that best matches with input text

  Description: 
  text -> predict Topic -> compute similarity ONLY with paragraphs in this Topic
       -> return paragraph with highest similarity score
  """

  # PREDICT Topic
  text = lemmatizer(clean_text(text))
  X = tfidf.transform([text]) # transform the TF-IDF
  nmf_features = nmf_model.transform(X) # get the nmf_features (score) vector
  topic = nmf_features.argmax()


  # Compute SIMILARITY with paragraphs in this Topic

  # Similarity function: (text, paragraph)
  def sim(text, par):
    # Clean text, remove stopwords and tokenize
    doc = nlp(lemmatizer(clean_text(par)))
    return nlp(text).similarity(doc)

  # Get series of similarity scores on the DataFrame sliced by topic
  scores = df[df['topic'] == topic].apply(lambda x: sim(text, x['paragraph']), axis=1)

  # Return most similar paragraph
  id = scores.idxmax() # Get id of the max score
  return df['paragraph'][id]

# Summarization

In [9]:
# -----------------------------------
# 14
# Load summarization pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_1_name = "facebook/bart-large-cnn"
tokenizer_1 = AutoTokenizer.from_pretrained(model_1_name)
model_1 = AutoModelForSeq2SeqLM.from_pretrained(model_1_name)
  
summarizer = pipeline("summarization", model=model_1_name, tokenizer=tokenizer_1)

# -----------------------------------
# 15

# TRUNCATE string to TWO SENTENCES
def firstTwoSentences(s: str):
  """
  Input: string s
  Returns: first two sentences in s
  """
  # Truncate string at the last '.' or '?' or '!'
  reverse = s[::-1]
  LastFullStop = reverse.find(".")
  LastQmark = reverse.find("?")
  LastXmark = reverse.find("!")
  lastStopSymbol = max(LastFullStop, LastQmark, LastXmark)

  if lastStopSymbol >= 0 :
    s = s[: len(s) - lastStopSymbol]

  # Return the first two sentences
  return ' '.join(split_in_sentences(s)[:2])

# -----------------------------------
# 16

# SUMMARY snippet
def summary(paragraph: str):
  """
  Input: paragraph - a string
  Output: summary - a string. The summary of the paragraph.
  Parameters: we set min_length to 10% of the paragraph's length and max_length to 40% respectively
  """
  min_length=len(tokenizer_1(paragraph)['input_ids']) // 10
  max_length= 5*min_length
  summary = summarizer(paragraph, min_length=min_length, max_length=max_length)[0]['summary_text']
  return firstTwoSentences(summary)

Downloading:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [10]:
# -----------------------------------
# 17
# Print text by sentences
def printBySentence(text: str):
  for sent in split_in_sentences(text):
    print(sent)
  return

# TEST SUMMARIZER with a few paragraphs
import random
for step in range(3):
  t = random.randint(0, len(df['paragraph']))
  paragraph = df['paragraph'][t]
  print("PARAGRAPH:")
  printBySentence(paragraph)

  print("\nSUMMARY:")
  printBySentence(summary(paragraph))
  print(100*'-')

PARAGRAPH:
I will now pass just one or two general remarks about my art of style.
To communicate a state an inner tension of pathos by means of signs, including the tempo of these signs,  that is the meaning of every style; and in view of the fact that the multiplicity of inner states in me is enormous, I am capable of many kinds of style  in short, the most multifarious art of style that any man has ever had at his disposal.
Any style is good which genuinely communicates an inner condition, which does not blunder over the signs, over the tempo of the signs, or over moods  all the laws of phrasing are the outcome of representing moods artistically.
Good style, in itself, is a piece of sheer foolery, mere idealism, like beauty in itself, for instance, or goodness in itself, or the thing in itself.
All this takes for granted, of course, that there exist ears that can hear, and such men as are capable and worthy of a like pathos, that those are not wanting unto whom one may communicate on

# Text to (relevant) paragraph

In [11]:
# -----------------------------------
# 18
# TEST

testQuestion = []
testQuestion.append('How much power should people have in european politics?')
testQuestion.append('Can religion save the evil in the world?')
testQuestion.append('What is the meaning of Love, to live in a more peaceful world?')
testQuestion.append('Is Nihilism an alternative to hope?')

for text in testQuestion:
  print(100*'-')
  print("Input: ", text, '\n') 
  paragraph = textToParagraph(text)
  printBySentence(summary(paragraph))

----------------------------------------------------------------------------------------------------
Input:  How much power should people have in european politics? 

Protestantism is something incomplete and complexly valuable for knowledge.
----------------------------------------------------------------------------------------------------
Input:  Can religion save the evil in the world? 

According to the teaching of history, the consciousness of owing debts to the deity by no means came to an end with the decay of the clan organisation of society.
The appearance of the Christian god, as the record god up to this time, has for that very reason brought equally into the world the record amount of guilt consciousness.
----------------------------------------------------------------------------------------------------
Input:  What is the meaning of Love, to live in a more peaceful world? 

What concerns me is the psychological type of the Saviour.
This type might be depicted in the Gosp

# Question Generator

In [12]:
# -----------------------------------
# 19
# Load the TOKENIZER
model_name = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [15]:
# -----------------------------------
# 20
# GENERATE QUESTION WITH FINE-TUNED MODEL

# Load the fine-tuned model from local (a file pytorch_model.bin must be in the current directory)
model = AutoModelForCausalLM.from_pretrained("./")

# Snippet to TRUNCATE strings to QUESTION mark
def questionTruncate(s: str):
  """
  String polisher for outputting clean questions.
  Input: string s
  Returns: string s truncated at the FIRST "?" char or at the LAST "." char
  """
  reverse = s[::-1]
  LastFullStop = reverse.find(".")
  FirstQuestMark = s.find("?")
  if FirstQuestMark >= 0 :
    return s[: FirstQuestMark +1 ]
  elif LastFullStop >= 0 :
    return s[: len(s) - LastFullStop]
  return s

# Test
#for s in ['Multiple? more than one?', 'First? Then no question.', 'No punctuation', 'No question.']:
#  print(questionTruncate(s))

# -----------------------------------
# 21
# QUESTION GENERATOR

def questionGenerator(text: str):
  """
  Input:
  Returns:
  """
  # ENCODE input and add 'end-of-string' token
  input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors="pt")
  l = len(input_ids)
  # GENERATE
  chat_history_ids = model.generate(
      input_ids,
      max_length=l+80,
      do_sample=True,
      top_p=0.91,
      top_k=10,
      temperature=0.75,
      early_stopping=True, #####
      pad_token_id=tokenizer.eos_token_id
  )

  # DECODE to string
  output = tokenizer.decode(chat_history_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
  return output

In [16]:
# -----------------------------------
# 22
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
# -----------------------------------
# 23
# Test
for text in ['What does Nihilism mean?', 'What is Pessimism', 'What is the purpose?', 'Prejudice against science?']:
  print(f"Text: {text} \n\nBot: {questionGenerator(text)} \n", 100*'-')

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Text: What does Nihilism mean? 

Bot: It is a matter of time for us to find out what is true of the term, what is the meaning of the word?The meaning of the word is a question that I am always asked: Why should we call ourselves a Christian?What is the meaning of the word?It is a question of necessity, of necessity, of self-sacrifice, of 
 ----------------------------------------------------------------------------------------------------


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Text: What is Pessimism 

Bot: The world is changing, and what is it that makes the world better?Why is it that the world is changing?Because, as a matter of fact, it is the most beautiful thing ever. But what is it?The question is why do we call it the art of the art of the art of the spectator. What is it that is most beautiful?A question 
 ----------------------------------------------------------------------------------------------------


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Text: What is the purpose? 

Bot: This is my first book on the philosophy of the philosopher, and why is it that he was born, the philosopher of the world?In the name of a good feeling, I mean, what?To understand why we should not ask ourselves the question: What does it mean to be a philosopher?In the name of the philosopher of the world, it is understood as a 
 ----------------------------------------------------------------------------------------------------
Text: Prejudice against science? 

Bot: In the first chapter of the book of the German Revolution, I have to say that the German Revolution of 1793-1793 is a period of great pain and a profound and profound psychological distress, a period of profound and profound suffering. To the extent that the German Revolution of 1793-1793 was a period of profound and profound suffering, the German Revolution of 
 ----------------------------------------------------------------------------------------------------


# PhilosopherBot

In [18]:
# -----------------------------------
# 24

"""
MAIN LOGIC IS AS FOLLOWS:
input -> topic -> relevant paragraph 
-> use last sentence of paragraph to generate question 
-> print(question, paragraph, summarization of previous two)
"""

def bot(text):
  paragraph = textToParagraph(text)
  summarized_paragraph = summary(paragraph)

  print(summarized_paragraph)

  lastSentence = split_in_sentences(summarized_paragraph)[-1]
  question = questionGenerator(lastSentence)
  question = questionTruncate(question)
  for sent in split_in_sentences(question):
    print(sent)
  return

In [19]:
# -----------------------------------
# 25

testQuestion = []
text = 'Tell me about God and the meaning of Life'
for step in range(5):
  testQuestion.append(text)
  text = questionGenerator(text)

for text in testQuestion:
  print("Input text:\n" + text)
  print("Bot: \n")
  bot(text)
  print(100 * '-')

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Input text:
Tell me about God and the meaning of Life
Bot: 



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


"I found that the good man was a form of self affirmation on the part of decadence"
What is the meaning of this word?
----------------------------------------------------------------------------------------------------
Input text:
It is the highest-sounding question of life: Who has the right to have children? What does it mean to be an artist? And what is the meaning of being a man?The question arises from the question: what is the meaning of being a man?What is the meaning of being a man?In order to understand what is the meaning of
Bot: 



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


"A race of masters is either paramount or else it goes to the dogs," writes
When I first asked the question, I had always wondered if the term, which means to say, the Greek word for the soul, is the word for the soul, or of course, the word for the soul?
----------------------------------------------------------------------------------------------------
Input text:
When I was a child I was a child
Bot: 



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


"How is it possible that I can feel so free, so happy?
The following is the most important question in the history of the Christian faith.
Why is it that I believe in the word Christian?
----------------------------------------------------------------------------------------------------
Input text:
What are the consequences of having to endure the temptation to have sex with your children?...The question is: what is the meaning of having sex with your children?...What is the meaning of having to endure the temptation to have sex with your children?...The question is: What is the meaning of having to endure the temptation to have sex with your
Bot: 



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


The utility of being agreed concerning superior values has attained in this respect to a sort of sanction. We observe that every care is taken to paralyse reflection and criticism in this department.
I love to be a philosopher, but why would we not believe it?
----------------------------------------------------------------------------------------------------
Input text:
A woman who has been abused for over a
Bot: 



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


A single joyless person is enough to make constant displeasure and a clouded heaven in a household.
The question of what happens when the world is full of new and unexpected problems?
----------------------------------------------------------------------------------------------------


In [20]:
# -----------------------------------
# 26

testQuestion = []
testQuestion.append('How much power should people have in european politics?')
testQuestion.append('Can religion save the evil in the world?')
testQuestion.append('What is the meaning of Love, to live in a more peaceful world?')
testQuestion.append('Is Nihilism an alternative to hope?')
testQuestion.append('If children grow up in this World, how can we teach them how to live more sustainably?')

for text in testQuestion:
  print(100 * '-')  
  print(text + '\n')
  bot(text)


----------------------------------------------------------------------------------------------------
How much power should people have in european politics?



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Protestantism is something incomplete and complexly valuable for knowledge.
A.
The word "foolish" is a term for a certain type of problem: Who is the cause of the problem?
----------------------------------------------------------------------------------------------------
Can religion save the evil in the world?



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


According to the teaching of history, the consciousness of owing debts to the deity by no means came to an end with the decay of the clan organisation of society. The appearance of the Christian god, as the record god up to this time, has for that very reason brought equally into the world the record amount of guilt consciousness.
What is the meaning of a philosopher who makes a profound and profound mistake?
----------------------------------------------------------------------------------------------------
What is the meaning of Love, to live in a more peaceful world?



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


What concerns me is the psychological type of the Saviour. This type might be depicted in the Gospels, in however mutilated a form and however much overladen with extraneous characters.
The first question is: Who is it that is most important to me?
----------------------------------------------------------------------------------------------------
Is Nihilism an alternative to hope?



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Pessimism is not a problem but a symptom,  that the term ought to be replaced by Nihilism.
It was just a matter of time before I had to write about it.
I had to put it here, but I still have to wait for it to be put to the test.
Why is it that the world is changing?
----------------------------------------------------------------------------------------------------
If children grow up in this World, how can we teach them how to live more sustainably?



A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Stoicism is self-tyranny, says the author. The Stoics want to dictate their morals and ideals to Nature, to Nature herself, and to incorporate them therein.
In the case of the modern day, I am not sure who is the most important person to me, but you might ask yourself this question: what is it that is called the Greek word for the good?
