# Crime_and_Punishment_summarization



In [42]:
# installation required
# ! pip install langchain
# ! pip install unstructured
# ! pip install pdf2image
# ! pip install pdfminer
# ! pip install pdfminer.six
# ! pip install markdown
# ! pip install openai
# ! pip install reportlab
# !pip install pypdf

# OpenAI API Setup

In [32]:
import configparser
import openai

# Create a ConfigParser object
config = configparser.ConfigParser()

# Read the config.ini file
config.read('config.ini')

my_openai_key = config['OPENAI']['OPENAI_API_KEY']
openai.api_key = my_openai_key
os.environ['OPENAI_API_KEY'] = my_openai_key

# Data Loading

In [43]:
from langchain.document_loaders import PyPDFLoader
import re
import os
import glob
import markdown
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet

In [4]:
loader = PyPDFLoader("/content/crime-and-punishment.pdf")
data = loader.load_and_split()

In [5]:
data[2]

Document(page_content='\x18 Free eBooks at Planet eBook.com(that ‘stern and just man,’ as Maurice Baring calls him) this \nwas enough, and he was condemned to death. After eight \nmonths’ imprisonment he was with twenty-one others tak -\nen out to the Semyonovsky Square to be shot. Writing to his \nbrother Mihail, Dostoevsky says: ‘They snapped words over \nour heads, and they made us put on the white shirts worn by \npersons condemned to death. Thereupon we were bound in \nthrees to stakes, to suffer execution. Being the third in the \nrow, I concluded I had only a few minutes of life before me. \nI thought of you and your dear ones and I contrived to kiss \nPlestcheiev and Dourov, who were next to me, and to bid \nthem farewell. Suddenly the troops beat a tattoo, we were \nunbound, brought back upon the scaffold, and informed \nthat his Majesty had spared us our lives.’ The sentence was \ncommuted to hard labour.\nOne of the prisoners, Grigoryev, went mad as soon as he \nwas untied, 

In [6]:
data[0].dict().keys()

dict_keys(['page_content', 'metadata', 'type'])

In [7]:
data[1].dict()["page_content"][:1000]

'Crime and Punishment \x18Translator’s Preface\nA few words about Dostoevsky himself may help the Eng -\nlish reader to understand his work.\nDostoevsky was the son of a doctor. His parents were \nvery hard- working and deeply religious people, but so poor \nthat they lived with their five children in only two rooms. \nThe father and mother spent their evenings in reading aloud \nto their children, generally from books of a serious charac -\nter.\nThough always sickly and delicate Dostoevsky came out \nthird in the final examination of the Petersburg school of \nEngineering. There he had already begun his first work, \n‘Poor Folk.’\nThis story was published by the poet Nekrassov in his \nreview and was received with acclamations. The shy, un -\nknown youth found himself instantly something of a \ncelebrity. A brilliant and successful career seemed to open \nbefore him, but those hopes were soon dashed. In 1849 he \nwas arrested.\nThough neither by temperament nor conviction a revolu -\

In [8]:
def get_full_text_from_documents(documents):
    full_text = ''

    for document in documents:
        # Extract the 'page_content' from each Document
        page_text = document.dict()["page_content"]
        full_text += page_text + '\n\n'  # Add a newline for separation between pages

    return full_text

In [9]:
full_data= get_full_text_from_documents(data)
print("Total characters: {}".format(len(full_data)))

Total characters: 1173038


# Data Preprocessing

In [11]:
# Function to clean and preprocess text
def preprocess_text(text):
    # Define patterns to be removed and their replacements
    patterns_to_remove = [
        (r'\n\d+\x18\x18\n', ''),
        (r'\nFree eBooks at Planet eBook.com\n', ''),
        (r'\nCrime and Punishment\n', ''),
        (r'\n\x18\x18\x18\n', ''),
        (r'\n\x18', ''),
        (r'\n\x180', ''),
        (r'\n\x181', ''),
        (r'\n\x18\x18', ''),
        (r'\n\x18\n', ' ')
    ]

    # Apply each pattern removal
    for pattern, replacement in patterns_to_remove:
        text = re.sub(pattern, replacement, text)

    # Removing extra line breaks
    text = text.replace('\n\n', '\n')

    return text

In [12]:
def detect_part_boundaries(text):
    # Manually identified markers for the start of each part
    part_markers = {
        "Translator’s Preface": "A few words about Dostoevsky",
        "Part I": "On an exceptionally hot evening",
        "Part II": "So he lay a very long while",
        "Part III": "Raskolnikov got up, and sat down on the sofa",
        "Part IV": "Can this be still a dream?",
        "Part V": "The morning that followed the fateful",
        "Part VI": "A strange period began for Raskolnikov",
        # Add markers for all parts
    }

    parts = []
    for part, start_marker in part_markers.items():
        start_index = text.find(start_marker)
        end_index = text.find(next(iter(part_markers.values()), ""), start_index + 1)
        if end_index == -1:  # If it's the last part
            end_index = len(text)

        part_text = text[start_index:end_index].strip()
        parts.append((part, part_text))

    return parts

In [13]:
def detect_chapter_boundaries(part_text):
    # Adjust the regex pattern to match the chapter headings in your text
    chapter_pattern = r'\bChapter\s+[IVXLCDM]+\b'

    # Split the part text into chapters using the chapter pattern
    potential_chapters = re.split(chapter_pattern, part_text)

    # Filter out any empty or irrelevant splits and remove any leading or trailing whitespace
    chapters = [chapter.strip() for chapter in potential_chapters if chapter.strip()]

    return chapters




In [14]:
# Preprocess the text
processed_text = preprocess_text(full_data)

In [15]:
# Detect parts
parts = detect_part_boundaries(processed_text)

In [None]:
# parts[0]

In [16]:
# Function to find a specific part by title
def get_part_content(parts, part_title):
    for title, content in parts:
        if title == part_title:
            return content
    return "Part not found."



# Print a snippet of Part IV content
print( get_part_content(parts, "Part III")[:1000])

Raskolnikov got up, and sat down on the sofa. He waved 
his hand weakly to Razumihin to cut short the flow of 
warm and incoherent consolations he was addressing to his 
mother and sister, took them both by the hand and for a 
minute or two gazed from one to the other without speak -
ing. His mother was alarmed by his expression. It revealed 
an emotion agonisingly poignant, and at the same time 
something immovable, almost insane. Pulcheria Alexan -
drovna began to cry.
Avdotya Romanovna was pale; her hand trembled in her 
brother’s.
‘Go home … with him,’ he said in a broken voice, point -
ing to Razumihin, ‘good-bye till to-morrow; to-morrow 
everything … Is it long since you arrived?’
‘This evening, Rodya,’ answered Pulcheria Alexandrov -
na, ‘the train was awfully late. But, Rodya, nothing would 
induce me to leave you now! I will spend the night here, 
near you …’
‘Don’t torture me!’ he said with a gesture of irritation.
‘I will stay with him,’ cried Razumihin, ‘I won’t leave him 

In [17]:
def extract_chapters_from_part(part_text):
    chapters = detect_chapter_boundaries(part_text)
    return chapters

In [18]:
part_i_content = get_part_content(parts, "Part III")

# Extract chapters from Part I
chapters_in_part_i = extract_chapters_from_part(part_i_content)

# Print each chapter's beginning for verification
for i, chapter in enumerate(chapters_in_part_i, start=1):
    print(f"Chapter {i} beginning snippet:\n{chapter[:200]}\n")

Chapter 1 beginning snippet:
Raskolnikov got up, and sat down on the sofa. He waved 
his hand weakly to Razumihin to cut short the flow of 
warm and incoherent consolations he was addressing to his 
mother and sister, took them b

Chapter 2 beginning snippet:
‘He is well, quite well!’ Zossimov cried cheerfully as 
they entered.
He had come in ten minutes earlier and was sitting in the 
same place as before, on the sofa. Raskolnikov was sitting in 
the oppo

Chapter 3 beginning snippet:
At that moment the door was softly opened, and a young 
girl walked into the room, looking timidly about her. 
Everyone turned towards her with surprise and curiosity. 
At first sight, Raskolnikov did

Chapter 4 beginning snippet:
It was nearly eight o’clock. The two young men hurried to 
Bakaleyev’s, to arrive before Luzhin.
‘Why, who was that?’ asked Razumihin, as soon as they 
were in the street.
‘It was Svidrigaïlov, that l

Chapter 5 beginning snippet:
The fact was that up to the last moment he had 

In [19]:
# Assuming you have a list 'parts' containing tuples of (part_title, part_text)
for part_title, part_text in parts:
    print(f"Processing {part_title}")

    chapters = detect_chapter_boundaries(part_text)
    for i, chapter in enumerate(chapters, start=1):
        print(f"Chapter {i} beginning snippet in {part_title}:\n{chapter[:200]}\n")


Processing Translator’s Preface
Chapter 1 beginning snippet in Translator’s Preface:
A few words about Dostoevsky himself may help the Eng -
lish reader to understand his work.
Dostoevsky was the son of a doctor. His parents were 
very hard- working and deeply religious people, but so

Chapter 2 beginning snippet in Translator’s Preface:
He waked up late next day after a broken sleep. But his 
sleep had not refreshed him; he waked up bilious, ir -
ritable, ill-tempered, and looked with hatred at his room. 
It was a tiny cupboard of a 

Chapter 3 beginning snippet in Translator’s Preface:
Later on Raskolnikov happened to find out why the huck -
ster and his wife had invited Lizaveta. It was a very 
ordinary matter and there was nothing exceptional about 
it. A family who had come to th

Chapter 4 beginning snippet in Translator’s Preface:
The door was as before opened a tiny crack, and again 
two sharp and suspicious eyes stared at him out of the 
darkness. Then Raskolnikov lost his hea

# Formatting and Saving in correct format

In [21]:
def create_markdown_file(parts):
    for part_num, (part_heading, part_text) in enumerate(parts):
        # Special handling for the Translator’s Preface
        if 'Translator’s Preface' in part_heading:
            part_filename = f'translator_preface.md'
            markdown_heading = f"# {part_heading}\n\n"
        else:
            # Extract the part number from the part heading (e.g., "Part I" => "I")
            part_match = re.search(r'Part ([IVXLCDM]+)\n', part_heading)
            part_number = part_match.group(1) if part_match else str(part_num)
            part_filename = f'part_{part_number}.md'
            markdown_heading = f"# Part {part_number}\n\n"

        # Create a Markdown file for the part
        with open(part_filename, 'w', encoding='utf-8') as part_file:
            part_file.write(markdown_heading)

            # Detect chapter boundaries within the part
            chapters = detect_chapter_boundaries(part_text)

            for i, chapter_text in enumerate(chapters, start=1):
                # Create a Markdown heading for each chapter within the part
                chapter_heading = f"## Chapter {i}\n\n"
                part_file.write(chapter_heading)
                part_file.write(chapter_text)
                part_file.write('\n\n')  # Add an extra newline for separation between chapters

# Example usage
create_markdown_file(parts)


# Splitting

In [22]:
markdown_directory = os.getcwd()

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)


splits = []

for markdown_file in glob.glob(os.path.join(markdown_directory, "*.md")):
    # Read the Markdown file
    print(markdown_file.split('\\')[-1].split('.')[0].replace("_", ' '))
    with open(markdown_file, 'r', encoding='utf-8') as file:
        markdown_text = file.read()
    md_header_splits = markdown_splitter.split_text(markdown_text)

    for split in md_header_splits:
        splits.append(split)

/content/part 2
/content/translator preface
/content/part 4
/content/part 3
/content/part 1
/content/part 5
/content/part 6


In [23]:
print("Total Documents: {}".format(len(splits)))

Total Documents: 57


In [None]:
splits[5]

Document(page_content='‘I don’t believe it, I can’t believe it!’ repeated Razumihin,\ntrying in perplexity to refute Raskolnikov’s arguments.\nThey were by now approaching Bakaleyev’s lodgings,\nwhere Pulcheria Alexandrovna and Dounia had been ex -\npecting them a long while. Razumihin kept stopping on the\nway in the heat of discussion, confused and excited by the\nvery fact that they were for the first time speaking openly\nabout it.\n‘Don’t believe it, then!’ answered Raskolnikov, with a\ncold, careless smile. ‘You were noticing nothing as usual,\nbut I was weighing every word.’\n‘You are suspicious. That is why you weighed their words\n… h’m … certainly, I agree, Porfiry’s tone was rather strange,\nand still more that wretch Zametov! … You are right, there\nwas something about him—but why? Why?’\n‘He has changed his mind since last night.’\n‘Quite the contrary! If they had that brainless idea, they\nwould do their utmost to hide it, and conceal their cards, so\nas to catch you afte

In [24]:
len(splits[1].dict()["page_content"])

75929

In [25]:
splits[0].dict()["metadata"]

{'Header 1': 'Part 2', 'Header 2': 'Chapter 1'}

# Prompt, LLM, Chain, Callback

In [33]:

# Define prompt
prompt_template = """Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

# Define LLM chain
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Define StuffDocumentsChain
stuff_chain = StuffDocumentsChain(
    llm_chain=llm_chain, document_variable_name="text"
)

In [37]:
import time

In [41]:
summarize_document = {}

def insert_summary(part_no, chapter_no, summary):
    if part_no not in summarize_document:
        summarize_document[part_no] = {}
    summarize_document[part_no][chapter_no] = summary

count = 1
cost = 0

for split_doc in splits:
    with get_openai_callback() as cb:
        print(count)
        print(split_doc.dict()["metadata"]['Header 1'])
        print(split_doc.dict()["metadata"]['Header 2'])
        partNo = split_doc.dict()["metadata"]['Header 1']
        chapterNo = split_doc.dict()["metadata"]['Header 2']
        summary = stuff_chain.run([split_doc])
        time.sleep(10)
        print(cb)
        cost += cb.total_cost

        insert_summary(partNo, chapterNo, summary)

    count += 1


In [None]:
with get_openai_callback() as cb:
        print(splits[-1].dict()["metadata"]['Header 1'])
        partNo = splits[-1].dict()["metadata"]['Header 1']
        summary = stuff_chain.run([splits[-1]])
        print(cb)
        cost += cb.total_cost

        insert_summary(partNo, " ", summary)

Part 5
Tokens Used: 14550
	Prompt Tokens: 14402
	Completion Tokens: 148
Successful Requests: 1
Total Cost (USD): $0.043798000000000004


In [None]:
print(f"Total Cost (USD): ${cost}")

Total Cost (USD): $0.9695350000000001


# Summarized Document (Dictionary)

In [None]:
summarize_document

{'Part I': {'Chapter 1': 'On a hot evening in July, a young man leaves his garret and walks towards a bridge, avoiding his landlady. He is in debt to her and afraid of meeting her. The young man is in an irritable and isolated state, and fears interacting with anyone. He is absorbed in himself and has given up on practical matters. He is going to attempt something, but is unsure and frightened. The heat and unpleasant surroundings exacerbate his already strained nerves. He enters a tavern and drinks beer, feeling a temporary relief from his worries. However, he senses that his newfound cheerfulness is not normal. The tavern is mostly empty, with a few drunk men and a suspicious-looking man.',
  'Chapter 2': "Raskolnikov, who usually avoids crowds, suddenly feels a desire to be around people. He goes to a tavern and notices a retired clerk who seems interested in talking to him. The clerk, Marmeladov, tells Raskolnikov about his struggles with poverty and his wife's mistreatment. Marmel

In [None]:
import json
# Specify the file path where you want to save the JSON data
json_file_path = "summarize_document.json"

# Convert the dictionary to JSON and save it to the file
with open(json_file_path, "w", encoding="utf-8") as json_file:
    json.dump(summarize_document, json_file, ensure_ascii=False, indent=4)

# Print a message indicating that the data has been saved
print(f"JSON data saved to {json_file_path}")

JSON data saved to summarize_document.json


In [None]:
# Reorder the dictionary by moving "Translator’s Preface" to the beginning
reordered_data = {
    "Translator’s Preface": summarize_document["Translator’s Preface"],
    **{key: summarize_document[key] for key in summarize_document if key != "Translator’s Preface"}
}

In [None]:
reordered_data

{'Translator’s Preface': {'Chapter 1': "The Translator's Preface provides a brief overview of Fyodor Dostoevsky's life and experiences. It mentions his upbringing in a poor but religious family, his early success as a writer, his arrest and near-execution, his time in prison and exile, his struggles with epilepsy, his financial difficulties, and his eventual recognition as a beloved and influential writer in Russia. The summary highlights Dostoevsky's ability to deeply understand and convey the human experience through his writing."},
 'Part 2': {'Chapter 1': 'Part II of the document discusses the availability of free eBooks on Planet eBook.com.',
  'Chapter 2': 'The protagonist, Raskolnikov, wakes up after a long period of unconsciousness and realizes that it is already morning. He is startled by the sound of drunken men outside and suddenly remembers everything that happened the previous night. He frantically searches his clothes for any evidence of his crime, but finds nothing excep

# PDF Generation of Summary

In [None]:

# Create a PDF document
pdf_file_path = "summarize_document.pdf"
doc = SimpleDocTemplate(pdf_file_path, pagesize=letter)

story = []

# Define padding values
padding_left = 50  # Adjust the left padding
padding_right = 50  # Adjust the right padding
padding_top = 50  # Adjust the top padding
padding_bottom = 50  # Adjust the bottom padding

# Define a style for the title
title_style = getSampleStyleSheet()["Title"]
title_style.fontName = "Helvetica-Bold"
title_style.alignment = 1  # Centered
title_style.fontSize = 42  # Increase the font size for the title
title_style.leading = 56  # Increase the line spacing (leading) for the title

# Define a style for the top-level section titles (Heading1)
styles = getSampleStyleSheet()
heading1_style = styles["Heading1"]
heading1_style.fontSize = 36  # Increase the font size for Heading1
heading1_style.leading = 48  # Increase the line spacing (leading) for the title

heading2_style = styles["Heading2"]
heading2_style.fontSize = 28  # Increase the font size for Heading1
heading2_style.leading = 36  # Increase the line spacing (leading) for the title

# Define a style for the paragraphs
style = styles["Normal"]
style.fontSize = 14  # Increase the font size for the paragraphs
style.leading = 20  # Increase the line spacing (leading) for the title

# Add the title to the PDF
title_text = "Summary: Crime and Punishment"
title = Paragraph(title_text, title_style)
story.append(title)

# Add spacing above the title
story.append(Spacer(1, padding_top))

# Iterate through the JSON data and add the "Translator’s Preface," parts, chapters, and summaries to the PDF
for section, chapters in reordered_data.items():
    # Add the section title to the PDF
    section_title = Paragraph(section, heading1_style)  # Use the Heading1 style for top-level titles
    story.append(section_title)

    for chapter, summary in chapters.items():
        # Skip the empty chapter title for the "Translator’s Preface"
        if chapter:
            # Add the chapter title to the PDF
            chapter_title = Paragraph(chapter, heading2_style)
            story.append(chapter_title)

        # Add the chapter summary to the PDF
        chapter_summary = Paragraph(summary, style)
        story.append(chapter_summary)

        # Add space between chapters
    story.append(Spacer(1, padding_bottom + 20))  # Add spacing below each chapter

# Build the PDF document
doc.build(story)

print(f"PDF created: {pdf_file_path}")


PDF created: summarize_document.pdf
