In [4]:
import pandas as pd 
import numpy as np 


In [6]:
txt_file = pd.read_csv('/home/vishnu/IQLearn/Thermodynamics.txt', header=None, delim_whitespace=True)
#code for read .txt file


In [7]:
txt_file

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,152,153,154,155,156,157,158,159,160,161
0,Thermodynamics:,Fundamental,concepts,and,definitions,",",various,systems,",",intensive,...,property,changes,on,mixing,",",heat,effects,of,mixing,processes.
1,##,ðŸ”¸,A.,Fundamental,Concepts,of,Thermodynamics,,,,...,,,,,,,,,,
2,1.,Thermodynamics,is,the,science,that,deals,with,,,...,,,,,,,,,,
3,A.,Motion,of,fluids,,,,,,,...,,,,,,,,,,
4,B.,"Energy,","heat,","work,",and,their,interconversion,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1245,A.,Fluid,mechanics,,,,,,,,...,,,,,,,,,,
1246,B.,Chemical,equilibrium,calculations,,,,,,,...,,,,,,,,,,
1247,C.,Heat,exchanger,design,,,,,,,...,,,,,,,,,,
1248,D.,Structural,analysis,,,,,,,,...,,,,,,,,,,


In [9]:
# Robust ways to read a .txt file and parse blank-line-separated entries into questions
from pathlib import Path
import pandas as pd


def read_text_file_lines(path, encoding='utf-8'):
    """Read a text file and return list of raw lines (preserves order).
    Uses errors='replace' to avoid crashes on unknown bytes.
    """
    p = Path(path)
    with p.open('r', encoding=encoding, errors='replace') as f:
        return [line.rstrip('\n') for line in f]


def split_entries_by_blank_line(lines):
    """Group consecutive non-empty lines into entries separated by blank lines.
    Returns list of entries where each entry is a single string (lines joined with a space).
    """
    entries = []
    buffer = []
    for ln in lines:
        if ln.strip():
            buffer.append(ln.strip())
        else:
            if buffer:
                entries.append(' '.join(buffer))
                buffer = []
    # final buffer
    if buffer:
        entries.append(' '.join(buffer))
    return entries


# Example usage: adjust this path to your file
file_path = '/home/vishnu/IQLearn/Thermodynamics.txt'
lines = read_text_file_lines(file_path)
# quick preview
print(f"Read {len(lines)} lines from {file_path}")

# clean lines (remove fully-empty lines)
clean_lines = [ln for ln in lines]

# Option A: treat each non-empty line as a record
df_lines = pd.DataFrame({'text': [ln for ln in clean_lines if ln.strip()]})
print('\nDataFrame treating each non-empty line as a row:')
print(df_lines.head())

# Option B: treat blank-line-separated blocks as individual questions/entries
entries = split_entries_by_blank_line(lines)
df_entries = pd.DataFrame({'question': entries})
print(f"\nFound {len(entries)} entries (blocks separated by blank lines):")
print(df_entries.head())

# Save to CSV/JSON if desired
out_csv = '/home/vishnu/IQLearn/thermo_questions.csv'
out_json = '/home/vishnu/IQLearn/thermo_questions.json'
df_entries.to_csv(out_csv, index=False)
df_entries.to_json(out_json, orient='records', force_ascii=False)
print(f"Saved entries to:\n  {out_csv}\n  {out_json}")

# Minimal sanity checks
assert len(df_entries) > 0, 'No entries found â€” check input file and separators.'


Read 2092 lines from /home/vishnu/IQLearn/Thermodynamics.txt

DataFrame treating each non-empty line as a row:
                                                text
0  Thermodynamics: Fundamental concepts and defin...
1     ## ðŸ”¸ A. Fundamental Concepts of Thermodynamics
2   1. Thermodynamics is the science that deals with
3                                A. Motion of fluids
4     B. Energy, heat, work, and their interconve...

Found 448 entries (blocks separated by blank lines):
                                            question
0  Thermodynamics: Fundamental concepts and defin...
1     ## ðŸ”¸ A. Fundamental Concepts of Thermodynamics
2  1. Thermodynamics is the science that deals wi...
3                                             Ans: B
4  2. The term system in thermodynamics refers to...
Saved entries to:
  /home/vishnu/IQLearn/thermo_questions.csv
  /home/vishnu/IQLearn/thermo_questions.json


In [17]:
# Convert questions to formatted .txt with continuous numbering
import re
from pathlib import Path


def reformat_questions_with_continuous_numbering(input_text, output_path=None):
    """
    Reformat question text with continuous numbering.
    Preserves topic line and all content, but re-numbers questions sequentially.
    
    Args:
        input_text: Raw question text (can include multiple topics or question blocks)
        output_path: Path to save formatted output (if None, just returns string)
    
    Returns:
        Formatted text with continuous numbering
    """
    lines = input_text.strip().split('\n')
    output_lines = []
    question_num = 1
    i = 0
    
    while i < len(lines):
        line = lines[i]
        
        # Preserve topic or header lines (lines starting with "Topic:" or "Chapter:" etc.)
        if line.strip().startswith('Topic:') or line.strip().startswith('Chapter:'):
            output_lines.append(line)
            i += 1
            continue
        
        # Match question lines like "1. Question text", "2. Question text", etc.
        match = re.match(r'^\s*\d+\.\s+(.+)$', line)
        if match:
            # Replace the number with continuous numbering
            question_text = match.group(1)
            output_lines.append(f"{question_num}. {question_text}")
            question_num += 1
        else:
            # Keep all other lines (options A, B, C, D, Answer, blank lines, etc.)
            output_lines.append(line)
        
        i += 1
    
    formatted_text = '\n'.join(output_lines)
    
    # Save to file if path provided
    if output_path:
        p = Path(output_path)
        p.parent.mkdir(parents=True, exist_ok=True)
        with p.open('w', encoding='utf-8') as f:
            f.write(formatted_text)
        print(f"âœ“ Saved formatted questions to: {output_path}")
    
    return formatted_text


# Read the Thermodynamics.txt file
input_file = '/home/vishnu/IQLearn/ChemicalReactionEngineering.txt'
with open(input_file, 'r', encoding='utf-8', errors='replace') as f:
    thermodynamics_text = f.read()

print(f"Read {len(thermodynamics_text)} characters from {input_file}")
print(f"\n--- Original Content Preview (first 500 chars) ---")
print(thermodynamics_text[:500])

# Reformat with continuous numbering and save
output_file = '/home/vishnu/IQLearn/ChemicalReactionEngineering_new.txt'
formatted = reformat_questions_with_continuous_numbering(thermodynamics_text, output_file)

print(f"\n--- Formatted Output Preview (first 500 chars) ---")
print(formatted[:500])

print(f"\nâœ“ Task complete! Formatted file saved to: {output_file}")


Read 26300 characters from /home/vishnu/IQLearn/ChemicalReactionEngineering.txt

--- Original Content Preview (first 500 chars) ---
Chemical Reaction Engineering: Overview of chemical reaction engineering, classification of chemical reactions, variables affecting the rate of reaction, definition of reaction rate, kinetics of homogeneous reaction, pseudo steady state hypothesis (PSSH), searching for a mechanism, General considerations, hydrogen bromide reaction polymerisation ,steps in free radical polymerisation, evaluation of rate equation by integral and differential analysis for co.nstant volume and variable volume system
âœ“ Saved formatted questions to: /home/vishnu/IQLearn/ChemicalReactionEngineering_new.txt

--- Formatted Output Preview (first 500 chars) ---
Chemical Reaction Engineering: Overview of chemical reaction engineering, classification of chemical reactions, variables affecting the rate of reaction, definition of reaction rate, kinetics of homogeneous reaction, pseudo 

In [None]:
"definitionofinstrumentation_new.txt"
"Thermodynamics_new.txt"
"Chemicalreactionequilibria_new.txt"
"HeterogeneousReactions_new.txt"
"ProcessCalculations_new.txt"
"ChemicalReactionEngineering_new.txt"