# For colab (no web)

In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
import os
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class BookRecommender:
    def __init__(self):
        self.books = []  #list of dicts: {'title': ..., 'author': ..., 'text': ..., 'filepath': ...}
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))
        self.tfidf_matrix = None
        self.similarity_matrix = None

    def load_books_from_folder(self, folder_path):
        if not os.path.exists(folder_path):
            raise FileNotFoundError(f"Folder '{folder_path}' not found.")

        pattern = re.compile(r"^(.*?)\s+by\s+(.*?)(?:\.txt)?$", re.IGNORECASE)
        txt_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.txt')]

        if not txt_files:
            print("No .txt files found in the folder.")
            return

        for filename in txt_files:
            filepath = os.path.join(folder_path, filename)
            match = pattern.match(os.path.splitext(filename)[0])

            if not match:
                print(f"Skipping file (incorrect name format): {filename}")
                continue

            title = match.group(1).strip()
            author = match.group(2).strip()

            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    text = f.read()
                words = text.split()
                text = ' '.join(words[:10000])  #use first 10k words (optional)

                self.books.append({
                    'title': title,
                    'author': author,
                    'text': text,
                    'filepath': filepath
                })
                print(f"Loaded: '{title}' by {author}")
            except Exception as e:
                print(f"Error reading {filename}: {e}")

        if not self.books:
            raise ValueError("No valid books loaded. Check file format and encoding.")

    def build_similarity_matrix(self):
        #fit TF-IDF and compute cosine similarity matrix
        if not self.books:
            raise ValueError("No books loaded. Call load_books_from_folder() first.")

        texts = [book['text'] for book in self.books]
        self.tfidf_matrix = self.vectorizer.fit_transform(texts)
        self.similarity_matrix = cosine_similarity(self.tfidf_matrix)

        print(f"Similarity matrix built for {len(self.books)} books.")

    def recommend_books(self, title_query, top_n=5):
        titles = [book['title'].lower() for book in self.books]
        query_lower = title_query.strip().lower()

        if query_lower not in titles:
            matches = [t for t in titles if query_lower in t]
            if matches:
                print(f"Did you mean: '{[self.books[titles.index(m)]['title'] for m in matches][0]}'?")
            raise ValueError(f"Book titled '{title_query}' not found. Available titles: {list(set(t['title'] for t in self.books))}")

        idx = titles.index(query_lower)
        sim_scores = self.similarity_matrix[idx]

        #sort by similarity descending, exclude self
        similar_indices = np.argsort(sim_scores)[::-1][1:top_n + 1]

        recommendations = []
        for i in similar_indices:
            recommendations.append({
                'title': self.books[i]['title'],
                'author': self.books[i]['author'],
                'similarity': float(sim_scores[i])
            })

        return recommendations

    def list_all_books(self):
        return [(book['title'], book['author']) for book in self.books]

In [33]:
if __name__ == "__main__":
    recommender = BookRecommender()

    BOOKS_FOLDER = "/content/drive/MyDrive/test"

    try:
        recommender.load_books_from_folder(BOOKS_FOLDER)

        print("\nüìö Books loaded:\n")
        for title, author in recommender.list_all_books():
            print(f"  ‚Ä¢ {title} by {author}")
        recommender.build_similarity_matrix()

        title = "The Scarlet Lette" # @param {type:"string"}
        print(f"\nüîé Recommending books similar to '{test_title}':\n")
        recs = recommender.recommend_books(title, top_n=5)

        for i, rec in enumerate(recs, 1):
            print(f"{i}. {rec['title']} by {rec['author']} ‚Äî Similarity: {rec['similarity']:.3f}")

    except Exception as e:
        print(f"Error: {e}")

Loaded: 'Hamlet' by William Shakespeare
Loaded: 'Hamlet' by William Shakespeare
Loaded: 'Moby Dick; Or, The Whale' by Herman Melville
Loaded: 'Moby Dick; Or, The Whale' by Herman Melville
Loaded: 'Romeo and Juliet' by William Shakespeare
Loaded: 'Romeo and Juliet' by William Shakespeare
Loaded: 'The Scarlet Letter' by Nathaniel Hawthorne
Loaded: 'The Scarlet Letter' by Nathaniel Hawthorne
Loaded: 'Alice's Adventures in Wonderland' by Lewis Carroll
Loaded: 'The Strange Case of Dr Jekyll and Mr Hyde' by Robert Louis Stevenson
Loaded: 'Alice's Adventures in Wonderland' by Lewis Carroll
Loaded: 'The Strange Case of Dr Jekyll and Mr Hyde' by Robert Louis StevensonLoaded: 'Frankenstein; Or, The Modern Prometheus' by Mary Wollstonecraft Shelley

Loaded: 'Frankenstein; Or, The Modern Prometheus' by Mary Wollstonecraft ShelleyLoaded: 'Crime and Punishment' by Fyodor Dostoyevsky

Loaded: 'Crime and Punishment' by Fyodor Dostoyevsky
Loaded: 'The Idiot' by Fyodor Dostoyevsky
Loaded: 'The Idiot' by

In [34]:
from google.colab import drive
drive.mount('/content/drive')

import os
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from flask import Flask, request, render_template_string
import threading
import subprocess
import time

class BookRecommender:
    def __init__(self):
        self.books = []
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))
        self.tfidf_matrix = None
        self.similarity_matrix = None

    def load_books_from_folder(self, folder_path):
        if not os.path.exists(folder_path):
            raise FileNotFoundError(f"Folder '{folder_path}' not found.")
        pattern = re.compile(r"^(.*?)\s+by\s+(.*?)(?:\.txt)?$", re.IGNORECASE)
        txt_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.txt')]
        if not txt_files:
            print("No .txt files found in the folder.")
            return
        for filename in txt_files:
            filepath = os.path.join(folder_path, filename)
            match = pattern.match(os.path.splitext(filename)[0])
            if not match:
                print(f"Skipping file (incorrect name format): {filename}")
                continue
            title = match.group(1).strip()
            author = match.group(2).strip()
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    text = f.read()
                words = text.split()
                text = ' '.join(words[:10000])
                self.books.append({
                    'title': title,
                    'author': author,
                    'text': text,
                    'filepath': filepath
                })
                print(f"Loaded: '{title}' by {author}")
            except Exception as e:
                print(f"Error reading {filename}: {e}")
        if not self.books:
            raise ValueError("No valid books loaded. Check file format and encoding.")

    def build_similarity_matrix(self):
        if not self.books:
            raise ValueError("No books loaded. Call load_books_from_folder() first.")
        texts = [book['text'] for book in self.books]
        self.tfidf_matrix = self.vectorizer.fit_transform(texts)
        self.similarity_matrix = cosine_similarity(self.tfidf_matrix)
        print(f"Similarity matrix built for {len(self.books)} books.")

    def recommend_books(self, title_query, top_n=5):
        titles = [book['title'].lower() for book in self.books]
        query_lower = title_query.strip().lower()
        if query_lower not in titles:
            matches = [t for t in titles if query_lower in t]
            if matches:
                idx = titles.index(matches[0])
                suggestion = f"{self.books[idx]['title']} by {self.books[idx]['author']}"
                raise ValueError(f"Did you mean: {suggestion}?")
            raise ValueError("Book not found.")
        idx = titles.index(query_lower)
        sim_scores = self.similarity_matrix[idx]
        similar_indices = np.argsort(sim_scores)[::-1][1:top_n + 1]
        recommendations = []
        for i in similar_indices:
            recommendations.append({
                'title': self.books[i]['title'],
                'author': self.books[i]['author'],
                'similarity': float(sim_scores[i])
            })
        return recommendations

    def list_all_books(self):
        return [(book['title'], book['author']) for book in self.books]

recommender = BookRecommender()
BOOKS_FOLDER = "/content/drive/MyDrive/test"

try:
    recommender.load_books_from_folder(BOOKS_FOLDER)
    recommender.build_similarity_matrix()
except Exception as e:
    print(f"Error loading books: {e}")

num_books = len(recommender.books)

app = Flask(__name__)

HTML_TEMPLATE = '''
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <title>TF-IDF Book Recommender</title>
  <link href="https://fonts.googleapis.com/css2?family=Montserrat:wght@400;700&display=swap" rel="stylesheet">
  <style>
    body {
      font-family: 'Montserrat', sans-serif;
      background: white;
      color: black;
      margin: 0;
      padding: 0;
      display: flex;
      justify-content: center;
      align-items: center;
      min-height: 100vh;
      flex-direction: column;
    }
    .container {
      text-align: center;
      width: 600px;
      max-width: 90vw;
    }
    h1 {
      font-size: 36px;
      margin-bottom: 20px;
      font-weight: 700;
    }
    .search-form {
      margin: 20px 0;
    }
    input[type="text"] {
      padding: 15px;
      width: 100%;
      max-width: 500px;
      font-size: 16px;
      border: 1px solid #ddd;
      border-radius: 24px;
      outline: none;
      box-shadow: 0 2px 6px rgba(0,0,0,0.1);
      transition: box-shadow 0.3s;
    }
    input[type="text"]:focus {
      box-shadow: 0 4px 10px rgba(0,0,0,0.15);
    }
    button {
      display: none; /* Hide submit button visually but keep form functional */
    }
    .info {
      font-size: 16px;
      margin-top: 10px;
      color: #444;
    }
    .error {
      color: #d32f2f;
      margin-top: 10px;
      font-style: italic;
    }
    .results {
      margin-top: 30px;
      width: 100%;
      border-collapse: collapse;
    }
    .results td, .results th {
      padding: 12px 10px;
      text-align: left;
      border: none;
    }
    .results tr:nth-child(even) {
      background-color: transparent;
    }
    .results th {
      font-weight: 700;
      color: #000;
    }
  </style>
</head>
<body>
  <div class="container">
    <h1>TF-IDF Book Recommender</h1>

    <form method="post" class="search-form">
      <input type="text" name="title" placeholder="Search for a book..." value="{{ request.form.title if request.method == 'POST' }}" autocomplete="off">
    </form>

    {% if results %}
      <p class="info">Number of books in database: {{ num_books }}</p>
      <table class="results">
        <thead>
          <tr>
            <th>#</th>
            <th>Title</th>
            <th>Author</th>
            <th>Similarity</th>
          </tr>
        </thead>
        <tbody>
          {% for rec in results %}
            <tr>
              <td>{{ loop.index }}</td>
              <td>{{ rec.title }}</td>
              <td>{{ rec.author }}</td>
              <td>{{ "%.3f" % rec.similarity }}</td>
            </tr>
          {% endfor %}
        </tbody>
      </table>
    {% elif error %}
      <p class="error">{{ error }}</p>
    {% else %}
      <p class="info">Number of books in database: {{ num_books }}</p>
    {% endif %}
  </div>
</body>
</html>
'''

@app.route("/", methods=["GET", "POST"])
def index():
    results = None
    error = None
    if request.method == "POST":
        title_input = request.form["title"].strip()
        if title_input:
            try:
                results = recommender.recommend_books(title_input, top_n=5)
            except Exception as e:
                error = str(e)
        else:
            error = "Please enter a book title."
    return render_template_string(HTML_TEMPLATE,
                                num_books=num_books,
                                results=results,
                                error=error,
                                request=request)

def run_app():
    app.run(host="0.0.0.0", port=5000)

threading.Thread(target=run_app, daemon=True).start()

print("Installing localtunnel...")
os.system("npm install -g localtunnel")

print("Starting localtunnel...")
tunnel_proc = subprocess.Popen(["lt", "--port", "5000"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
time.sleep(3)

public_url = None
try:
    public_url = tunnel_proc.stdout.readline().strip()
    if not public_url:
        public_url = tunnel_proc.stderr.readline().strip()
except:
    pass

if public_url and "https://" in public_url:
    print(f"\nüåü Public URL: {public_url}")
else:
    lines = []
    for _ in range(10):
        line = tunnel_proc.stdout.readline().strip()
        if line:
            lines.append(line)
    for line in lines:
        if "https://" in line:
            print(f"\nüåê Public URL: {line}")
            print(f"\nüåê Password:")
            break
    else:
        print("\n‚ùå Could not retrieve public URL.")

Loaded: 'The Idiot' by Fyodor Dostoyevsky
Loaded: 'War and Peace' by Leo Tolstoy
Loaded: 'Anna Karenina' by Leo Tolstoy
Loaded: 'Treasure Island' by Robert Louis Stevenson
Loaded: 'Dead Souls' by Nikolai Gogol
Loaded: 'Oblomov' by Ivan Goncharov
Loaded: 'Hamlet' by William Shakespeare
Loaded: 'The Seven Who Were Hanged' by Leonid Andreyev
Loaded: 'Moby Dick; Or, The Whale' by Herman Melville
Loaded: 'Romeo and Juliet' by William Shakespeare
Loaded: 'The Scarlet Letter' by Nathaniel Hawthorne
Loaded: 'Alice's Adventures in Wonderland' by Lewis Carroll
Loaded: 'The Strange Case of Dr Jekyll and Mr Hyde' by Robert Louis Stevenson
Loaded: 'Frankenstein; Or, The Modern Prometheus' by Mary Wollstonecraft Shelley
Loaded: 'Crime and Punishment' by Fyodor Dostoyevsky
Loaded: 'The Idiot' by Fyodor Dostoyevsky
Loaded: 'War and Peace' by Leo Tolstoy
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded: 'Anna Karenin

Address already in use
Port 5000 is in use by another program. Either identify and stop that program, or start the server with a different port.


Loaded: 'War and Peace' by Leo Tolstoy
Loaded: 'Hamlet' by William Shakespeare
Loaded: 'Moby Dick; Or, The Whale' by Herman MelvilleLoaded: 'Anna Karenina' by Leo Tolstoy

Loaded: 'Romeo and Juliet' by William Shakespeare
Loaded: 'Treasure Island' by Robert Louis Stevenson
Loaded: 'The Scarlet Letter' by Nathaniel Hawthorne
Loaded: 'Dead Souls' by Nikolai Gogol
Loaded: 'Alice's Adventures in Wonderland' by Lewis Carroll
Loaded: 'Oblomov' by Ivan Goncharov
Loaded: 'The Strange Case of Dr Jekyll and Mr Hyde' by Robert Louis Stevenson
Loaded: 'The Seven Who Were Hanged' by Leonid Andreyev
Loaded: 'Frankenstein; Or, The Modern Prometheus' by Mary Wollstonecraft Shelley
Loaded: 'Crime and Punishment' by Fyodor Dostoyevsky
Loaded: 'Hamlet' by William Shakespeare
Loaded: 'The Idiot' by Fyodor Dostoyevsky
Loaded: 'Moby Dick; Or, The Whale' by Herman Melville
Loaded: 'Romeo and Juliet' by William Shakespeare
Loaded: 'War and Peace' by Leo Tolstoy
Loaded: 'The Scarlet Letter' by Nathaniel Hawtho

In [35]:
!curl https://loca.lt/mytunnelpassword

Loaded: 'Hamlet' by William Shakespeare
Loaded: 'Moby Dick; Or, The Whale' by Herman Melville
Loaded: 'Romeo and Juliet' by William Shakespeare
Loaded: 'The Scarlet Letter' by Nathaniel Hawthorne
Loaded: 'Alice's Adventures in Wonderland' by Lewis Carroll
Loaded: 'The Strange Case of Dr Jekyll and Mr Hyde' by Robert Louis Stevenson
Loaded: 'Frankenstein; Or, The Modern Prometheus' by Mary Wollstonecraft Shelley
Loaded: 'Crime and Punishment' by Fyodor Dostoyevsky
34.74.120.135

# Dynamic + QR

In [38]:
with open('/content/drive/MyDrive/tf-idf/stop-words-russian.txt', 'r') as file:
    russian_stop_words = [line.strip() for line in file]

print(russian_stop_words)

['\ufeff–∞', '–µ', '–∏', '–∂', '–º', '–æ', '–Ω–∞', '–Ω–µ', '–Ω–∏', '–æ–±', '–Ω–æ', '–æ–Ω', '–º–Ω–µ', '–º–æ–∏', '–º–æ–∂', '–æ–Ω–∞', '–æ–Ω–∏', '–æ–Ω–æ', '–º–Ω–æ–π', '–º–Ω–æ–≥–æ', '–º–Ω–æ–≥–æ—á–∏—Å–ª–µ–Ω–Ω–æ–µ', '–º–Ω–æ–≥–æ—á–∏—Å–ª–µ–Ω–Ω–∞—è', '–º–Ω–æ–≥–æ—á–∏—Å–ª–µ–Ω–Ω—ã–µ', '–º–Ω–æ–≥–æ—á–∏—Å–ª–µ–Ω–Ω—ã–π', '–º–Ω–æ—é', '–º–æ–π', '–º–æ–≥', '–º–æ–≥—É—Ç', '–º–æ–∂–Ω–æ', '–º–æ–∂–µ—Ç', '–º–æ–∂—Ö–æ', '–º–æ—Ä', '–º–æ—è', '–º–æ—ë', '–º–æ—á—å', '–Ω–∞–¥', '–Ω–µ–µ', '–æ–±–∞', '–Ω–∞–º', '–Ω–µ–º', '–Ω–∞–º–∏', '–Ω–∏–º–∏', '–º–∏–º–æ', '–Ω–µ–º–Ω–æ–≥–æ', '–æ–¥–Ω–æ–π', '–æ–¥–Ω–æ–≥–æ', '–º–µ–Ω–µ–µ', '–æ–¥–Ω–∞–∂–¥—ã', '–æ–¥–Ω–∞–∫–æ', '–º–µ–Ω—è', '–Ω–µ–º—É', '–º–µ–Ω—å—à–µ', '–Ω–µ–π', '–Ω–∞–≤–µ—Ä—Ö—É', '–Ω–µ–≥–æ', '–Ω–∏–∂–µ', '–º–∞–ª–æ', '–Ω–∞–¥–æ', '–æ–¥–∏–Ω', '–æ–¥–∏–Ω–Ω–∞–¥—Ü–∞—Ç—å', '–æ–¥–∏–Ω–Ω–∞–¥—Ü–∞—Ç—ã–π', '–Ω–∞–∑–∞–¥', '–Ω–∞–∏–±–æ–ª–µ–µ', '–Ω–µ–¥–∞–≤–Ω–æ', '–º–∏–ª–ª–∏–æ–Ω–æ–≤', '–Ω–µ–¥–∞–ª–µ–∫–æ', '–º–µ–∂–¥—É', '–Ω–∏–∑–∫–æ', '–º–µ–ª—è', '–Ω–µ–ª—å–∑—è', '–Ω–∏–±—É–¥—å', '–Ω–µ–ø—Ä–µ—Ä—ã–≤–Ω–æ', '–Ω–∞–∫–æ–Ω

In [39]:
from google.colab import drive
drive.mount('/content/drive')

import os
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from flask import Flask, request, render_template_string
import threading
import subprocess
import time
import socket
from urllib.parse import quote

class BookRecommender:
    def __init__(self):
        self.books = []
        self.loaded_filenames = set()
        self.vectorizer = TfidfVectorizer(stop_words=russian_stop_words, max_features=10000, ngram_range=(1, 2))
        self.tfidf_matrix = None
        self.similarity_matrix = None
        self.lock = threading.Lock()

    def load_books_from_folder(self, folder_path):
        if not os.path.exists(folder_path):
            raise FileNotFoundError(f"Folder '{folder_path}' not found.")

        pattern = re.compile(r"^(.*?)\s+by\s+(.*?)(?:\.txt)?$", re.IGNORECASE)
        current_txt_files = {f for f in os.listdir(folder_path) if f.lower().endswith('.txt')}
        previous_filenames = set(self.loaded_filenames)

        # Detect removed files
        removed_files = previous_filenames - current_txt_files
        new_books_found = False
        books_removed = False

        # Remove books from collection if their file was deleted
        if removed_files:
            with self.lock:
                # Filter out books that came from removed files
                self.books = [book for book in self.books if os.path.basename(book['filepath']) not in removed_files]
                # Update loaded filenames
                self.loaded_filenames = {f for f in self.loaded_filenames if f not in removed_files}
                print(f"üóëÔ∏è Removed {len(removed_files)} books (files deleted): {removed_files}")
                books_removed = True

        # Now process new or updated files
        for filename in current_txt_files:
            if filename in self.loaded_filenames:
                continue  # Already loaded

            filepath = os.path.join(folder_path, filename)
            match = pattern.match(os.path.splitext(filename)[0])
            if not match:
                print(f"Skipping file (incorrect name format): {filename}")
                self.loaded_filenames.add(filename)
                continue

            title = match.group(1).strip()
            author = match.group(2).strip()
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    text = f.read()
                words = text.split()
                text = ' '.join(words[:10000])  # Limit size
                book_data = {
                    'title': title,
                    'author': author,
                    'text': text,
                    'filepath': filepath
                }
                with self.lock:
                    self.books.append(book_data)
                    self.loaded_filenames.add(filename)
                print(f"‚úÖ Loaded new book: '{title}' by {author}")
                new_books_found = True
            except Exception as e:
                print(f"Error reading {filename}: {e}")
                self.loaded_filenames.add(filename)

        # Rebuild similarity matrix only if something changed
        if new_books_found or books_removed:
            print("üîÑ Rebuilding similarity matrix due to added/removed books...")
            self.build_similarity_matrix()

    def build_similarity_matrix(self):
        with self.lock:
            if not self.books:
                return
            texts = [book['text'] for book in self.books]
            self.tfidf_matrix = self.vectorizer.fit_transform(texts)
            self.similarity_matrix = cosine_similarity(self.tfidf_matrix)
            print(f"üìä Similarity matrix updated for {len(self.books)} books.")

    def recommend_books(self, title_query, top_n=5):
        with self.lock:
            titles = [book['title'].lower() for book in self.books]
            query_lower = title_query.strip().lower()
            if query_lower not in titles:
                matches = [t for t in titles if query_lower in t]
                if matches:
                    idx = titles.index(matches[0])
                    suggestion = f"{self.books[idx]['title']} by {self.books[idx]['author']}"
                    raise ValueError(f"Did you mean: {suggestion}?")
                raise ValueError("Book not found.")
            idx = titles.index(query_lower)
            sim_scores = self.similarity_matrix[idx]
            similar_indices = np.argsort(sim_scores)[::-1][1:top_n + 1]
            recommendations = []
            for i in similar_indices:
                recommendations.append({
                    'title': self.books[i]['title'],
                    'author': self.books[i]['author'],
                    'similarity': float(sim_scores[i])
                })
            return recommendations

    def list_all_books(self):
        return [(book['title'], book['author']) for book in self.books]

recommender = BookRecommender()
BOOKS_FOLDER = "/content/drive/MyDrive/tf-idf/txt"

try:
    print("üîç Loading initial books...")
    recommender.load_books_from_folder(BOOKS_FOLDER)
except Exception as e:
    print(f"Error loading initial books: {e}")

num_books = len(recommender.books)

def poll_for_new_books():
    while True:
        time.sleep(5)
        try:
            recommender.load_books_from_folder(BOOKS_FOLDER)
        except Exception as e:
            print(f"Error during polling: {e}")

threading.Thread(target=poll_for_new_books, daemon=True).start()


app = Flask(__name__)

HTML_TEMPLATE = '''
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <title>TF-IDF Book Recommender</title>
  <link href="https://fonts.googleapis.com/css2?family=Montserrat:wght@400;700&display=swap" rel="stylesheet">
  <style>
    body {
      font-family: 'Montserrat', sans-serif;
      background: white;
      color: black;
      margin: 0;
      padding: 0;
      display: flex;
      justify-content: center;
      align-items: center;
      min-height: 100vh;
      flex-direction: column;
    }
    .container {
      text-align: center;
      width: 600px;
      max-width: 90vw;
      padding: 20px;
    }
    h1 {
      font-size: 36px;
      margin-bottom: 20px;
      font-weight: 700;
    }
    .search-form {
      margin: 20px 0;
    }
    input[type="text"] {
      padding: 15px;
      width: 100%;
      max-width: 500px;
      font-size: 16px;
      border: 1px solid #ddd;
      border-radius: 24px;
      outline: none;
      box-shadow: 0 2px 6px rgba(0,0,0,0.1);
      transition: box-shadow 0.3s;
    }
    input[type="text"]:focus {
      box-shadow: 0 4px 10px rgba(0,0,0,0.15);
    }
    button {
      display: none;
    }
    .info {
      font-size: 16px;
      margin-top: 10px;
      color: #444;
    }
    .error {
      color: #d32f2f;
      margin-top: 10px;
      font-style: italic;
    }
    .results {
      margin-top: 30px;
      width: 100%;
      border-collapse: collapse;
    }
    .results td, .results th {
      padding: 12px 10px;
      text-align: left;
      border: none;
    }
    .results tr:nth-child(even) {
      background-color: rgba(0, 0, 0, 0.03);
    }
    .results th {
      font-weight: 700;
      color: #000;
      background-color: transparent;
    }
    .results tbody tr:hover {
      background-color: rgba(0, 0, 0, 0.05);
    }
  </style>
</head>
<body>

  <!-- Main Content -->
  <div class="container">
    <h1>TF-IDF Book Recommender</h1>
    <form method="post" class="search-form">
      <input
        type="text"
        name="title"
        placeholder="Search for a book..."
        value="{{ request.form.title if request.method == 'POST' }}"
        autocomplete="off"
        aria-label="Search for a book"
      >
    </form>
    <p class="info">Number of books in database: <strong><span id="bookCount">{{ num_books }}</span></strong></p>

    {% if results %}
      <table class="results">
        <thead>
          <tr>
            <th>#</th>
            <th>Title</th>
            <th>Author</th>
            <th>Similarity</th>
          </tr>
        </thead>
        <tbody>
          {% for rec in results %}
            <tr>
              <td>{{ loop.index }}</td>
              <td>{{ rec.title }}</td>
              <td>{{ rec.author }}</td>
              <td>{{ "%.3f" % rec.similarity }}</td>
            </tr>
          {% endfor %}
        </tbody>
      </table>
    {% elif error %}
      <p class="error">{{ error }}</p>
    {% else %}
      <p class="info">Enter a book title to get recommendations.</p>
    {% endif %}
  </div>

  <!-- QR Code Overlay (Bottom Right Corner) -->
  <div id="qr-overlay" style="
    position: fixed;
    bottom: 20px;
    right: 20px;
    background: white;
    border: 1px solid #ddd;
    border-radius: 12px;
    padding: 12px;
    box-shadow: 0 4px 16px rgba(0,0,0,0.15);
    width: 180px;
    text-align: center;
    font-family: 'Montserrat', sans-serif;
    z-index: 1000;
    font-size: 12px;
  ">
    <div id="qrcode" style="display: inline-block;"></div>
    <p style="margin: 8px 0 0; font-size: 18px; color: #444;">
      password: {{ password_ip }}
    </p>
  </div>

  <!-- Load QR Code Generator (Client-Side) -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/qrcodejs/1.0.0/qrcode.min.js"></script>
  <script>
    // Generate QR code for public URL
    new QRCode(document.getElementById("qrcode"), {
      text: "{{ public_url }}",
      width: 160,
      height: 160,
      colorDark: "#000",
      colorLight: "#fff"
    });
  </script>

  <!-- Auto-update Book Count -->
  <script>
    function updateBookCount() {
      fetch('/api/book_count')
        .then(response => {
          if (!response.ok) throw new Error("Network error");
          return response.json();
        })
        .then(data => {
          const countSpan = document.getElementById('bookCount');
          countSpan.textContent = data.count;
        })
        .catch(err => {
          // Silently fail
        });
    }

    updateBookCount();
    setInterval(updateBookCount, 2000);
  </script>

</body>
</html>
'''

@app.route("/", methods=["GET", "POST"])
def index():
    with recommender.lock:
        current_num_books = len(recommender.books)
    results = None
    error = None
    if request.method == "POST":
        title_input = request.form["title"].strip()
        if title_input:
            try:
                results = recommender.recommend_books(title_input, top_n=5)
            except Exception as e:
                error = str(e)
        else:
            error = "Please enter a book title."

    return render_template_string(
        HTML_TEMPLATE,
        num_books=current_num_books,
        results=results,
        error=error,
        request=request,
        public_url=public_url,
        password_ip=password_ip
    )

@app.route("/api/book_count")
def book_count():
    with recommender.lock:
        count = len(recommender.books)
    return {"count": count}

def run_app():
    app.run(host="0.0.0.0", port=5000)

threading.Thread(target=run_app, daemon=True).start()


print("üì¶ Installing localtunnel...")
os.system("npm install -g localtunnel")

print("üîÅ Starting localtunnel...")
tunnel_proc = subprocess.Popen(["lt", "--port", "5000"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
time.sleep(3)

public_url = None
try:
    output_line = tunnel_proc.stdout.readline().strip()
    if not output_line or "https://" not in output_line:
        output_line = tunnel_proc.stderr.readline().strip()
    if "https://" in output_line:
        public_url = output_line
except:
    pass

if not public_url:
    for _ in range(10):
        line = tunnel_proc.stdout.readline().strip()
        if "https://" in line:
            public_url = line
            break

if public_url:
    public_url = public_url.split(" ")[-1].strip()  # Extract clean URL
    print(f"\nüåü Public URL: {public_url}")
else:
    print("\n‚ùå Could not retrieve public URL.")
    public_url = "https://example.com"

import requests

def get_external_ip():
    try:
        return requests.get("https://api.ipify.org", timeout=5).text.strip()
    except:
        return "unable-to-get-ip"

password_ip = get_external_ip()
print(f"üîê Access Password (IP): {password_ip}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üîç Loading initial books...
‚úÖ Loaded new book: '–°–∞—Ö–∞—Ä–Ω—ã–∏ÃÜ –ö—Ä–µ–º–ª—å' by –í–ª–∞–¥–∏–º–∏—Ä –°–æ—Ä–æ–∫–∏–Ω
‚úÖ Loaded new book: '–£–±–∏–∏ÃÜ—Å—Ç–≤–æ –Ω–∞ —É–ª–∏—Ü–µ –ú–æ—Ä–≥' by –≠–¥–≥–∞—Ä –ê–ª–ª–∞–Ω –ü–æ
‚úÖ Loaded new book: '–ù–æ—Ä–º–∞' by –í–ª–∞–¥–∏–º–∏—Ä –°–æ—Ä–æ–∫–∏–Ω
‚úÖ Loaded new book: '–ü—Ä–µ–≤—Ä–∞—â–µ–Ω–∏–µ' by –§—Ä–∞–Ω—Ü –ö–∞—Ñ–∫–∞
‚úÖ Loaded new book: '–ë–µ–¥–Ω–∞—è –õ–∏–∑–∞' by –ù–∏–∫–æ–ª–∞–∏ÃÜ –ö–∞—Ä–∞–º–∑–∏–Ω
‚úÖ Loaded new book: '–ú–æ—Å–∫–≤–∞-–ü–µ—Ç—É—à–∫–∏' by –í–µ–Ω–µ–¥–∏–∫—Ç –ï—Ä–æ—Ñ–µ–µ–≤
‚úÖ Loaded new book: '–ï–≤–≥–µ–Ω–∏–∏ÃÜ –û–Ω–µ–≥–∏–Ω' by –ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ü—É—à–∫–∏–Ω
‚úÖ Loaded new book: '–û–±–ª–æ–º–æ–≤' by –ò–≤–∞–Ω –ì–æ–Ω—á–∞—Ä–æ–≤
‚úÖ Loaded new book: '–¢–∞—Ä–∞—Å –ë—É–ª—å–±–∞' by –ù–∏–∫–æ–ª–∞–∏ÃÜ –ì–æ–≥–æ–ª—å
‚úÖ Loaded new book: '–§–∞—É—Å—Ç' by –ò–æ–≥–∞–Ω–Ω –í–æ–ª—å—Ñ–≥–∞–Ω–≥ –ì–µÃà—Ç–µ
‚úÖ Loaded new book: '–¢–µ–Ω—å 

Address already in use
Port 5000 is in use by another program. Either identify and stop that program, or start the server with a different port.


üîÅ Starting localtunnel...

üåü Public URL: https://chilly-crabs-trade.loca.lt
üîê Access Password (IP): 34.74.120.135


# Book autodownload + cleanup

In [None]:
from google.colab import drive
import os
import requests
import zipfile
import shutil
import re
import time
import chardet

def read_with_encoding(filepath):
    with open(filepath, 'rb') as f:
        raw = f.read()
        result = chardet.detect(raw)
        encoding = result['encoding']
    with open(filepath, 'r', encoding=encoding, errors='ignore') as f:
        return f.read()

drive.mount('/content/drive')

base_dir = '/content/drive/MyDrive/tf-idf'
zip_dir = os.path.join(base_dir, 'zip')
txt_dir = os.path.join(base_dir, 'txt')

os.makedirs(zip_dir, exist_ok=True)
os.makedirs(txt_dir, exist_ok=True)

books_by_author = {
    "–ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ü—É—à–∫–∏–Ω": ["–ö–∞–ø–∏—Ç–∞–Ω—Å–∫–∞—è –¥–æ—á–∫–∞", "–î—É–±—Ä–æ–≤—Å–∫–∏–π", "–ï–≤–≥–µ–Ω–∏–π –û–Ω–µ–≥–∏–Ω"],
    "–ù–∏–∫–æ–ª–∞–π –ì–æ–≥–æ–ª—å": ["–ú—ë—Ä—Ç–≤—ã–µ –¥—É—à–∏", "–í–∏–π", "–¢–∞—Ä–∞—Å –ë—É–ª—å–±–∞"],
    "–ú–∏—Ö–∞–∏–ª –õ–µ—Ä–º–æ–Ω—Ç–æ–≤": ["–ì–µ—Ä–æ–π –Ω–∞—à–µ–≥–æ –≤—Ä–µ–º–µ–Ω–∏", "–ú—Ü—ã—Ä–∏"],
    "–ò–≤–∞–Ω –¢—É—Ä–≥–µ–Ω–µ–≤": ["–û—Ç—Ü—ã –∏ –¥–µ—Ç–∏", "–ú—É–º—É"],
    "–§—ë–¥–æ—Ä –î–æ—Å—Ç–æ–µ–≤—Å–∫–∏–π": ["–ü—Ä–µ—Å—Ç—É–ø–ª–µ–Ω–∏–µ –∏ –Ω–∞–∫–∞–∑–∞–Ω–∏–µ", "–ò–¥–∏–æ—Ç", "–ë—Ä–∞—Ç—å—è –ö–∞—Ä–∞–º–∞–∑–æ–≤—ã", "–ë–µ—Å—ã"],
    "–õ–µ–≤ –¢–æ–ª—Å—Ç–æ–π": ["–í–æ–π–Ω–∞ –∏ –º–∏—Ä", "–ê–Ω–Ω–∞ –ö–∞—Ä–µ–Ω–∏–Ω–∞"],
    "–ò–≤–∞–Ω –ì–æ–Ω—á–∞—Ä–æ–≤": ["–û–±–ª–æ–º–æ–≤"],
    "–ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ì—Ä–∏–±–æ–µ–¥–æ–≤": ["–ì–æ—Ä–µ –æ—Ç —É–º–∞"],
    "–ù–∏–∫–æ–ª–∞–π –ö–∞—Ä–∞–º–∑–∏–Ω": ["–ë–µ–¥–Ω–∞—è –õ–∏–∑–∞"],
    "–õ–µ–æ–Ω–∏–¥ –ê–Ω–¥—Ä–µ–µ–≤": ["–†–∞—Å—Å–∫–∞–∑ –æ —Å–µ–º–∏ –ø–æ–≤–µ—à–µ–Ω–Ω—ã—Ö"],
    "–ò–≤–∞–Ω –ë—É–Ω–∏–Ω": ["–ì–æ—Å–ø–æ–¥–∏–Ω –∏–∑ –°–∞–Ω-–§—Ä–∞–Ω—Ü–∏—Å–∫–æ"],
    "–ï–≤–≥–µ–Ω–∏–π –ó–∞–º—è—Ç–∏–Ω": ["–ú—ã"],
    "–£–∏–ª—å—è–º –®–µ–∫—Å–ø–∏—Ä": ["–ì–∞–º–ª–µ—Ç", "–†–æ–º–µ–æ –∏ –î–∂—É–ª—å–µ—Ç—Ç–∞"],
    "–î–∞–Ω—Ç–µ –ê–ª–∏–≥—å–µ—Ä–∏": ["–ë–æ–∂–µ—Å—Ç–≤–µ–Ω–Ω–∞—è –∫–æ–º–µ–¥–∏—è"],
    "–ò–æ–≥–∞–Ω–Ω –í–æ–ª—å—Ñ–≥–∞–Ω–≥ –ì—ë—Ç–µ": ["–§–∞—É—Å—Ç", "–°—Ç—Ä–∞–¥–∞–Ω–∏—è —é–Ω–æ–≥–æ –í–µ—Ä—Ç–µ—Ä–∞"],
    "–ö–∞—Ä–ª –ú–∞—Ä–∫—Å": ["–ö–∞–ø–∏—Ç–∞–ª"],
    "–§—Ä–∏–¥—Ä–∏—Ö –ù–∏—Ü—à–µ": ["–¢–∞–∫ –≥–æ–≤–æ—Ä–∏–ª –ó–∞—Ä–∞—Ç—É—Å—Ç—Ä–∞"],
    "–î–∂–µ–π–Ω –û—Å—Ç–∏–Ω": ["–ì–æ—Ä–¥–æ—Å—Ç—å –∏ –ø—Ä–µ–¥—É–±–µ–∂–¥–µ–Ω–∏–µ"],
    "–ú–∞—Ä–∫ –¢–≤–µ–Ω": ["–ü—Ä–∏–∫–ª—é—á–µ–Ω–∏—è –ì–µ–∫–ª—å–±–µ—Ä—Ä–∏ –§–∏–Ω–Ω–∞", "–ü—Ä–∏–∫–ª—é—á–µ–Ω–∏—è –¢–æ–º–∞ –°–æ–π–µ—Ä–∞"],
    "–û—Å–∫–∞—Ä –£–∞–π–ª—å–¥": ["–ü–æ—Ä—Ç—Ä–µ—Ç –î–æ—Ä–∏–∞–Ω–∞ –ì—Ä–µ—è"],
    "–ê—Ä—Ç—É—Ä –ö–æ–Ω–∞–Ω –î–æ–π–ª": ["–ü—Ä–∏–∫–ª—é—á–µ–Ω–∏—è –®–µ—Ä–ª–æ–∫–∞ –•–æ–ª–º—Å–∞"],
    "–≠–¥–≥–∞—Ä –ê–ª–ª–∞–Ω –ü–æ": ["–£–±–∏–π—Å—Ç–≤–æ –Ω–∞ —É–ª–∏—Ü–µ –ú–æ—Ä–≥"],
    "–ì–æ–≤–∞—Ä–¥ –õ–∞–≤–∫—Ä–∞—Ñ—Ç": ["–ó–æ–≤ –ö—Ç—É–ª—Ö—É", "–¢–µ–Ω—å –Ω–∞–¥ –ò–Ω–Ω—Å–º—É—Ç–æ–º", "–£–∂–∞—Å –≤ –î–∞–Ω–≤–∏—á–µ"],
    "–î–∂–µ–∫ –õ–æ–Ω–¥–æ–Ω": ["–ú–∞—Ä—Ç–∏–Ω –ò–¥–µ–Ω"],
    "–§—Ä–∞–Ω—Ü –ö–∞—Ñ–∫–∞": ["–ü—Ä–æ—Ü–µ—Å—Å", "–ü—Ä–µ–≤—Ä–∞—â–µ–Ω–∏–µ"],
    "–î–∂–æ—Ä–¥–∂ –û—Ä—É—ç–ª–ª": ["1984", "–°–∫–æ—Ç–Ω—ã–π –¥–≤–æ—Ä"],
    "–ú–∏—Ö–∞–∏–ª –ë—É–ª–≥–∞–∫–æ–≤": ["–ú–∞—Å—Ç–µ—Ä –∏ –ú–∞—Ä–≥–∞—Ä–∏—Ç–∞", "–°–æ–±–∞—á—å–µ —Å–µ—Ä–¥—Ü–µ"]
}

def transliterate_author(full_name):
    mapping = {
        '–∞': 'a', '–±': 'b', '–≤': 'v', '–≥': 'g', '–¥': 'd', '–µ': 'e', '—ë': 'yo',
        '–∂': 'zh', '–∑': 'z', '–∏': 'i', '–π': 'y', '–∫': 'k', '–ª': 'l', '–º': 'm',
        '–Ω': 'n', '–æ': 'o', '–ø': 'p', '—Ä': 'r', '—Å': 's', '—Ç': 't', '—É': 'u',
        '—Ñ': 'f', '—Ö': 'kh', '—Ü': 'ts', '—á': 'ch', '—à': 'sh', '—â': 'shch',
        '—ä': '', '—ã': 'y', '—å': '', '—ç': 'e', '—é': 'yu', '—è': 'ya', ' ': '_'
    }
    parts = full_name.strip().split()
    if len(parts) < 2:
        raise ValueError(f"–ò–º—è –¥–æ–ª–∂–Ω–æ —Å–æ–¥–µ—Ä–∂–∞—Ç—å –∫–∞–∫ –º–∏–Ω–∏–º—É–º –∏–º—è –∏ —Ñ–∞–º–∏–ª–∏—é: {full_name}")

    last_name = parts[-1].lower()
    first_name = parts[0].lower()

    last_trans = ''.join(mapping.get(c, '') for c in last_name)
    first_trans = ''.join(mapping.get(c, '') for c in first_name)

    return f"{last_trans}_{first_trans}"

def transliterate_title(title):
    mapping = {
        '–∞': 'a', '–±': 'b', '–≤': 'v', '–≥': 'g', '–¥': 'd', '–µ': 'e', '—ë': 'yo',
        '–∂': 'zh', '–∑': 'z', '–∏': 'i', '–π': 'y', '–∫': 'k', '–ª': 'l', '–º': 'm',
        '–Ω': 'n', '–æ': 'o', '–ø': 'p', '—Ä': 'r', '—Å': 's', '—Ç': 't', '—É': 'u',
        '—Ñ': 'f', '—Ö': 'kh', '—Ü': 'ts', '—á': 'ch', '—à': 'sh', '—â': 'shch',
        '—ä': '', '—ã': 'y', '—å': '', '—ç': 'e', '—é': 'yu', '—è': 'ya'
    }

    title = title.lower().strip()
    title = re.sub(r'[^\w\s-]', '', title)
    title = re.sub(r'[-\s]+', '_', title)

    result = ''.join(mapping.get(char, char) for char in title)
    return re.sub(r'_+', '_', result).strip('_')

def clean_text(text):
    start_phrase = "–°–ø–∞—Å–∏–±–æ, —á—Ç–æ —Å–∫–∞—á–∞–ª–∏ –∫–Ω–∏–≥—É –≤ –±–µ—Å–ø–ª–∞—Ç–Ω–æ–π —ç–ª–µ–∫—Ç—Ä–æ–Ω–Ω–æ–π –±–∏–±–ª–∏–æ—Ç–µ–∫–µ Royallib.ru"
    end_phrase = "–ü—Ä–∏—è—Ç–Ω–æ–≥–æ —á—Ç–µ–Ω–∏—è!"

    start_idx = text.find(start_phrase)
    if start_idx != -1:
        end_idx = text.find(end_phrase, start_idx)
        if end_idx != -1:
            text = text[:start_idx] + text[end_idx + len(end_phrase):]

    return text.strip().lower()

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
})

for author, titles in books_by_author.items():
    try:
        author_folder = transliterate_author(author)
    except Exception as e:
        print(f"‚ùå –ù–µ —É–¥–∞–ª–æ—Å—å –æ–±—Ä–∞–±–æ—Ç–∞—Ç—å –∏–º—è –∞–≤—Ç–æ—Ä–∞ '{author}': {e}")
        continue

    for title in titles:
        title_clean = transliterate_title(title)

        book_page_url = f"https://royallib.com/book/{author_folder}/{title_clean}.html"
        download_url = f"https://royallib.com/get/txt/{author_folder}/{title_clean}.zip"

        print(f"\nüîç –û–±—Ä–∞–±–æ—Ç–∫–∞: {title} by {author}")
        print(f"  üìÑ –°—Ç—Ä–∞–Ω–∏—Ü–∞ –∫–Ω–∏–≥–∏: {book_page_url}")
        print(f"  üíæ ZIP-–∞—Ä—Ö–∏–≤: {download_url}")

        try:
            print("  ‚Üí –ü–æ—Å–µ—â–µ–Ω–∏–µ —Å—Ç—Ä–∞–Ω–∏—Ü—ã –∫–Ω–∏–≥–∏...")
            response = session.get(book_page_url, timeout=15)
            if response.status_code != 200:
                print(f"  ‚ùå –û—à–∏–±–∫–∞ –¥–æ—Å—Ç—É–ø–∞ –∫ —Å—Ç—Ä–∞–Ω–∏—Ü–µ –∫–Ω–∏–≥–∏: {response.status_code}")
                continue
            time.sleep(1.5)

            session.headers.update({'Referer': book_page_url})
            print("  ‚Üí –°–∫–∞—á–∏–≤–∞–Ω–∏–µ ZIP...")
            response = session.get(download_url, stream=True, timeout=15)

            if response.status_code != 200:
                print(f"  ‚ùå –û—à–∏–±–∫–∞ —Å–∫–∞—á–∏–≤–∞–Ω–∏—è ZIP: {response.status_code}")
                continue

            zip_path = os.path.join(zip_dir, f"{title_clean}.zip")
            with open(zip_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"  ‚úÖ –ê—Ä—Ö–∏–≤ —Å–æ—Ö—Ä–∞–Ω—ë–Ω: {zip_path}")

            if not zipfile.is_zipfile(zip_path):
                print("  ‚ùå –§–∞–π–ª –Ω–µ —è–≤–ª—è–µ—Ç—Å—è ZIP-–∞—Ä—Ö–∏–≤–æ–º.")
                with open(zip_path, 'r', encoding='utf-8', errors='ignore') as f:
                    head = f.read(500)
                    print(f"  üîç –ù–∞—á–∞–ª–æ —Ñ–∞–π–ª–∞:\n{head}")
                continue

            with open(zip_path, 'rb') as f:
                raw_data = f.read(1024)

            if raw_data.startswith(b'PK'):
                print("  ‚úÖ –≠—Ç–æ –Ω–∞—Å—Ç–æ—è—â–∏–π ZIP")
            else:
                try:
                    html_preview = raw_data.decode('cp1251', errors='ignore')
                    print(f"  ‚ùå –≠—Ç–æ –ù–ï ZIP, –∞ HTML:\n{html_preview[:500]}")
                except:
                    print(f"  ‚ùå –ù–µ —É–¥–∞–ª–æ—Å—å —Ä–∞—Å–ø–æ–∑–Ω–∞—Ç—å —Å–æ–¥–µ—Ä–∂–∏–º–æ–µ: {raw_data[:100]}")

            extracted_dir = os.path.join(zip_dir, f"{title_clean}_extracted")
            os.makedirs(extracted_dir, exist_ok=True)
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extracted_dir)

            txt_files = [f for f in os.listdir(extracted_dir) if f.endswith('.txt')]
            if not txt_files:
                print("  ‚ùå TXT-—Ñ–∞–π–ª –Ω–µ –Ω–∞–π–¥–µ–Ω –≤ –∞—Ä—Ö–∏–≤–µ")
                continue

            txt_path = os.path.join(extracted_dir, txt_files[0])

            content = read_with_encoding(txt_path)

            print("üîç –ü–µ—Ä–≤—ã–µ 500 —Å–∏–º–≤–æ–ª–æ–≤ –î–û –æ—á–∏—Å—Ç–∫–∏:")
            print(repr(content[:500]))

            cleaned_content = clean_text(content)

            print("\nüîç –ü–µ—Ä–≤—ã–µ 500 —Å–∏–º–≤–æ–ª–æ–≤ –ü–û–°–õ–ï –æ—á–∏—Å—Ç–∫–∏:")
            print(repr(cleaned_content[:500]))

            output_filename = f"{title} by {author}.txt"
            output_path = os.path.join(txt_dir, output_filename)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(cleaned_content)

            print(f"  ‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ: {output_path}")

            shutil.rmtree(extracted_dir)
            os.remove(zip_path)

        except Exception as e:
            print(f"  ‚ö†Ô∏è –û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—Ä–∞–±–æ—Ç–∫–µ '{title}': {e}")

print("\nüéâ –í—Å–µ –∫–Ω–∏–≥–∏ –æ–±—Ä–∞–±–æ—Ç–∞–Ω—ã!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

üîç –û–±—Ä–∞–±–æ—Ç–∫–∞: –ö–∞–ø–∏—Ç–∞–Ω—Å–∫–∞—è –¥–æ—á–∫–∞ by –ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ü—É—à–∫–∏–Ω
  üìÑ –°—Ç—Ä–∞–Ω–∏—Ü–∞ –∫–Ω–∏–≥–∏: https://royallib.com/book/pushkin_aleksandr/kapitanskaya_dochka.html
  üíæ ZIP-–∞—Ä—Ö–∏–≤: https://royallib.com/get/txt/pushkin_aleksandr/kapitanskaya_dochka.zip
  ‚Üí –ü–æ—Å–µ—â–µ–Ω–∏–µ —Å—Ç—Ä–∞–Ω–∏—Ü—ã –∫–Ω–∏–≥–∏...
  ‚Üí –°–∫–∞—á–∏–≤–∞–Ω–∏–µ ZIP...
  ‚úÖ –ê—Ä—Ö–∏–≤ —Å–æ—Ö—Ä–∞–Ω—ë–Ω: /content/drive/MyDrive/tf-idf/zip/kapitanskaya_dochka.zip
  ‚úÖ –≠—Ç–æ –Ω–∞—Å—Ç–æ—è—â–∏–π ZIP
üîç –ü–µ—Ä–≤—ã–µ 500 —Å–∏–º–≤–æ–ª–æ–≤ –î–û –æ—á–∏—Å—Ç–∫–∏:
'–°–ø–∞—Å–∏–±–æ, —á—Ç–æ —Å–∫–∞—á–∞–ª–∏ –∫–Ω–∏–≥—É –≤ –±–µ—Å–ø–ª–∞—Ç–Ω–æ–π —ç–ª–µ–∫—Ç—Ä–æ–Ω–Ω–æ–π –±–∏–±–ª–∏–æ—Ç–µ–∫–µ Royallib.ru: http://royallib.ru\n\n–í—Å–µ –∫–Ω–∏–≥–∏ –∞–≤—Ç–æ—Ä–∞: http://royallib.ru/author/pushkin_aleksandr.html\n\n–≠—Ç–∞ –∂–µ –∫–Ω–∏–≥–∞ –≤ –¥—Ä—É–≥–∏—Ö —Ñ–æ—Ä–º–∞—Ç–∞—Ö: http

KeyboardInterrupt: 

# To lowercase manual

In [None]:
from google.colab import drive
import os

drive.mount('/content/drive')

folder_path = '/content/drive/MyDrive/tf-idf/txt/'

try:
    import chardet
except ImportError:
    print("–£—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞–µ–º –±–∏–±–ª–∏–æ—Ç–µ–∫—É chardet...")
    !pip install chardet
    import chardet

if not os.path.exists(folder_path):
    print(f"–ü–∞–ø–∫–∞ –Ω–µ –Ω–∞–π–¥–µ–Ω–∞: {folder_path}")
else:
    txt_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.txt')]

    if not txt_files:
        print("–í –ø–∞–ø–∫–µ –Ω–µ—Ç .txt —Ñ–∞–π–ª–æ–≤.")
    else:
        print(f"–ù–∞–π–¥–µ–Ω–æ —Ñ–∞–π–ª–æ–≤: {len(txt_files)}")

        for filename in txt_files:
            file_path = os.path.join(folder_path, filename)

            try:
                with open(file_path, 'rb') as file:
                    raw_data = file.read()

                detected = chardet.detect(raw_data)
                encoding = detected['encoding']
                confidence = detected['confidence']

                print(f"{filename}: –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∞ –∫–æ–¥–∏—Ä–æ–≤–∫–∞ {encoding} (—É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å: {confidence:.2f})")

                try:
                    text = raw_data.decode(encoding)
                except (UnicodeDecodeError, TypeError):
                    print(f"  –û—à–∏–±–∫–∞ –¥–µ–∫–æ–¥–∏—Ä–æ–≤–∞–Ω–∏—è {filename} –∫–∞–∫ {encoding}, –ø—Ä–æ–±—É–µ–º utf-8...")
                    text = raw_data.decode('utf-8', errors='replace')

                lower_text = text.lower()

                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(lower_text)

                print(f"‚úÖ –û–±—Ä–∞–±–æ—Ç–∞–Ω –∏ —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤ UTF-8: {filename}")

            except Exception as e:
                print(f"‚ùå –û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—Ä–∞–±–æ—Ç–∫–µ {filename}: {e}")

print("–ì–æ—Ç–æ–≤–æ!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
–ù–∞–π–¥–µ–Ω–æ —Ñ–∞–π–ª–æ–≤: 26
–ö–∞–ø–∏—Ç–∞–Ω—Å–∫–∞—è –¥–æ—á–∫–∞ by –ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ü—É—à–∫–∏–Ω.txt: –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∞ –∫–æ–¥–∏—Ä–æ–≤–∫–∞ utf-8 (—É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å: 0.99)
‚úÖ –û–±—Ä–∞–±–æ—Ç–∞–Ω –∏ —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤ UTF-8: –ö–∞–ø–∏—Ç–∞–Ω—Å–∫–∞—è –¥–æ—á–∫–∞ by –ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ü—É—à–∫–∏–Ω.txt
–ï–≤–≥–µ–Ω–∏–∏ÃÜ –û–Ω–µ–≥–∏–Ω by –ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ü—É—à–∫–∏–Ω.txt: –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∞ –∫–æ–¥–∏—Ä–æ–≤–∫–∞ utf-8 (—É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å: 0.99)
‚úÖ –û–±—Ä–∞–±–æ—Ç–∞–Ω –∏ —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤ UTF-8: –ï–≤–≥–µ–Ω–∏–∏ÃÜ –û–Ω–µ–≥–∏–Ω by –ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ü—É—à–∫–∏–Ω.txt
–ú—É–º—É by –ò–≤–∞–Ω –¢—É—Ä–≥–µ–Ω–µ–≤.txt: –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∞ –∫–æ–¥–∏—Ä–æ–≤–∫–∞ utf-8 (—É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å: 0.99)
‚úÖ –û–±—Ä–∞–±–æ—Ç–∞–Ω –∏ —Å–æ—Ö—Ä–∞–Ω—ë–Ω –≤ UTF-8: –ú—É–º—É by –ò–≤–∞–Ω –¢—É—Ä–≥–µ–Ω–µ–≤.txt
–í–∏–∏ÃÜ by –ù–∏–∫–æ–ª–∞–∏ÃÜ –ì–æ–≥–æ–ª—å.txt: –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∞ –∫–æ–¥–∏—Ä