In [None]:
!pip install pytesseract opencv-python Pillow pandas nltk gensim sumy PyDrive

In [33]:
 !sudo apt install tesseract-ocr tesseract-ocr-ben

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
tesseract-ocr-ben is already the newest version (1:4.00~git30-7274cfa-1.1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


In [34]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## OCR

In [35]:
import pytesseract
from PIL import Image
import cv2
import os
import pandas as pd
from datetime import datetime

image_dir = '/content/drive/MyDrive/BanglaNewsSummarizer/data/'
output_text_dir = '/content/drive/MyDrive/BanglaNewsSummarizer/output/text/raw/'
os.makedirs(output_text_dir, exist_ok=True)


for filename in os.listdir(image_dir):
    if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
        image_path = os.path.join(image_dir, filename)

        img = cv2.imread(image_path)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        gray = cv2.medianBlur(gray, 3)
        gray = cv2.resize(gray, None, fx=2.0, fy=2.0)

        pil_img = Image.fromarray(gray)

        try:
            text = pytesseract.image_to_string(pil_img, lang='ben')
        except Exception as e:
            text = ""
            print(f"OCR failed for {filename}: {e}")

        output_file = os.path.join(output_text_dir, os.path.splitext(filename)[0] + '.txt')
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(text)

## Cleaning

In [36]:
import os
import re

raw_text_dir = '/content/drive/MyDrive/BanglaNewsSummarizer/output/text/raw/'
clean_text_dir = '/content/drive/MyDrive/BanglaNewsSummarizer/output/text/clean/'
os.makedirs(clean_text_dir, exist_ok=True)

bangla_pattern = re.compile(r'[^\u0980-\u09FF\s।]')

def segment_sentences(text):
    sentences = re.split(r'।|\n', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences

for filename in os.listdir(raw_text_dir):
    if filename.endswith('.txt'):
        raw_file_path = os.path.join(raw_text_dir, filename)

        with open(raw_file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        text = bangla_pattern.sub(' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        sentences = segment_sentences(text)

        cleaned_text = '\n'.join(sentences)

        clean_file_path = os.path.join(clean_text_dir, filename)
        with open(clean_file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

print(f"Text cleaning completed. Cleaned files saved in: {clean_text_dir}")


Text cleaning completed. Cleaned files saved in: /content/drive/MyDrive/BanglaNewsSummarizer/output/text/clean/


## Remove stopwords

In [37]:

clean_text_dir = '/content/drive/MyDrive/BanglaNewsSummarizer/output/text/clean/'
no_stopword_dir = '/content/drive/MyDrive/BanglaNewsSummarizer/output/text/no_stopwords/'
stopwords_file = '/content/drive/MyDrive/BanglaNewsSummarizer/resources/stopwords-bn.txt'
os.makedirs(no_stopword_dir, exist_ok=True)

with open(stopwords_file, 'r', encoding='utf-8') as f:
    stopwords = set([line.strip() for line in f if line.strip()])

def remove_stopwords(text, stopwords):
    words = text.split()
    filtered_words = [w for w in words if w not in stopwords]
    return ' '.join(filtered_words)

for filename in os.listdir(clean_text_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(clean_text_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        filtered_text = remove_stopwords(text, stopwords)

        output_file = os.path.join(no_stopword_dir, filename)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(filtered_text)

print(f"Stopword removal completed. Processed files saved in: {no_stopword_dir}")


Stopword removal completed. Processed files saved in: /content/drive/MyDrive/BanglaNewsSummarizer/output/text/no_stopwords/


In [38]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [39]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Tokenization and Preprocessing

In [40]:
import os
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

no_stopword_dir = '/content/drive/MyDrive/BanglaNewsSummarizer/output/text/no_stopwords/'
preprocessed_dir = '/content/drive/MyDrive/BanglaNewsSummarizer/output/text/preprocessed/'
os.makedirs(preprocessed_dir, exist_ok=True)

def preprocess_for_summary(text):
    sentences = sent_tokenize(text)

    tokenized_sentences = []
    for sent in sentences:
        words = word_tokenize(sent)
        words = [w.lower() for w in words if w.strip()]
        tokenized_sentences.append(' '.join(words))

    preprocessed_text = '\n'.join(tokenized_sentences)
    return preprocessed_text

for filename in os.listdir(no_stopword_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(no_stopword_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        preprocessed_text = preprocess_for_summary(text)

        output_file = os.path.join(preprocessed_dir, filename)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(preprocessed_text)

print(f"Tokenization & preprocessing completed. Files saved in: {preprocessed_dir}")


Tokenization & preprocessing completed. Files saved in: /content/drive/MyDrive/BanglaNewsSummarizer/output/text/preprocessed/


## Output & Summaries

In [42]:
import os
import json
import pandas as pd
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

preprocessed_dir = '/content/drive/MyDrive/BanglaNewsSummarizer/output/text/preprocessed/'
image_dir = '/content/drive/MyDrive/BanglaNewsSummarizer/data/'
summary_output_dir = '/content/drive/MyDrive/BanglaNewsSummarizer/output/summaries/'
file_output_dir = '/content/drive/MyDrive/BanglaNewsSummarizer/output/'
os.makedirs(summary_output_dir, exist_ok=True)

csv_path = os.path.join(file_output_dir, 'summaries.csv')
json_path = os.path.join(file_output_dir, 'summaries.json')

def generate_drive_link(filename):
    return f"https://drive.google.com/drive/folders/1RtjjlESwGPELEhpq7LhBr-zVRk41hnQT?usp=sharing/{filename}"

def frequency_based_summary(text, ratio=0.2):
    sentences = text.split('\n')
    word_freq = {}
    for sent in sentences:
        for word in sent.split():
            word_freq[word] = word_freq.get(word, 0) + 1
    sentence_scores = {sent: sum(word_freq.get(w, 0) for w in sent.split()) for sent in sentences}
    top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    n = max(1, int(len(sentences) * ratio))
    return '। '.join(top_sentences[:n]) + '।'

class BanglaTokenizer(Tokenizer):
    def __init__(self):
        super().__init__('english')
    def to_sentences(self, text):
        return [s.strip() for s in text.split('\n') if s.strip()]


summaries = []

for filename in os.listdir(preprocessed_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(preprocessed_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read().strip()

        lines = [line.strip() for line in text.split('\n') if line.strip()]
        if len(lines) > 0:
            first_line = lines[0]
            first_sentence = first_line.split('।')[0]
            words = first_sentence.split()
            headline = ' '.join(words[:12]) + ('...' if len(words) > 12 else '')
        else:
            headline = filename.replace('.txt', '')


        parser = PlaintextParser.from_string(text, BanglaTokenizer())
        summarizer = LexRankSummarizer()
        summary_sentences = summarizer(parser.document, sentences_count=max(1, int(len(lines) * 0.2)))
        summary = '। '.join([str(sentence) for sentence in summary_sentences]) + '।'


        if len(summary.strip()) < 15:
            summary = frequency_based_summary(text)


        summary_filename = filename.replace('.txt', '_summary.txt')
        summary_path = os.path.join(summary_output_dir, summary_filename)
        with open(summary_path, 'w', encoding='utf-8') as sf:
            sf.write(summary)


        summaries.append({
            'title': headline,
            'link': generate_drive_link(filename.replace('.txt', '.jpg')),
            'summary': summary
        })

df = pd.DataFrame(summaries)
df.to_csv(csv_path, index=False, encoding='utf-8-sig')


with open(json_path, 'w', encoding='utf-8') as jf:
    json.dump(summaries, jf, ensure_ascii=False, indent=2)


df.head()


Unnamed: 0,title,link,summary
0,সংস্কৃতিকমী নর খোকন ৪৯ সাড়ে ১১ ৃ রা ইন্না লিল...,https://drive.google.com/drive/folders/1RtjjlE...,সংস্কৃতিকমী নর খোকন ৪৯ সাড়ে ১১ ৃ রা ইন্না লিল...
1,চ গ্রামে ১ ৭০০ কেজি চা জব্দ নিজস্ব প্রতিবেদক চ...,https://drive.google.com/drive/folders/1RtjjlE...,চ গ্রামে ১ ৭০০ কেজি চা জব্দ নিজস্ব প্রতিবেদক চ...
2,অম্ন্যাশয় পাচক রস তৈরি চেয়েছিলেন বিচিত্র ইউপ...,https://drive.google.com/drive/folders/1RtjjlE...,অম্ন্যাশয় পাচক রস তৈরি চেয়েছিলেন বিচিত্র ইউপ...
3,গম ৬ মহাসড়কে বাসের যাত্রীদের টার্গেট ছিনতাই ৮...,https://drive.google.com/drive/folders/1RtjjlE...,গম ৬ মহাসড়কে বাসের যাত্রীদের টার্গেট ছিনতাই ৮...
4,আআ বিষয়ে মতিন বছরের গ তীর শক্মা ব্যাংকে ্রা্ট...,https://drive.google.com/drive/folders/1RtjjlE...,আআ বিষয়ে মতিন বছরের গ তীর শক্মা ব্যাংকে ্রা্ট...
