<a href="https://colab.research.google.com/github/KavishaMadani/Text_Summarization_using_Optimized_Cur/blob/main/Text_Summarization_using_Optimized_CUR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Summarization using DEIM CUR

In [None]:
import pandas as pd

In [None]:
# Read the news dataset from the Excel workbook into a DataFrame and display it.
df = pd.read_excel("news_data.xlsx")
df

Unnamed: 0,text,ctext
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...
...,...,...
95,Shops at the domestic terminal of Delhi Airpor...,"New Delhi, Aug 1 (PTI) Shops at the domestic t..."
96,The US Air Force is negotiating with Boeing to...,Facing pressure from President Donald Trump to...
97,The Allahabad High Court on Tuesday said it wo...,"Allahabad, Aug 01 (PTI) The Allahabad High Cou..."
98,As many as 13 people died while travelling in ...,Motorman Mahendra Prasad called up the railway...


In [None]:
import pandas as pd
import time
import tensorflow_hub as hub
import numpy as np
import math
import re

# Load the Universal Sentence Encoder (v4) module from TF Hub. `embed` is the
# callable that encode_sentences() uses to turn sentences into embeddings.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def encode_sentences(sentences):
    """Embed a batch of sentences with the module-level `embed` model.

    Args:
        sentences: The sentences to encode; passed to `embed` unchanged.

    Returns:
        Whatever `embed(sentences)` returns. Callers read `.shape[0]` of the
        result as the number of sentences.
    """
    sentence_vectors = embed(sentences)
    return sentence_vectors

def CUR(matrix):
    """Return singular-value-scaled rank-k factors of `matrix`.

    The matrix is decomposed with a thin SVD and truncated to
    ``k = max(1, min(matrix.shape) // 2)`` components. The lower bound of one
    keeps the factors non-empty for a single-row (single-sentence) input, for
    which the plain integer division would give ``k == 0``.

    NOTE(review): despite the name, the returned factors are built from
    singular vectors, not from sampled columns/rows of `matrix`.

    Args:
        matrix: 2-D array-like (anything `np.asarray` accepts).

    Returns:
        tuple: ``(C, R)`` where ``C = U[:, :k] * s[:k]`` has one row per row
        of `matrix`, and ``R = s[:k, None] * Vt[:k, :]`` has one column per
        column of `matrix`.
    """
    matrix = np.asarray(matrix)
    U, s, Vt = np.linalg.svd(matrix, full_matrices=False)
    k = max(1, min(matrix.shape) // 2)
    # Scaling by the singular values via broadcasting is equivalent to
    # multiplying with np.diag(s[:k]) but avoids building the dense k x k matrix.
    C = U[:, :k] * s[:k]
    R = s[:k, np.newaxis] * Vt[:k, :]
    return C, R

def summarize(paragraph, ratio, use_first=True):
    """Build an extractive summary by ranking sentences with the `CUR` factors.

    The paragraph is split into sentences at runs of spaces that follow a
    '.' or '?' and precede an uppercase letter (the look-behind additionally
    requires that the character two positions before the punctuation is not
    an uppercase letter). Sentences are embedded, scored by the row-wise
    absolute sum of the `C` factor, and the top ``floor(ratio * n)`` are kept
    in descending score order.

    Args:
        paragraph: Text to summarize.
        ratio: Fraction of the sentences to keep.
        use_first: When true, the first sentence always opens the summary.

    Returns:
        str: The selected sentences joined by single spaces. May be empty when
        `use_first` is false and ``floor(ratio * n)`` is zero.
    """
    sentences = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', paragraph)
    embeddings = encode_sentences(sentences)
    num_sentences = embeddings.shape[0]
    sample_num = math.floor(ratio * num_sentences)
    C, _ = CUR(embeddings)
    ranked = np.argsort(np.sum(np.abs(C), axis=1))[::-1]
    selected = [int(r) for r in ranked[:sample_num]]

    # Lead with sentence 0 when requested. Guarding on an empty selection
    # avoids the IndexError that short inputs (sample_num == 0) used to raise,
    # and filtering index 0 out of the tail stops the first sentence from
    # appearing twice when it was selected but not top-ranked.
    if use_first and (not selected or selected[0] != 0):
        selected = [0] + [r for r in selected if r != 0]

    return ' '.join(sentences[r] for r in selected)

# Summarize every article at a 50% sentence ratio and time the whole pass.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
df['stext'] = df['ctext'].apply(lambda x: summarize(x, 0.5, use_first=True))
end_time = time.perf_counter()
time_taken = end_time - start_time
print(f"Time taken for Text summarization using CUR: {time_taken} seconds")
df[['ctext', 'stext']].head()

Time taken for Text summarization using CUR: 2.434907913208008 seconds


Unnamed: 0,ctext,stext
0,The Daman and Diu administration on Wednesday ...,The Daman and Diu administration on Wednesday ...
1,"From her special numbers to TV?appearances, Bo...","From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Mumbai and other Indian cities are t...,Hotels in Mumbai and other Indian cities are t...


In [None]:
!pip install rouge



In [None]:
from rouge import Rouge

def evaluate_summarization(df, reference_col='ctext', summary_col='stext'):
    """Average ROUGE-1/2/L precision, recall and F-score over all rows of `df`.

    Each row's `summary_col` text is scored as the hypothesis against its
    `reference_col` text.

    NOTE(review): with the default ``reference_col='ctext'`` an extractive
    summary is compared with the article it was extracted from, which is why
    the recorded precision is ~1.0. Presumably the `text` column holds the
    reference summaries; if so, pass ``reference_col='text'`` — confirm
    against the dataset.

    Args:
        df: DataFrame holding the reference and summary columns.
        reference_col: Name of the column used as the ROUGE reference.
        summary_col: Name of the column used as the ROUGE hypothesis.

    Returns:
        dict: ``{'rouge-1' | 'rouge-2' | 'rouge-l': {'precision', 'recall',
        'f_score'}}`` with per-row scores averaged over ``len(df)``; all
        zeros for an empty frame.
    """
    rouge = Rouge()
    # Maps the result keys exposed to callers to the keys used by `rouge`.
    metric_keys = {'precision': 'p', 'recall': 'r', 'f_score': 'f'}
    rouge_scores = {
        variant: {name: 0 for name in metric_keys}
        for variant in ('rouge-1', 'rouge-2', 'rouge-l')
    }

    num_rows = len(df)
    if num_rows == 0:
        return rouge_scores

    # zip over the two columns avoids building a Series per row as iterrows() does.
    for reference_text, summarized_text in zip(df[reference_col], df[summary_col]):
        scores = rouge.get_scores([summarized_text], [reference_text])[0]
        for variant, totals in rouge_scores.items():
            if variant in scores:
                for name, short_key in metric_keys.items():
                    totals[name] += scores[variant][short_key]

    for totals in rouge_scores.values():
        for name in totals:
            totals[name] /= num_rows

    return rouge_scores

# Report the averaged ROUGE metrics, one line per ROUGE variant.
rouge_scores = evaluate_summarization(df)
for key in rouge_scores:
    value = rouge_scores[key]
    print(f"{key}: Precision={value['precision']}, Recall={value['recall']}, F-score={value['f_score']}")


rouge-1: Precision=1.0, Recall=0.5867802664377397, F-score=0.7273520938387533
rouge-2: Precision=0.9743418347043066, Recall=0.5106477191675524, F-score=0.653735802206098
rouge-l: Precision=0.9996350364963504, Recall=0.5866184541400051, F-score=0.7271278785921166


# Text Summarization using Fast Deterministic CUR

In [None]:
def fast_det_cur(X, p, Q, lambda_values, epsilon):
    """Iteratively build a (p, p) weight matrix `W` from `X`.

    For each of up to `Q` outer iterations, every index still in the active
    set either has its row of `W` zeroed (upper bound <= current lambda) or
    recomputed by `update_W_i`. Indices whose lower bound exceeds the current
    lambda are then removed from the active set.

    NOTE(review): `W_tilde` is created as a zero vector and never reassigned,
    so the bounds and the early-exit check always compare `W` against zeros.

    Args:
        X: Data matrix; indexed as `X[i]` and `X[:, mask]` by `update_W_i`.
        p: Size of `W` and number of candidate indices.
        Q: Maximum number of outer iterations; `lambda_values[o]` is read for
            each ``o in range(Q)``.
        lambda_values: Indexable sequence of thresholds, one per iteration.
        epsilon: Absolute tolerance for the `np.allclose` early-exit check.

    Returns:
        numpy.ndarray: The (p, p) matrix `W`.
    """
    A = set(range(p))
    W = np.zeros((p, p))
    W_tilde = np.zeros(p)
    G = np.dot(X.T, X)

    for o in range(Q):
        lam = lambda_values[o]

        for i in A:
            if compute_upper_bound(W, W_tilde, G, i) <= lam:
                W[i] = np.zeros(p)
            else:
                # update_W_i writes row i of W in place.
                update_W_i(W, W_tilde, G, i, lam, X)

        lower_bounds = compute_lower_bound(W, W_tilde, G, lam)
        M = {i for i, lower_bound in enumerate(lower_bounds) if lower_bound > lam}
        A = A - M

        if np.allclose(W, W_tilde, atol=epsilon):
            break
    return W

def compute_upper_bound(W, W_1, G, i):
    """Compute the upper-bound value used to decide whether row `i` is zeroed.

    Args:
        W: Current weight matrix.
        W_1: Reference weights; broadcast against `W` and indexed with `i`.
        G: Matrix whose diagonal entry `G[i, i]` enters the bound.
        i: Row index being tested.

    Returns:
        ``W_1[i] + sqrt(delta + G[i, i])`` where
        ``delta = (||W - W_1||^2 - ||W[i] - W_1[i]||^2) / 2 + ||W[i] - W_1[i]||^2``.
    """
    total_sq = np.linalg.norm(W - W_1) ** 2
    row_sq = np.linalg.norm(W[i] - W_1[i]) ** 2
    delta = (total_sq - row_sq) / 2 + row_sq
    return W_1[i] + np.sqrt(delta + G[i, i])

def compute_lower_bound(W, W_1, G, lambda_value):
    """Compute the per-index lower bounds used to shrink the active set.

    NOTE(review): `delta` subtracts ``||W - W_1||^2`` from itself, so it is
    zero for finite inputs and the result reduces to
    ``W_1 - sqrt(2 * lambda_value * sum(G**2, axis=0))`` — confirm whether a
    different second term was intended.

    Args:
        W: Current weight matrix.
        W_1: Reference weights, broadcast against `W`.
        G: Matrix whose squared entries are summed along axis 0.
        lambda_value: Current threshold.

    Returns:
        ``W_1 - sqrt(delta + 2 * lambda_value * sum(G**2, axis=0))``.
    """
    gap_sq = np.linalg.norm(W - W_1) ** 2
    delta = (gap_sq - gap_sq) / 2
    column_energy = np.sum(G ** 2, axis=0)
    return W_1 - np.sqrt(delta + 2 * lambda_value * column_energy)

def update_W_i(W, W_1, G, i, lambda_value, X):
    """Overwrite row `i` of `W` in place with a thresholded scalar weight.

    `zi` is `X[i]` multiplied into the part of `X` left after removing its
    projection onto the columns of `X` where `W_1` is non-zero. The whole row
    `W[i]` is then set to ``(1 - lambda_value / ||zi||^2) * ||zi||^2`` when
    that factor is positive, and to 0 otherwise.

    NOTE(review): a zero `zi` makes the division below divide by zero.

    Args:
        W: Weight matrix; modified in place.
        W_1: 1-D weights whose non-zero entries select columns of `X`.
        G: Unused.
        i: Index of the row of `W` (and of `X`) to process.
        lambda_value: Current threshold.
        X: Data matrix.

    Returns:
        The same `W` object that was passed in.
    """
    active = W_1 != 0
    X_active = X[:, active]
    projected = np.dot(np.dot(X_active, np.linalg.pinv(X_active)), X)
    zi = np.dot(X[i].T, X - projected)
    den = 1 - lambda_value / np.linalg.norm(zi) ** 2
    W[i] = den * np.dot(zi, zi.T) if den > 0 else 0
    return W


def compute_cur_matrices(X):
    """Run `fast_det_cur` and assemble the (C, U, R) matrices from its weights.

    FIXME(review): only the shape of the argument is used. `X` is replaced by
    a fresh random (p, p) matrix before the decomposition, so the returned
    matrices do not depend on the caller's data, and `lambda_values` is also
    random, so results differ between calls. The replacement is left in place
    because `update_W_i` multiplies `X[i]` with `X`, which only works for a
    square matrix; feeding the real data needs a redesign of the helpers.

    Args:
        X: 2-D array; only ``min(X.shape) // 2`` is read from it.

    Returns:
        tuple: ``(C, U, R)`` where `C` holds the columns of the random matrix
        whose column sum in `W` is non-zero, ``U = pinv(C) @ X`` and `R` is a
        diagonal matrix built from every non-zero entry of `W`.
    """
    p = min(X.shape) // 2
    Q = 5
    lambda_values = np.random.rand(Q)
    epsilon = 1e-6

    # FIXME(review): discards the input matrix (see docstring).
    X = np.random.rand(p, p)
    W = fast_det_cur(X, p, Q, lambda_values, epsilon)
    W = W / 100
    C = X[:, np.sum(W, axis=0) != 0]
    U = np.linalg.pinv(C) @ X
    # NOTE(review): W[W != 0] flattens all non-zero entries, so R can be as
    # large as (p*p, p*p).
    R = np.diag(W[W != 0])
    # The Gram matrix and its row norms that used to be computed here were
    # never read, so that dead work has been dropped.

    return C, U, R


In [None]:
# Smoke-test fast_det_cur on a random 10x10 matrix. No random seed is set in
# the visible cells, so the values differ on every run.
p = 10
Q = 5
lambda_values = np.random.rand(Q)  # one threshold per outer iteration
epsilon = 1e-6  # absolute tolerance of the early-exit check in fast_det_cur

X = np.random.rand(p, p)
W = fast_det_cur(X, p, Q, lambda_values, epsilon)

In [None]:
def encode_sentences(sentences):
    """Return the `embed` model's embeddings for `sentences`.

    This redefines the identical helper from the earlier summarization cell.

    Args:
        sentences: The sentences to encode; forwarded to `embed` as-is.

    Returns:
        The result of ``embed(sentences)``.
    """
    encoded = embed(sentences)
    return encoded

def summarize(paragraph, ratio, use_first=True):
    """Build an extractive summary ranked with `compute_cur_matrices`.

    The paragraph is split into sentences at runs of spaces that follow a
    '.' or '?' and precede an uppercase letter. Sentences are embedded and
    the `C` matrix from `compute_cur_matrices` is scored by row-wise absolute
    sum; the top ``floor(ratio * n)`` row indices select the sentences, in
    descending score order.

    Args:
        paragraph: Text to summarize.
        ratio: Fraction of the sentences to keep.
        use_first: When true, the first sentence always opens the summary.

    Returns:
        str: The selected sentences joined by single spaces. May be empty when
        `use_first` is false and nothing is selected.
    """
    sentences = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', paragraph)
    embeddings = np.array(encode_sentences(sentences))
    num_sentences = embeddings.shape[0]
    sample_num = math.floor(ratio * num_sentences)
    C, U, R = compute_cur_matrices(embeddings)
    ranked = np.argsort(np.sum(np.abs(C), axis=1))[::-1]
    # Keep only indices that address an existing sentence.
    selected = [int(r) for r in ranked[:sample_num] if r < len(sentences)]

    # Lead with sentence 0 when requested. Guarding on an empty selection
    # avoids the IndexError that short inputs used to raise, and filtering
    # index 0 out of the tail stops the first sentence from appearing twice
    # when it was selected but not top-ranked.
    if use_first and (not selected or selected[0] != 0):
        selected = [0] + [r for r in selected if r != 0]

    return ' '.join(sentences[r] for r in selected)

In [None]:
def summarize_text(row):
    """Summarize one DataFrame row's `ctext` value to 50% of its sentences.

    Args:
        row: Mapping-like row (e.g. from ``df.apply(..., axis=1)``) with a
            'ctext' entry.

    Returns:
        str: The summary produced by `summarize` with a ratio of 0.5.
    """
    return summarize(row['ctext'], 0.5)

In [None]:
# Summarize every row, time the pass, and persist the result to CSV.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
df['stext'] = df.apply(summarize_text, axis=1)
end_time = time.perf_counter()
time_taken = end_time - start_time
print(f"Time taken for text summarization using Fast Deterministic approach for CUR: {time_taken} seconds")
df.to_csv('summarized_dataset.csv', index=False)
df.head()

Time taken for text summarization using Fast Deterministic approach for CUR: 1.8460538387298584 seconds


Unnamed: 0,text,ctext,stext
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...,The Daman and Diu administration on Wednesday ...
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo...","From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...,Hotels in Mumbai and other Indian cities are t...


In [None]:
from rouge import Rouge

def evaluate_summarization_fast_det_cur(df):
    """Average ROUGE-1/2/L scores of `df['stext']` against `df['ctext']`.

    This used to be a line-for-line copy of `evaluate_summarization`; it now
    delegates to that function so the scoring logic lives in one place.

    Args:
        df: DataFrame with 'ctext' and 'stext' columns.

    Returns:
        dict: ``{'rouge-1' | 'rouge-2' | 'rouge-l': {'precision', 'recall',
        'f_score'}}`` averaged over the rows of `df`.
    """
    return evaluate_summarization(df)

# Report the averaged ROUGE metrics for the fast deterministic run.
rouge_scores = evaluate_summarization_fast_det_cur(df)
for key in rouge_scores:
    value = rouge_scores[key]
    print(f"{key}: Precision={value['precision']}, Recall={value['recall']}, F-score={value['f_score']}")

rouge-1: Precision=1.0, Recall=0.5726788480065357, F-score=0.7198359932640923
rouge-2: Precision=0.9799219802933272, Recall=0.5054353531602179, F-score=0.6570836950051218
rouge-l: Precision=0.9997499999999999, Recall=0.572549398168348, F-score=0.7196654175711287


In [None]:
# Print the generated summary of the first row in full.
stext1 = df.at[0, 'stext']
print(stext1)

The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media.The union territory?s administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.?It has been decided to celebrate the festival of Rakshabandhan on August 7. The circular was withdrawn through a one-line order issued late in the evening by the UT?s department of personnel and administrative reforms.?The circular is ridiculous. In this connection, all offices/ departments shall remain open and celebrate the festival collectively at a suitable time wherein all the lady staff shall tie rakhis to their colleagues,? the order, issued on August 1 by Gurpreet Singh, deputy secretary (personnel), had said.To ensure that no one skipped office, an attendance report was to be sent to the governm