## **Mount Google drive**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Loading dataset**

In [None]:
import json
import pandas as pd

# Read the JSON file into a Pandas DataFrame
dataset = pd.read_json(r"/content/drive/MyDrive/email_thread_details.json")

# Use only the first 1000 entries
dataset = dataset.head(1000)

# Verify the modification
print(dataset.head())

   thread_id                     subject           timestamp  \
0          1  FW: Master Termination Log 2002-01-29 11:23:42   
1          1  FW: Master Termination Log 2002-01-31 12:50:00   
2          1  FW: Master Termination Log 2002-02-05 15:03:35   
3          1  FW: Master Termination Log 2002-02-05 15:06:25   
4          1  FW: Master Termination Log 2002-05-28 07:20:35   

                          from  \
0  Gossett, Jeffrey C. JGOSSET   
1      Theriot, Kim S. KTHERIO   
2      Theriot, Kim S. KTHERIO   
3      Theriot, Kim S. KTHERIO   
4   Kelly, Katherine L. KKELLY   

                                                  to  \
0  [Giron, Darron C. Dgiron, Love, Phillip M. Plove]   
1  [Murphy, Melissa Mmurphy, Gossett, Jeffrey C. ...   
2  [Murphy, Melissa Mmurphy, Anderson, Diane Dand...   
3  [Hall, D. Todd Thall, Sweeney, Kevin Ksweene, ...   
4                           [Germany, Chris Cgerman]   

                                                body  
0  \n\n -----Origi

# **Cleaning text & Preprocessing**

In [None]:
import re
import pandas as pd

# Text cleaning
def clean_text(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove currency signs
    text = re.sub(r'[$€£¥]', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    return text

# Apply the cleaning function to the 'body' column
dataset['cleaned_body'] = dataset['body'].apply(clean_text)

# Print the original and cleaned text for the first entry as an example
print("Original Text:")
print(dataset['body'][0])
print("\nCleaned Text:")
print(dataset['cleaned_body'][0])

Original Text:


 -----Original Message-----
From: =09Theriot, Kim S. =20
Sent:=09Tuesday, January 29, 2002 1:23 PM
To:=09Richardson, Stacey; Anderson, Diane; Gossett, Jeffrey C.; White, Stac=
ey W.; Murphy, Melissa; Hall, D. Todd; Sweeney, Kevin
Cc:=09Aucoin, Evelyn; Baxter, Bryce; Wynne, Rita
Subject:=09FW: Master Termination Log



 -----Original Message-----
From: =09Panus, Stephanie =20
Sent:=09Tuesday, January 29, 2002 11:39 AM
To:=09Adams, Laurel; Alonso, Tom; Aronowitz, Alan; Bailey, Susan; Balfour-F=
lanagan, Cyndie; Baughman, Edward; Belden, Tim; Bishop, Serena; Brackett, D=
ebbie R.; Bradford, William S.; Browning, Mary Nell; Bruce, James; Bruce, M=
ichelle; Bruce, Robert; Buerkle, Jim; Calger, Christopher F.; Carrington, C=
lara; Considine, Keith; Cordova, Karen A.; Crandall, Sean; Cutsforth, Diane=
; Diamond, Russell; Dunton, Heather; Edison, Susan; Elafandi, Mo; Fischer, =
Mark; Flores, Nony; Fondren, Mark; Gorny, Vladimir; Gorte, David; Gresham, =
Wayne; Hagelmann, Bjorn

# **Loading spacy english model to gpu**

In [None]:
import spacy

if spacy.prefer_gpu():
    spacy.require_gpu()
    print("GPU is available. Using GPU.")
else:
    print("GPU is not available. Using CPU.")

nlp = spacy.load("en_core_web_sm")

GPU is available. Using GPU.


# **Tokenization**

In [None]:
import json

# Load spaCy English model
# nlp1 = spacy.load("en_core_web_sm")

# nlp = nlp1.to(device)

# Function to tokenize a given text using spaCy
def tokenize_text(text):
    doc = nlp(text)
    return [token.text for token in doc]

# Create a new column 'tokenized_body' and fill it with empty lists
dataset['tokenized_body'] = [[] for _ in range(len(dataset))]

# # Read the JSON file
# with open("/content/email_thread_details.json", "r") as file:
#     dataset = json.load(file)

# Tokenize the 'body' field for each entry in the dataset
# for entry in dataset:
#     tokenized_text = tokenize_text(entry['cleaned_body'])
#     entry['tokenized_body'] = tokenized_text
# Tokenize the 'cleaned_body' column for each row in the dataset
for index, row in dataset.iterrows():
    tokenized_text = tokenize_text(row['cleaned_body'])
    dataset.at[index, 'tokenized_body'] = tokenized_text
# Print tokenized results for the first entry as an example
# print("Original Text:")
# print(dataset[0]['cleaned_body'])
# print("\nTokenized Text:")
# print(dataset[0]['tokenized_body'])

# **Loading & Implementation of pre-trained BERT model and tokenizer on GPU**

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

def get_bert_embeddings(tokenized_text):
    if not tokenized_text:
        return None

    inputs = tokenizer(tokenized_text, return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    return embeddings

dataset['bert_embeddings'] = dataset['tokenized_body'].apply(get_bert_embeddings)

print("BERT Embeddings:")
print(dataset['bert_embeddings'][0])

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT Embeddings:
[[-1.0534     -0.4176954  -0.37494713 ... -0.37309474 -0.38231707
   0.33689755]
 [-0.4238697   0.0853653  -0.19224241 ... -0.00253787  0.16958223
   0.57650226]
 [-0.11286549  0.06137771  0.01990667 ... -0.23773731  0.0522394
   0.34401017]
 ...
 [-1.0534     -0.4176954  -0.37494713 ... -0.37309474 -0.38231707
   0.33689755]
 [-0.19905591  0.28754616 -0.0572002  ... -0.1876748   0.1687547
   0.61381674]
 [-1.0534     -0.4176954  -0.37494713 ... -0.37309474 -0.38231707
   0.33689755]]


## Clustering using K-Means on BERT embeddings

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


valid_embeddings = dataset['bert_embeddings'].dropna()

bert_embeddings = np.array(valid_embeddings.tolist())

flat_bert_embeddings = [np.concatenate(embedding).flatten() for embedding in bert_embeddings]

max_length = max(len(embedding) for embedding in flat_bert_embeddings)

flat_bert_embeddings = [np.pad(embedding, (0, max_length - len(embedding)), 'constant') for embedding in flat_bert_embeddings]

semantic_vectors = pca.fit_transform(flat_bert_embeddings)

scaler = StandardScaler()
semantic_vectors = scaler.fit_transform(semantic_vectors)

kmeans = KMeans(n_clusters=3)
cluster_assignments = kmeans.fit_predict(semantic_vectors)

dataset['cluster'] = valid_embeddings.index
dataset['cluster'] = dataset['cluster'].apply(lambda x: cluster_assignments[x] if x in cluster_assignments else None)

cluster_centers = kmeans.cluster_centers_
significant_vectors = pca.inverse_transform(cluster_centers)

for i, vector in enumerate(significant_vectors):
    print(f"Cluster {i + 1} Significant Vector:")
    print(vector)


  bert_embeddings = np.array(valid_embeddings.tolist())


Cluster 1 Significant Vector:
[-0.59277448 -0.08078584 -0.2106219  ... -0.04356323 -0.04464005
  0.03933678]
Cluster 2 Significant Vector:
[-0.58334998 -0.07414254 -0.20708023 ... -0.03666803 -0.03757441
  0.03311054]
Cluster 3 Significant Vector:
[-0.59597255 -0.08268966 -0.2117144  ... -0.03642764 -0.03732808
  0.03289348]




## text summarization approach using TF-IDF and cosine similarity.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

sentences = dataset['cluster'].tolist()

sentences = [str(sentence) for sentence in sentences]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(sentences)

cluster_tfidf_similarity = cosine_similarity(tfidf_matrix, significant_vectors)

normalized_scores = normalize(cluster_tfidf_similarity, axis=1, norm='l1')

total_sentence_scores = np.sum(normalized_scores, axis=1)

total_scores = normalized_scores.sum(axis=1)

summary_data = pd.DataFrame({'Sentence': sentences, 'Total Score': total_scores})

summary_data = summary_data.sort_values(by='Total Score', ascending=False)

summary_data['Rank'] = range(1, len(summary_data) + 1)

print(summary_data[['Rank', 'Sentence', 'Total Score']])


U

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# Load pre-trained BERT model and tokenizer for sequence classification
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

# Assuming 'train_dataset' has a column named 'body'
train_dataset_subset = train_dataset.head(10)

# Training loop
for index, row in train_dataset_subset.iterrows():
    email_body = row['body']
    inputs = tokenizer(email_body, return_tensors="pt", max_length=512, truncation=True)
    outputs = model(**inputs)
    logits = outputs.logits
    k = min(5, logits.size(1))
    selected_indices = torch.topk(logits, k=k, dim=1).indices.squeeze()
    selected_sentences = [tokenizer.decode(inputs['input_ids'][0, i].item()) for i in selected_indices]
    print(f"Generated Summary for Email {index}:")
    print(selected_sentences)

model.save_pretrained("/content/drive/MyDrive/email_summarizer_bert")
tokenizer.save_pretrained("/content/drive/MyDrive/email_summarizer_bert_tokenizer")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generated Summary for Email 0:
['-', '[ C L S ]']
Generated Summary for Email 1:
['-', '[ C L S ]']
Generated Summary for Email 2:
['n o t e', '[ C L S ]']
Generated Summary for Email 3:
['-', '[ C L S ]']
Generated Summary for Email 4:
['-', '[ C L S ]']
Generated Summary for Email 5:
['i', '[ C L S ]']
Generated Summary for Email 6:
['i', '[ C L S ]']
Generated Summary for Email 7:
['s u z a n n e', '[ C L S ]']
Generated Summary for Email 8:
['s u z a n n e', '[ C L S ]']
Generated Summary for Email 9:
['i', '[ C L S ]']


('/content/drive/MyDrive/email_summarizer_bert_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/email_summarizer_bert_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/email_summarizer_bert_tokenizer/vocab.txt',
 '/content/drive/MyDrive/email_summarizer_bert_tokenizer/added_tokens.json')

## loading training email and summary dataset

In [None]:
import pandas as pd

train_emails_path = "/content/drive/MyDrive/email_thread_details.json"
train_summaries_path = "/content/drive/MyDrive/email_thread_summaries.json"

train_emails = pd.read_json(train_emails_path)
train_summaries = pd.read_json(train_summaries_path)

train_dataset = pd.merge(train_emails, train_summaries, on="thread_id")

# val_emails_path = "path/to/val_emails.json"
# val_summaries_path = "path/to/val_summaries.json"

# val_emails_df = pd.read_json(val_emails_path)
# val_summaries_df = pd.read_json(val_summaries_path)

# val_dataset = pd.merge(val_emails_df, val_summaries_df, on="thread_id")


In [None]:
print (train_dataset)

       thread_id                     subject           timestamp  \
0              1  FW: Master Termination Log 2002-01-29 11:23:42   
1              1  FW: Master Termination Log 2002-01-31 12:50:00   
2              1  FW: Master Termination Log 2002-02-05 15:03:35   
3              1  FW: Master Termination Log 2002-02-05 15:06:25   
4              1  FW: Master Termination Log 2002-05-28 07:20:35   
...          ...                         ...                 ...   
21679       4166                    vacation 2000-10-04 11:32:00   
21680       4167                    web file 2001-03-18 22:57:00   
21681       4167                    web file 2001-03-19 04:42:00   
21682       4167                    web file 2001-03-19 09:57:00   
21683       4167                    web file 2001-03-19 15:42:00   

                              from  \
0      Gossett, Jeffrey C. JGOSSET   
1          Theriot, Kim S. KTHERIO   
2          Theriot, Kim S. KTHERIO   
3          Theriot, Kim S. KTHE

In [None]:
pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1


## BART model and tokenizer

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

# train_dataset = pd.read_json("path/to/train_dataset.json")

train_dataset_subset = train_dataset.head(100)

# Training loop
for index, row in train_dataset_subset.iterrows():
    email_body = row['body']

    inputs = tokenizer.encode("summarize: " + email_body, return_tensors="pt", max_length=1024, truncation=True).to(device)

    summary_ids = model.generate(inputs, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print(f"Generated Summary for Email {index}:")
    print(summary)


model.save_pretrained("/content/drive/MyDrive/email_summarizer")
tokenizer.save_pretrained("/content/drive/MyDrive/email_summarizer_tokenizer")


Using device: cuda
Generated Summary for Email 0:
The Daily Termination List for January 25 as well as the Master Termination Log, which incorporates all terminations received through January 25. The following were previously on the Master termination Log and have now been marked as "Y" for a valid termination: Atlantic Coast Fibers, Inc. (including Premier), CNC-Containers Corporation, NGL Supply, Inc., and NGL Energy Partners.
Generated Summary for Email 1:
The Daily Lists for January 29 and January 30 as well as the M= purposefullyaster Termination Log, which incorporates all terminations received through= January 30. prepetition mutual terminations have been added to this list. They are identified under "Nature of Default" as "mutual terminatio=                n".
Generated Summary for Email 2:
Stephanie Panus: Please remove my name as well as Melissa Murphy's from the dist=ribution list below. Todd D. Hall, Kevin Sweeney, Rebecca Grace, Rhonda Robinson, Kerri Thomspon, Kristin Alb

('/content/drive/MyDrive/email_summarizer_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/email_summarizer_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/email_summarizer_tokenizer/vocab.json',
 '/content/drive/MyDrive/email_summarizer_tokenizer/merges.txt',
 '/content/drive/MyDrive/email_summarizer_tokenizer/added_tokens.json')

## generating summary

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_path = "/content/drive/MyDrive/email_summarizer"
tokenizer_path = "/content/drive/MyDrive/email_summarizer_tokenizer"

tokenizer = BartTokenizer.from_pretrained(tokenizer_path)
model = BartForConditionalGeneration.from_pretrained(model_path).to(device)

def generate_summary(email_body):
    inputs = tokenizer.encode("summarize: " + email_body, return_tensors="pt", max_length=1024, truncation=True).to(device)

    summary_ids = model.generate(inputs, max_length=30, min_length=0, length_penalty=2.0, num_beams=4, early_stopping=True)

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

email_text = """
    Amanda,\n\nPlease move the file i sent you from the Testing tab to the West directory \nunder the Pipeline tab.  I have updated it in the Testing tab so discard the \nversion i emailed.  Thanks.\n\nMat\n
"""
generated_summary = generate_summary(email_text)

print("Generated Summary:")
print(generated_summary)

Using device: cpu
Generated Summary:
Please move the file i sent you from the Testing tab to the West directory  under the Pipeline tab. I have updated it in the


In [None]:
pip install transformers



## transformers library's pipeline for text summarization

In [None]:
from transformers import pipeline

# Instantiate a text summarization pipeline on GPU
summarizer = pipeline('summarization')  # 0 indicates GPU, change it based on your GPU index

train_dataset_subset = train_dataset.head(10)

# Example usage within your code
for index, row in train_dataset_subset.iterrows():
    email_body = row['body']  # Assuming the column name is 'body'

    # Generate a summary using the text summarization pipeline
    summary = summarizer(email_body, max_length=30, min_length=10, do_sample=False)

    # Access the generated summary
    generated_summary = summary[0]['summary_text']  # Updated key to 'summary_text'

    # Print or use the generated summary as needed
    print(f"Generated Summary for Email {index}:")
    print(generated_summary)


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Generated Summary for Email 0:
 Daily Termination List for January 25 as well as the Master Termination Log, which incorporates all terminations received through January 25
Generated Summary for Email 1:
 The Daily Lists for January 29 and January 30 as well as the M=677aster Termination Log, which incorporates all terminations received
Generated Summary for Email 2:
 The Daily List for January 31 as well as the Master Termination Log, which incorporates all terminations received through January 31 .
Generated Summary for Email 3:
 The Daily List for February 4 as well as the Master Termination Log, which incorporates all termination received through February 4 (with the exception


Your max_length is set to 30, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)


Generated Summary for Email 4:
 The Daily List for May 24, 2002 as well as the Master Terminati=Georgian Log, which incorporates all terminations received through


Your max_length is set to 30, but your input_length is only 6. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)


Generated Summary for Email 5:
 I'll be there... I'll get there... and I'm going to be there . I'll always be there, and I'll
Generated Summary for Email 6:
 I will attend. I will . I will be there . I'll be there. I'll attend . I'm going to be there
Generated Summary for Email 7:
 Suzanne: Please send an e-mail to each of the credit folks concerning the 5th.  Please include the description that I have
Generated Summary for Email 8:
 Suzanne: Could you please check the names of Cathy Tudon and Nidia Martinez? Suzanne: I wasn't sure if who we sent
Generated Summary for Email 9:
 A lunch meeting has been scheduled for Friday, May 5, 2000, from 12:00 p.m. to 1:30 p


In [None]:
from transformers import pipeline

summarizer = pipeline('summarization', device = 0)

email_text = """
    Amanda,\n\nPlease move the file i sent you from the Testing tab to the West directory \nunder the Pipeline tab.  I have updated it in the Testing tab so discard the \nversion i emailed.  Thanks.\n\nMat\n
"""

summary = summarizer(email_text, max_length=30, min_length=0, do_sample=False)

generated_summary = summary[0]['summary_text']

print("Generated Summary:")
print(generated_summary)


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Generated Summary:
 The file i sent you from the Testing tab to the West directory  under the Pipeline tab . I have updated it in the Testing
