In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import json 
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
import chardet
import numpy as np

In [2]:
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
device = torch.device('cpu')

In [3]:
def read_mycsv(path):
    with open(path, 'rb') as f:
        result = chardet.detect(f.read())
    return pd.read_csv(path, encoding=result['encoding'])

In [4]:
text = pd.read_excel('CPED - NPUR.xlsx')
text.head(20)

Unnamed: 0,Original text,Segmented text,Example structure of outputs of analysis,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16359,Unnamed: 16360,Unnamed: 16361,Unnamed: 16362,Unnamed: 16363,Unnamed: 16364,Unnamed: 16365,Unnamed: 16366,Unnamed: 16367,Unnamed: 16368
0,,,,,,,,,,,...,,,,,,,,,,
1,,,Submission details,,,,,,Issue analysis,,...,,,,,,,,,,
2,,,Submission ID,Submission method,Link to submission,Date,Proforma,Issue no.,Issue Summary,Key Issue,...,Column16345,Column16346,Column16347,Column16348,Column16349,Column16350,Column16351,Column16352,Column16353,Column16354
3,,,NP1,Email,RM ref no.,2016-02-10 00:00:00,No,1,Concerned about loss of character protection a...,Character and heritage,...,,,,,,,,,,
4,,,NP1,Email,RM ref no.,2016-02-10 00:00:00,No,2,Support new park along river,Open space,...,,,,,,,,,,
5,"""believe there are a number of issues that nee...",1) mechanisms\n• is surrounded by high levels ...,,,,,,,,,...,,,,,,,,,,
6,,"2) • is identified within Footprint, although ...",,,,,,,,,...,,,,,,,,,,
7,,3) and • nature conservation values bordering ...,,,,,,,,,...,,,,,,,,,,
8,,6) and • area is identified as having high to ...,,,,,,,,,...,,,,,,,,,,
9,,4) • and improve the main waterway corridor th...,,,,,,,,,...,,,,,,,,,,


In [5]:
def preprocess_text(doc):
    if(not(pd.isnull(doc))):
        doc = doc.strip().replace("\n","")
    return doc

In [6]:
text['Original text'] = text['Original text'].apply(preprocess_text) 
text['Segmented text'] = text['Segmented text'].apply(preprocess_text) 

In [7]:
def tokenized_text(data):
    return tokenizer.encode(data, return_tensors="pt", max_length= 1000).to(device)

In [8]:
for i, row in text.iterrows():
    temp = row['Original text']
    if(not(pd.isnull(temp))):
        temp = "summarize: "+ temp
        temp = tokenized_text(temp)
        t = model.generate(temp,
                           num_beams=4,
                           no_repeat_ngram_size=2,
                           min_length=30,
                           max_length=900,
                           early_stopping=True)
        text.at[i, 'Original summ'] = tokenizer.decode(t[0], skip_special_tokens=True)

In [9]:
text[:20]

Unnamed: 0,Original text,Segmented text,Example structure of outputs of analysis,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 16360,Unnamed: 16361,Unnamed: 16362,Unnamed: 16363,Unnamed: 16364,Unnamed: 16365,Unnamed: 16366,Unnamed: 16367,Unnamed: 16368,Original summ
0,,,,,,,,,,,...,,,,,,,,,,
1,,,Submission details,,,,,,Issue analysis,,...,,,,,,,,,,
2,,,Submission ID,Submission method,Link to submission,Date,Proforma,Issue no.,Issue Summary,Key Issue,...,Column16346,Column16347,Column16348,Column16349,Column16350,Column16351,Column16352,Column16353,Column16354,
3,,,NP1,Email,RM ref no.,2016-02-10 00:00:00,No,1,Concerned about loss of character protection a...,Character and heritage,...,,,,,,,,,,
4,,,NP1,Email,RM ref no.,2016-02-10 00:00:00,No,2,Support new park along river,Open space,...,,,,,,,,,,
5,"""believe there are a number of issues that nee...",1) mechanisms• is surrounded by high levels of...,,,,,,,,,...,,,,,,,,,,"""believe there are a number of issues that nee..."
6,,"2) • is identified within Footprint, although ...",,,,,,,,,...,,,,,,,,,,
7,,3) and • nature conservation values bordering ...,,,,,,,,,...,,,,,,,,,,
8,,6) and • area is identified as having high to ...,,,,,,,,,...,,,,,,,,,,
9,,4) • and improve the main waterway corridor th...,,,,,,,,,...,,,,,,,,,,


In [10]:
(text.loc[12,'Original text'])

'I would like to express my concerns with the planned expansion of the ferny grove and upper kedron precinct and increased population density. road is already chaotic for us at the moment at peak times - taking over 30 mins just from ferny grove to at times. of the train stations alongside the ferny grove line have space for car park after 7:30am, and whenever there is a fault, which now seems to happen once every week, buses from to ferny grove can take over 1 hr to get there. need better transport solutions immediately before any more houses are made available....'

In [11]:
(text.loc[12,'Original summ'])

'road is already chaotic for us at peak times - taking over 30 mins just from ferny grove. of the train stations alongside the line have space for car park after 7:30am, and whenever there is a fault, buses can take over 1 hr to get there. need better transport solutions immediately before any more houses are made available...'

In [12]:
for i, row in text.iterrows():
    tempp = row['Segmented text']
    if(not(pd.isnull(tempp))):
        tempp = "summarize: "+ tempp
        tempp = tokenized_text(tempp)
        tt = model.generate(tempp,
                            num_beams=4,
                            no_repeat_ngram_size=2,
                            min_length=30,
                            max_length=300,
                            early_stopping=True)
        text.at[i, 'seg summ'] = tokenizer.decode(tt[0], skip_special_tokens=True)

In [13]:
(text.loc[15,'Segmented text'])

'its heading in the right direction. completely understand there are time frames, would like to see the upgrade in infrastructure happen fast.. with new residential development taking place in it is imperative that there are options for young families to shop and dine in a modern complex.'

In [14]:
(text.loc[15,'seg summ'])

'new residential development taking place in it is imperative that there are options for young families to shop and dine in a modern complex. the development is currently in the right direction and will be able to be upgraded to the next level in time frame.'

In [15]:
text = text[['Original text','Original summ','Segmented text','seg summ']]

In [16]:
text[4:24]

Unnamed: 0,Original text,Original summ,Segmented text,seg summ
4,,,,
5,"""believe there are a number of issues that nee...","""believe there are a number of issues that nee...",1) mechanisms• is surrounded by high levels of...,surrounded by high levels of protection for co...
6,,,"2) • is identified within Footprint, although ...",zoned rural and special facilities have signif...
7,,,3) and • nature conservation values bordering ...,ecosystem services value associated with water...
8,,,6) and • area is identified as having high to ...,sediment entering the waterways will impact al...
9,,,4) • and improve the main waterway corridor th...,a 100m buffer (minimum)on either side of the w...
10,a resident of and a frequent traveller on the ...,"new housing developments in are complete, the ...",a resident of and a frequent traveller on the ...,"new housing developments in are complete, the ..."
11,"I have just built a new home in Road, Kedron. ...",big problem for me is that I cannot get any in...,"I have just built a new home in Road, Kedron. ...",big problem for me is that I cannot get any in...
12,I would like to express my concerns with the p...,road is already chaotic for us at peak times -...,I would like to express my concerns with the p...,the expansion of the ferny grove and upper ked...
13,,,Road is already chaotic for us at the moment a...,buses from to ferny grove can take over 1 hr t...


In [17]:
text[4:].to_csv('Summary_v12.csv', index=False)