# ANOTHER APPROACH (USING PRE-TRAINED MODEL)

In [1]:
! pip install transformers -q

[K     |████████████████████████████████| 4.2 MB 5.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 11.3 MB/s 
[K     |████████████████████████████████| 84 kB 1.4 MB/s 
[K     |████████████████████████████████| 596 kB 38.5 MB/s 
[?25h

DATA PRE-PROCESSING TO SEPARATE TWEETS AS PER LABEL

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

path = "/content/training_task_c_tweets.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,id,in_reply_to_status_id_str,text,user.verified,user.followers_count,retweet_count,favorite_count,entities.hashtags,entities.urls,label,parent_tweet_text
0,552785374507175936,552783667052167168,MT @euronews France: 10 dead after shooting at...,False,1828,1,2,"[{'indices': [70, 83], 'text': 'CharlieHebdo'}]",[],comment,France: 10 people dead after shooting at HQ of...
1,552786226546495488,552785374507175936,@j0nathandavis They who? Stupid and partial op...,False,405,0,0,[],[],deny,MT @euronews France: 10 dead after shooting at...
2,552789345628069888,552786226546495488,"@nanoSpawn Socialists, Antisemites, anti zioni...",False,1828,0,0,[],[],comment,@j0nathandavis They who? Stupid and partial op...
3,552792862610694144,552783667052167168,@euronews @TradeDesk_Steve A French crime of p...,False,26,0,0,[],[],query,France: 10 people dead after shooting at HQ of...
4,552943855021330432,552783667052167168,"@euronews LOL. 5 million Muslims in France, wh...",False,14,0,0,[],[],comment,France: 10 people dead after shooting at HQ of...


In [3]:
# df1 is a data frame which groups all the reply tweets as per their parent tweet

df1 = df.groupby(by=['in_reply_to_status_id_str', 'label'])['text'].apply(list).reset_index(name='text_list')

In [4]:
df1.to_csv('temp_grouped_output.csv')
df1

Unnamed: 0,in_reply_to_status_id_str,label,text_list
0,498280126254428160,comment,"[@MichaelSkolnik -- wow, @MichaelSkolnik Unbel..."
1,498280126254428160,support,[“@MichaelSkolnik: Mike Brown was staying with...
2,498430783699554305,comment,[@MichaelSkolnik cool! Darkskinned man is a cr...
3,498430783699554305,support,[@MichaelSkolnik @AC_BOwen #Cleveland @CivilRi...
4,498432131669192704,comment,[@Agent_Kindi&lt;&lt;&lt;&lt; @SecretService I...
...,...,...,...
2138,581473088249958400,comment,[@jjauthor @gatewaypundit. I knew it! Those cr...
2139,581473088249958400,query,[@jjauthor @gatewaypundit Isn't Islam a wonder...
2140,581473088249958400,support,[@jjauthor @gatewaypundit i knew that scarf he...
2141,581573165492523008,deny,[@EdWardMDBlog @USATODAY @khjelmgaard all rumo...


In [5]:
#check type of the data df1

print(type(df1))
df1.columns.tolist()

<class 'pandas.core.frame.DataFrame'>


['in_reply_to_status_id_str', 'label', 'text_list']

In [6]:
#test
df1[(df1['label'] == 'support')]

Unnamed: 0,in_reply_to_status_id_str,label,text_list
1,498280126254428160,support,[“@MichaelSkolnik: Mike Brown was staying with...
3,498430783699554305,support,[@MichaelSkolnik @AC_BOwen #Cleveland @CivilRi...
5,498433698149056513,support,[@Supreme___Power @MichaelSkolnik You so edgy.]
12,499366666300846081,support,[Line of police cars with high beams on greets...
16,499368931367608320,support,[@clydetheslyde @moiskd Imagine if Ferguson fo...
...,...,...,...
2121,581075660690575360,support,[@marcepa49 The autopilot wasn't set to 100 fe...
2128,581293286268129280,support,[@daxtonbrown @ItsJustJill. It took long enoug...
2132,581298086246395904,support,[@daxtonbrown @AnewTrackrecord @ShamNaarai I w...
2136,581386094337474560,support,[RT @khjelmgaard: German media reporting #Andr...


In [7]:
# Separate the tweet data as per their SDQC labels for text summarization into 4 csv files respectively

support_df = (df1[(df1['label'] == 'support')])
support_df.to_csv('support_tweets.csv')

deny_df = (df1[(df1['label'] == 'deny')])
deny_df.to_csv('deny_tweets.csv')

query_df = (df1[(df1['label'] == 'query')])
query_df.to_csv('query_tweets.csv')

comment_df = (df1[(df1['label'] == 'comment')])
comment_df.to_csv('comment_tweets.csv')


REPLY TEXT SUMMARIZATION FROM REPLIES PER SOURCE TWEET

In [8]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [9]:
model_name = "snrspeaks/t5-one-line-summary"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

In [10]:
# function to generate a summary of reply tweets per source tweet

def summarize_tweet_transformers(input_df):
  list_of_generated_reply = []
  
  for rowIndex, row in input_df.iterrows(): #iterate over rows
    # print ('in rowIndex: ' + str(rowIndex))
    reply_abstract = """"""
    for columnIndex, value in row.items():
      if columnIndex == 'text_list':
        
        value = value.strip('][').split(', ')
        # print(type(value))
        # print(value)
        for line in value:
          reply_abstract += line + '\n'

        input_ids = tokenizer.encode(
         "summarize: " + reply_abstract, return_tensors="pt", add_special_tokens=True)
  
        generated_ids = model.generate(
          input_ids=input_ids,
          num_beams=5,
          max_length=100,
          repetition_penalty=2.5,
          length_penalty=1,
          early_stopping=True,
          num_return_sequences=3,)

        predicted_summary = [ 
                       tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                       for g in generated_ids
                       ]
        summarized_reply = predicted_summary[0]
        print(summarized_reply)
        list_of_generated_reply.append(summarized_reply)

  return list_of_generated_reply

# SUPPORT TWEETS

In [11]:
# support_df
import pandas as pd

support_path = '/content/support_tweets.csv'
support_df = pd.read_csv(support_path)

list_of_generated_reply = summarize_tweet_transformers(support_df)
print('type of list_of_reply: ' + str(type(list_of_generated_reply)))
# list_of_generated_reply = list_of_reply.strip('][').split(', ')

support_df['generated_reply'] = pd.Series(list_of_generated_reply)
print(support_df.dtypes)

'Mike Brown was staying with his grandmother for the summer who lived in the community'
'Witness: Police stopped Mike Brown after yelling at him to walk on sidewalk
'@Supreme__Power @Michael Skolnik You so edgy
The line of police cars with high beams on #Ferguson
Footprints of the #Ferguson protests
"Hope to god"
'St. Louis Co. Police tell me of a man who pointed handgun at Chambers &amp; Sheffingdell at about 1 a.m. Man in critical
A woman in #Ferguson was shot in head last night and released this selfie
'@Michael Skolnik Shooting Related to Protesting
A RePUBLICAN POLITICIANS INFOMERCIAL
Where's Al Jesse?
'@PhilSerrin Uh Sorry
'Name of #Ferguson cop expected to be released by 9 ET police chief'
Darren Wilson is the one shot to kill Michael Brown
'URGENT: Darren Wilson is the police officer who shot Mike Brown
Why didn't he get out of the car?
Michael Brown was a Suspect in a Robbery
Unarmed teen's first: killing unarmed teen his first
'Police reports released today indicate Mike Brow

Token indices sequence length is longer than the specified maximum sequence length for this model (749 > 512). Running this sequence through the model will result in indexing errors


Several hostages freed at Jewish supermarket in Paris
Several hostages freed at Jewish supermarket in Paris
Hostage-taker in supermarket siege killed reports say
'@ensowi @France24_en Yes but think it was too dangerous'
'Coup in #Russia?
'US Embassy #Moscow cables talk about an operation planned by "dissidents" in #Russia to "overthrow Putin"
Had Boris Nemtsov been killed by Ramzan Kaladyrov?
Swiss Rumors: Putin absence due to girlfriend Alina giving birth in Ticino
What makes us think the baby story may be true?
"@MailOnline never!!"
Comment on 'Coup?'
'Unformed Russian Embassy staff in London have left for Putin HAS DIED!'
'@AdaptToReality @batchelorshow Yeah
'@TAW3333 Emm I never tagged the user
"Putin reappears on television"
'Very good on #Putin coup by @CoalsonR: Three scenarios for a succession in Russia
Looking for Narnia with Vladimir Putin
Germanwings Airbus A320 crashes in French Alps near Digne
'BREAKING: 148 passengers were on board #GermanWings Airbus A320 which has crash

In [12]:
support_df

Unnamed: 0.1,Unnamed: 0,in_reply_to_status_id_str,label,text_list,generated_reply
0,1,498280126254428160,support,['“@MichaelSkolnik: Mike Brown was staying wit...,'Mike Brown was staying with his grandmother f...
1,3,498430783699554305,support,['@MichaelSkolnik @AC_BOwen #Cleveland @CivilR...,'Witness: Police stopped Mike Brown after yell...
2,5,498433698149056513,support,['@Supreme___Power @MichaelSkolnik You so edgy.'],'@Supreme__Power @Michael Skolnik You so edgy
3,12,499366666300846081,support,"[""Line of police cars with high beams on greet...",The line of police cars with high beams on #Fe...
4,16,499368931367608320,support,"[""@clydetheslyde @moiskd Imagine if Ferguson f...",Footprints of the #Ferguson protests
...,...,...,...,...,...
422,2121,581075660690575360,support,"[""@marcepa49 The autopilot wasn't set to 100 f...","""It's a mystery, I can't tell you what happened"""
423,2128,581293286268129280,support,['@daxtonbrown @ItsJustJill. It took long enou...,Can any muslim be trusted near an airplane now?
424,2132,581298086246395904,support,"[""@daxtonbrown @AnewTrackrecord @ShamNaarai I ...",I wouldn't put anybody on a bus with a muslim ...
425,2136,581386094337474560,support,"[""RT @khjelmgaard: German media reporting #And...",RT @khjelmgaard: German media reporting #Andre...


In [13]:

support_df.to_csv('temp_support.csv')

s2 = pd.read_csv('/content/support_tweets.csv')
# new_support_df = pd.merge(support_df, s2)
# support_df[''].to_csv('support_input.csv')

In [14]:
df_temp = df.groupby(by=['in_reply_to_status_id_str', 'label'])['text'].apply(list).reset_index(name='text_list')
df_temp

Unnamed: 0,in_reply_to_status_id_str,label,text_list
0,498280126254428160,comment,"[@MichaelSkolnik -- wow, @MichaelSkolnik Unbel..."
1,498280126254428160,support,[“@MichaelSkolnik: Mike Brown was staying with...
2,498430783699554305,comment,[@MichaelSkolnik cool! Darkskinned man is a cr...
3,498430783699554305,support,[@MichaelSkolnik @AC_BOwen #Cleveland @CivilRi...
4,498432131669192704,comment,[@Agent_Kindi&lt;&lt;&lt;&lt; @SecretService I...
...,...,...,...
2138,581473088249958400,comment,[@jjauthor @gatewaypundit. I knew it! Those cr...
2139,581473088249958400,query,[@jjauthor @gatewaypundit Isn't Islam a wonder...
2140,581473088249958400,support,[@jjauthor @gatewaypundit i knew that scarf he...
2141,581573165492523008,deny,[@EdWardMDBlog @USATODAY @khjelmgaard all rumo...


In [15]:
# parent_id_tweet_df = df[['in_reply_to_status_id_str', 'parent_tweet_text']]
# parent_id_tweet_df = parent_id_tweet_df.drop_duplicates()
# parent_id_tweet_df

parent_id_tweet_dict = dict(zip(df.in_reply_to_status_id_str, df.parent_tweet_text))
parent_id_tweet = {'id':(list(parent_id_tweet_dict.keys())), 'parent_tweet':(list(parent_id_tweet_dict.values()))}
pt = pd.DataFrame.from_dict(parent_id_tweet)

# df_temp['source_tweet'] = df['in_reply_to_status_id_str'].map(parent_id_tweet_dict)
# df_temp

support_df['source_tweet'] = support_df['in_reply_to_status_id_str'].map(pt.set_index('id')['parent_tweet'])
support_df

Unnamed: 0.1,Unnamed: 0,in_reply_to_status_id_str,label,text_list,generated_reply,source_tweet
0,1,498280126254428160,support,['“@MichaelSkolnik: Mike Brown was staying wit...,'Mike Brown was staying with his grandmother f...,Every 28 hours a black male is killed in the U...
1,3,498430783699554305,support,['@MichaelSkolnik @AC_BOwen #Cleveland @CivilR...,'Witness: Police stopped Mike Brown after yell...,.@AP I demand you retract the lie that people ...
2,5,498433698149056513,support,['@Supreme___Power @MichaelSkolnik You so edgy.'],'@Supreme__Power @Michael Skolnik You so edgy,@MichaelSkolnik @Agent_Kindi @Supreme___Power ...
3,12,499366666300846081,support,"[""Line of police cars with high beams on greet...",The line of police cars with high beams on #Fe...,"Currently the #FoxNews website has zero, repea..."
4,16,499368931367608320,support,"[""@clydetheslyde @moiskd Imagine if Ferguson f...",Footprints of the #Ferguson protests,St. Louis Co Police tell me ofcr shot a man wh...
...,...,...,...,...,...,...
422,2121,581075660690575360,support,"[""@marcepa49 The autopilot wasn't set to 100 f...","""It's a mystery, I can't tell you what happened""",@Minus777 @flightradar24 @isobelroe Look at th...
423,2128,581293286268129280,support,['@daxtonbrown @ItsJustJill. It took long enou...,Can any muslim be trusted near an airplane now?,Germanwings co-pilot had serious depressive ep...
424,2132,581298086246395904,support,"[""@daxtonbrown @AnewTrackrecord @ShamNaarai I ...",I wouldn't put anybody on a bus with a muslim ...,@daxtonbrown @ShamNaarai @AnewTrackrecord a mu...
425,2136,581386094337474560,support,"[""RT @khjelmgaard: German media reporting #And...",RT @khjelmgaard: German media reporting #Andre...,GERMAN NEWS REPORT: Co-Pilot of Germanwings Ai...


In [16]:
support_df[['source_tweet', 'generated_reply']].to_csv('input_support.csv')

# TRAINING THE GENERTAED I/P FILE FOR SUPPORT

In [17]:
!pip install simplet5

Collecting simplet5
  Downloading simplet5-0.1.4.tar.gz (7.3 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 4.8 MB/s 
Collecting transformers==4.16.2
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 41.7 MB/s 
[?25hCollecting pytorch-lightning==1.5.10
  Downloading pytorch_lightning-1.5.10-py3-none-any.whl (527 kB)
[K     |████████████████████████████████| 527 kB 49.9 MB/s 
[?25hCollecting pyDeprecate==0.3.1
  Downloading pyDeprecate-0.3.1-py3-none-any.whl (10 kB)
Collecting future>=0.17.1
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 45.0 MB/s 
Collecting setuptools==59.5.0
  Downloading setuptools-59.5.0-py3-none-any.whl (952 kB)
[K     |████████████████████████████████| 952 kB 41.7 MB/s 
Collecting torchmetrics>=0.4.1
  Downloading torchmet

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split

path = "/content/input_support.csv"
df = pd.read_csv(path)


# simpleT5 needs dataframe to have 2 columns: "source_text" and "target_text"
df = df.rename(columns={"generated_reply":"target_text", "source_tweet":"source_text"})
df = df[['source_text', 'target_text']]


df['source_text'] = "summarize: " + df['source_text']
df

Unnamed: 0,source_text,target_text
0,summarize: Every 28 hours a black male is kill...,'Mike Brown was staying with his grandmother f...
1,summarize: .@AP I demand you retract the lie t...,'Witness: Police stopped Mike Brown after yell...
2,summarize: @MichaelSkolnik @Agent_Kindi @Supre...,'@Supreme__Power @Michael Skolnik You so edgy
3,summarize: Currently the #FoxNews website has ...,The line of police cars with high beams on #Fe...
4,summarize: St. Louis Co Police tell me ofcr sh...,Footprints of the #Ferguson protests
...,...,...
422,summarize: @Minus777 @flightradar24 @isobelroe...,"""It's a mystery, I can't tell you what happened"""
423,summarize: Germanwings co-pilot had serious de...,Can any muslim be trusted near an airplane now?
424,summarize: @daxtonbrown @ShamNaarai @AnewTrack...,I wouldn't put anybody on a bus with a muslim ...
425,summarize: GERMAN NEWS REPORT: Co-Pilot of Ger...,RT @khjelmgaard: German media reporting #Andre...


In [19]:
training_df, test_df = train_test_split(df, test_size=0.2)
training_df.shape, test_df.shape

((341, 2), (86, 2))

In [20]:
from simplet5 import SimpleT5

model = SimpleT5()
model.from_pretrained(model_type="t5", model_name="t5-base")
model.train(train_df=training_df[:5000],
            eval_df=test_df[:100], 
            source_max_token_len=128, 
            target_max_token_len=50, 
            batch_size=8, max_epochs=3, use_gpu=True)

Global seed set to 42


Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
  copyfile(self.vocab_file, out_vocab_file)


Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Missing logger folder: /content/lightning_logs

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [21]:
# let's load the trained model for inferencing:
model.load_model("t5","/content/outputs/simplet5-epoch-2-train-loss-2.8294-val-loss-3.2814", use_gpu=True)


text_to_summarize = """
@FRANCE24: #BREAKING - At least five hostages in Paris kosher supermarket: (AFP) https://t.co/xFn05puuBm #Vincennes http://t.co/WU3kmq957e
RT @FRANCE24 #BREAKING - At least five hostages in Paris kosher supermarket: (AFP) https://t.co/ZF6TGUkOdn #Vincennes http://t.co/501nakgVKN
@FRANCE24 @HarrietBaldwin It's time 4 West 2 make formal declaration of war vs Islamic Terrorists but is there country with courage 2 do so?
@FRANCE24 is this all part of a plan .. a spread of attacks by different associates  to cause maximum confusion ..playing hostages . Max PR?
"""

summarized_text = model.predict(text_to_summarize)[0]
summarized_text

'@FRANCE24: #BREAKING - At least five hostages in Paris kosher supermarket: (AFP) https://t.co/ZF6TGUkOdn'

In [None]:
# sample text --> Steve Jobs Wanted Original iPhone to Have No SIM Card Slot, Says Former iPod VP: Rumors have suggested Apple could soon release an iPhone without a physical SIM card slot, and it turns out that if that's accurate, Apple would be… https://macrumors.com/2022/05/13/original-iphone-was-meant-to-be-simless/?utm_source=dlvr.it&utm_medium=twitter #Apple #Mac #Rumors

In [22]:
#BLEU_SCORE

from nltk.translate.bleu_score import sentence_bleu

ref_list = text_to_summarize.splitlines()
reference = [i.split() for i in ref_list]


candidate = summarized_text.split()

print('Individual 1-gram: %f' % sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)))
print('Individual 2-gram: %f' % sentence_bleu(reference, candidate, weights=(0, 1, 0, 0)))
print('Individual 3-gram: %f' % sentence_bleu(reference, candidate, weights=(0, 0, 1, 0)))
print('Individual 4-gram: %f' % sentence_bleu(reference, candidate, weights=(0, 0, 0, 1)))

score = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))
print(score)

Individual 1-gram: 0.857404
Individual 2-gram: 0.857404
Individual 3-gram: 0.857404
Individual 4-gram: 0.857404
0.8574039191604413
