In [None]:
!pip install --upgrade transformers sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from google.colab import drive

# Google Drive is exposed under this directory; the dataset and the output
# CSV used by later cells both live below it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Paraphrasing model: https://huggingface.co/tuner007/pegasus_paraphrase

In [None]:
import pandas as pd
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Hugging Face checkpoint used for paraphrasing.
model_name = 'tuner007/pegasus_paraphrase'

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

# Tokenizer and model are module-level because get_response() reads them.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)

In [None]:
def get_response(input_text, num_return_sequences, num_beams,
                 max_length=60, do_sample=False, temperature=1.5):
  """Paraphrase ``input_text`` with the module-level Pegasus model.

  Relies on the notebook globals ``tokenizer``, ``model`` and ``torch_device``.

  Args:
    input_text: The sentence to paraphrase.
    num_return_sequences: How many candidate paraphrases to return.
    num_beams: Beam width handed to ``model.generate``.
    max_length: Token limit used both to truncate the input and to cap the
      generated output (previously hard-coded to 60).
    do_sample: When True, decode by sampling and apply ``temperature``.
      Defaults to False, which keeps the original deterministic decoding.
    temperature: Sampling temperature; only forwarded when ``do_sample`` is
      True.

  Returns:
    A list of decoded paraphrase strings with special tokens removed.
  """
  batch = tokenizer([input_text],
                    truncation=True,
                    padding='longest',
                    max_length=max_length,
                    return_tensors="pt").to(torch_device)

  generate_kwargs = dict(max_length=max_length,
                         num_beams=num_beams,
                         num_return_sequences=num_return_sequences)
  # Temperature only influences sampling-based decoding. The original code
  # always passed temperature=1.5 without enabling sampling, so the value had
  # no effect (and newer transformers releases warn about it). Forward it only
  # when sampling was explicitly requested.
  if do_sample:
    generate_kwargs.update(do_sample=True, temperature=temperature)

  translated = model.generate(**batch, **generate_kwargs)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

In [None]:
# Datasets available on Drive (only the SMS set is loaded here):
#   /content/drive/MyDrive/SMS_train.csv
#   /content/drive/MyDrive/Sentiment.csv
#   /content/drive/MyDrive/news_articles.csv
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/SMS_train.csv",
    encoding='cp1252',
)
df.head(n=5)

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl. Its true to its name,Non-Spam
1,2,The guy did some bitching but I acted like i'd...,Non-Spam
2,3,"Pity, * was in mood for that. So...any other s...",Non-Spam
3,4,Will ü b going to esplanade fr home?,Non-Spam
4,5,This is the 2nd time we have tried 2 contact u...,Spam


In [None]:
# Keep only the message text and its label.
df = df.loc[:, ["Message_body", "Label"]]

In [None]:
# Work on the first 500 rows only.
df = df.iloc[:500]

In [None]:
import nltk
import regex as re
import string
# clean_text() uses nltk's word_tokenize and the English stopword list.
# Recent NLTK releases load the tokenizer from the 'punkt_tab' package instead
# of the legacy 'punkt' pickle, so fetch both to work across versions.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)
_STOPWORDS_BY_LANGUAGE = {}  # lazily filled cache: language -> frozenset of stopwords


def _stopword_set(language='english'):
    """Return NLTK's stopword list for ``language`` as a cached frozenset."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(
            nltk.corpus.stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def clean_text(text):
    """Normalise a raw message into lower-case, space-separated content words.

    Steps: lower-case, strip @handles, URLs and ``pic.`` links, replace every
    character that is not a letter, ``+`` or ``'`` with a space, drop
    single-letter words and punctuation, tokenize with NLTK, and discard
    English stopwords and tokens of two characters or fewer.

    Args:
        text: The raw message. Non-string values (e.g. NaN from an empty CSV
            cell) are treated as an empty message.

    Returns:
        The cleaned message as a single string; may be empty.
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as float NaN; there is nothing to clean.
        return ''

    text = text.lower()                                  # lower-case all characters
    text = re.sub(r'@\S+', '', text)                     # remove twitter handles
    text = re.sub(r'http\S+', '', text)                  # remove urls
    # The dot must be escaped: the previous pattern 'pic.\S+' matched "pic"
    # followed by ANY character, which deleted ordinary text such as
    # "picnic" or "topic today".
    text = re.sub(r'pic\.\S+', '', text)                 # remove pic.<...> links
    text = re.sub(r"[^a-zA-Z+']", ' ', text)             # keep letters, + and ' only
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text + ' ')    # keep words with length>1 only
    text = "".join([ch for ch in text if ch not in string.punctuation])
    words = nltk.tokenize.word_tokenize(text)
    stopwords = _stopword_set('english')                 # cached set: O(1) membership
    text = " ".join([w for w in words if w not in stopwords and len(w) > 2])
    text = re.sub(r"\s{2,}", " ", text).strip()          # remove repeated/leading/trailing spaces
    return text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Replace every raw message with its cleaned form (element-wise).
df['Message_body'] = df['Message_body'].map(clean_text)

In [None]:
# generator_function() expects the columns to be called Message / Category.
df = df.rename(columns={'Message_body': 'Message', 'Label': 'Category'})

In [None]:
# Display the cleaned frame with its renamed Message / Category columns.
df

Unnamed: 0,Message,Category
0,rofl true name,Non-Spam
1,guy bitching acted like interested buying some...,Non-Spam
2,pity mood suggestions,Non-Spam
3,going esplanade home,Non-Spam
4,time tried contact pound prize claim easy call...,Spam
...,...,...
495,win shopping spree every week starting play te...,Spam
496,different styles,Non-Spam
497,alfie moons children need song mob tell txt to...,Spam
498,hai dear friends new amp present number rajith...,Non-Spam


In [None]:
def generator_function(train_df, num_return_sequences=1, num_beams=1):
    """Build an augmented frame in which each message is replaced by paraphrases.

    Every value of ``train_df['Message']`` is passed to ``get_response``; the
    returned list is exploded so that each paraphrase gets its own row, paired
    with the row's ``Category``. The original messages are not part of the
    result. ``train_df`` itself is left unmodified.

    Args:
        train_df: DataFrame with at least the columns 'Message' and
            'Category'. Any other columns are ignored.
        num_return_sequences: Paraphrases requested per message (default 1,
            the previously hard-coded value).
        num_beams: Beam width forwarded to ``get_response`` (default 1, the
            previously hard-coded value).

    Returns:
        DataFrame with columns ['Message', 'Category'], with rows containing
        missing values and exact duplicate rows removed. Index labels repeat
        the source row's label for each of its paraphrases.
    """
    paraphrases = train_df['Message'].apply(
        get_response,
        num_return_sequences=num_return_sequences,
        num_beams=num_beams,
    )
    # assign() builds a new frame instead of writing into a column subset of
    # the caller's frame, which could trigger SettingWithCopyWarning.
    generated = (
        train_df[['Message', 'Category']]
        .assign(Message=paraphrases)
        .explode('Message')   # one row per paraphrase
        .dropna()
        .drop_duplicates()
    )
    return generated

In [None]:
# Paraphrase every message, then store the result on Drive as a CSV without
# a header row or index column.
df_aug = generator_function(train_df=df)
df_aug.to_csv(
    path_or_buf='/content/drive/MyDrive/EnsembleLearningProject/sms_aug_pegasus.csv',
    header=None,
    index=False,
)