In [None]:
# Mount Google Drive into the Colab runtime and change the working directory
# to the 'SI630' shared-drive folder. The relative "data/..." paths read by
# later cells are resolved against this directory.
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/Shared drives/SI630'

Mounted at /content/drive
/content/drive/Shared drives/SI630


In [None]:
# Install simpletransformers, which provides the Seq2SeqModel / Seq2SeqArgs
# classes imported further down.
# NOTE(review): the version is not pinned, so a fresh runtime may install a
# different release than the one this notebook was written against.
!pip install simpletransformers

In [None]:
import warnings

import pandas as pd

def load_data(
    file_path, input_text_column, target_text_column, label_column, keep_label=1
):
    """Load a tab-separated paraphrase file into seq2seq training format.

    Only rows whose ``label_column`` equals ``keep_label`` are kept. The two
    text columns are renamed to ``input_text`` / ``target_text`` and a
    constant ``prefix`` column holding ``"paraphrase"`` is appended.
    Malformed lines in the file are skipped rather than raising.

    Args:
        file_path: Path (or buffer) accepted by ``pandas.read_csv``.
        input_text_column: Name of the column holding the source sentence.
        target_text_column: Name of the column holding the paraphrase.
        label_column: Name of the column used to filter rows.
        keep_label: Label value a row must have to be kept (default ``1``).

    Returns:
        A new DataFrame with columns ``input_text``, ``target_text`` and
        ``prefix``; the row index of the kept rows is preserved.
    """
    try:
        # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0;
        # ``on_bad_lines="skip"`` is the replacement with the same effect.
        df = pd.read_csv(file_path, sep="\t", on_bad_lines="skip")
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines``; use the legacy flag.
        df = pd.read_csv(file_path, sep="\t", error_bad_lines=False)

    kept = df.loc[df[label_column] == keep_label]
    renamed = kept.rename(
        columns={input_text_column: "input_text", target_text_column: "target_text"}
    )
    # ``assign`` builds a new frame, so the prefix column is never written
    # into a slice of ``df`` (which is what raises SettingWithCopyWarning).
    return renamed[["input_text", "target_text"]].assign(prefix="paraphrase")


def clean_unnecessary_spaces(out_string):
    """Remove the spaces a tokenizer leaves before punctuation and clitics.

    Non-string input triggers a warning and is converted with ``str()``
    before cleaning. The substitutions are applied one after another in the
    order listed below, each on the result of the previous one.

    Args:
        out_string: Text to clean (any object; non-strings are stringified).

    Returns:
        The cleaned string.
    """
    text = out_string
    if not isinstance(text, str):
        warnings.warn(f">>> {text} <<< is not a string.")
        text = str(text)

    # (spaced form, tight form) pairs; order matters because every
    # replacement runs on the output of the one before it.
    substitutions = (
        (" .", "."),
        (" ?", "?"),
        (" !", "!"),
        (" ,", ","),
        (" ' ", "'"),
        (" n't", "n't"),
        (" 'm", "'m"),
        (" 's", "'s"),
        (" 've", "'ve"),
        (" 're", "'re"),
    )
    for spaced, tight in substitutions:
        text = text.replace(spaced, tight)
    return text

In [None]:
import os
from datetime import datetime
import logging
import math
import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs

In [None]:
# Google Data
train_df = pd.read_csv("data/train.tsv", sep="\t").astype(str)
eval_df = pd.read_csv("data/dev.tsv", sep="\t").astype(str)
test_df = pd.read_csv("data/test.tsv", sep="\t").astype(str)

train_df = train_df.loc[train_df["label"] == "1"]
eval_df = eval_df.loc[eval_df["label"] == "1"]
test_df = test_df.loc[test_df["label"] == "1"]

train_df = train_df.rename(
    columns={"sentence1": "input_text", "sentence2": "target_text"}
)
eval_df = eval_df.rename(
    columns={"sentence1": "input_text", "sentence2": "target_text"}
)
test_df = test_df.rename(
    columns={"sentence1": "input_text", "sentence2": "target_text"}
)

train_df = train_df[["input_text", "target_text"]]
eval_df = eval_df[["input_text", "target_text"]]
test_df = test_df[["input_text", "target_text"]]

train_df["prefix"] = "paraphrase"
eval_df["prefix"] = "paraphrase"
test_df["prefix"] = "paraphrase"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Baseline Model

In [None]:
# Show the first rows of the filtered training pairs as a sanity check.
train_df.head()

Unnamed: 0,input_text,target_text,prefix
1,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...,paraphrase
3,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...,paraphrase
4,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...,paraphrase
5,William Henry Henry Harman was born on 17 Febr...,"William Henry Harman was born in Waynesboro , ...",paraphrase
7,With a discrete amount of probabilities Formul...,Given a discrete set of probabilities formula ...,paraphrase


In [None]:
import random 
def baseline_model(input_text, rng=None):
  """Return a naive "paraphrase" built by resampling the input's own tokens.

  The text is split on whitespace and, for every token position, one token is
  drawn uniformly at random (with replacement) from the same token list. The
  result therefore has as many tokens as the input but in random order and
  possibly with repeats. An empty or whitespace-only input yields ''.

  Args:
    input_text: Sentence to "paraphrase".
    rng: Optional ``random.Random`` instance used for sampling. When omitted,
      the module-level ``random`` generator is used, as before; pass a seeded
      instance to make the output reproducible.

  Returns:
    The sampled tokens joined by single spaces.
  """
  sampler = random if rng is None else rng
  tokens = input_text.split()
  # One draw per input token keeps the output length equal to the input's.
  return ' '.join(sampler.choice(tokens) for _ in tokens)

In [None]:
# Smoke-test the baseline on a single sentence.
# NOTE(review): no random seed is set in the visible cells, so the displayed
# output presumably changes from run to run.
baseline_model('Wikipedia was launched on January 15, 2001, and was created by Jimmy Wales and Larry Sanger.')

'was launched launched Larry and and Sanger. 2001, January was Jimmy on and Wikipedia 2001, on'

# Baseline Model Evaluation

## Universal Sentence Encoder (USE) Similarity

In [None]:
# !pip install spacy_universal_sentence_encoder

In [None]:
import spacy_universal_sentence_encoder
# Load the 'en_use_lg' model once at module level; USE() below reaches it
# through the global name `nlp`.
nlp=spacy_universal_sentence_encoder.load_model('en_use_lg')
def USE(s1, s2):
  """Return the similarity of two texts according to the loaded `nlp` model.

  Both texts are passed through the module-level `nlp` pipeline and the
  result of the first document's ``similarity`` against the second is
  returned.
  """
  first_doc = nlp(s1)
  second_doc = nlp(s2)
  return first_doc.similarity(second_doc)
# Score the baseline: for each of the first 100 test pairs, "paraphrase" the
# input sentence with baseline_model and measure its USE similarity to the
# reference target sentence, then report the mean.
s1_lst = test_df['input_text'].tolist()[:100]
s2_lst = test_df['target_text'].tolist()[:100]

use_lst = [
  USE(target, baseline_model(source))
  for source, target in zip(s1_lst, s2_lst)
]

use_score = sum(use_lst) / len(use_lst)
print(use_score)

0.7035438047688306


# Human Evaluation on Jokes

In [None]:
def paraphrase(joke):
  """Print every segment of `joke` followed by its baseline paraphrase.

  `joke` is a sequence of text segments. Each one is run through
  baseline_model and the original/paraphrased pair is printed; nothing is
  returned.
  """
  for segment in joke:
    rewritten = baseline_model(segment)
    print("Original:", segment)
    print("Paraphrased: ", rewritten)

In [None]:
# First joke, split into two segments that are paraphrased independently.
# NOTE(review): "lamp.He" has no space after the period, so the whitespace
# split in baseline_model treats it as a single token.
joke1= ['A kid finds a magical lamp.He rubs the lamp, and a genie appears and says, “What is your first wish?”',
        ' The kid says, “I wish I were rich!” The genie replies, “It is done! What is your second wish, Rich?”.']
paraphrase(joke1)

Original: A kid finds a magical lamp.He rubs the lamp, and a genie appears and says, “What is your first wish?”
Paraphrased:  appears says, rubs wish?” magical a genie lamp.He magical is and and “What finds first genie and lamp, first genie
Original:  The kid says, “I wish I were rich!” The genie replies, “It is done! What is your second wish, Rich?”.
Paraphrased:  What says, The replies, I I The What says, is “It kid genie were What kid “It What “It The


In [None]:
# Second joke, split into four segments that are paraphrased independently.
# NOTE(review): the last segment opens its quotation with a curly quote but
# closes it with a straight one (here.") -- confirm whether that is intended.
joke2 = ['Three friends stranded on a deserted island find a magic lamp. Inside it is a genie who agrees to grant each friend one wish.',
         ' “I want to go home,” says the first friend. The genie grants her wish.',
         ' “I want to go home, too,” says the second friend. And the genie sends her back home.',
         ' “I’m lonely,” says the third friend. “I sure wish my friends were back here."']
paraphrase(joke2)

Original: Three friends stranded on a deserted island find a magic lamp. Inside it is a genie who agrees to grant each friend one wish.
Paraphrased:  to stranded on to a to wish. Three lamp. it grant a it is island Three genie lamp. is each each island deserted deserted
Original:  “I want to go home,” says the first friend. The genie grants her wish.
Paraphrased:  friend. friend. to first first grants the says wish. “I wish. grants home,” home,”
Original:  “I want to go home, too,” says the second friend. And the genie sends her back home.
Paraphrased:  friend. genie home. genie says to back sends genie “I go genie the back And home, to
Original:  “I’m lonely,” says the third friend. “I sure wish my friends were back here."
Paraphrased:  “I’m lonely,” third wish “I friends says here." says the friend. my here." sure


In [None]:
# Third joke, split into three segments that are paraphrased independently.
joke3=['A man walks into a library, approaches the librarian and says, “I’ll have a cheeseburger and fries, please.”',
       'The librarian says, “Sir, you know you’re in a library, right?”',
       '“Sorry,” he whispers. “I’ll have a cheeseburger and fries, please.”']
paraphrase(joke3)

Original: A man walks into a library, approaches the librarian and says, “I’ll have a cheeseburger and fries, please.”
Paraphrased:  and says, a says, a librarian a “I’ll a man a a A library, library, please.” have a
Original: The librarian says, “Sir, you know you’re in a library, right?”
Paraphrased:  library, right?” librarian a The you library, library, The know in
Original: “Sorry,” he whispers. “I’ll have a cheeseburger and fries, please.”
Paraphrased:  he he whispers. “I’ll a please.” a fries, have whispers.
