In [1]:
from dotenv import load_dotenv
import os
load_dotenv()
from tqdm.auto import tqdm
import pandas as pd
import numpy as np

import openai

# The API key is looked up in the process environment under OPENAI_KEY;
# load_dotenv() was called at the top of this cell before the lookup.
# The result is None when the variable is not set.
openai.api_key = os.environ.get("OPENAI_KEY")

# CSV inputs, addressed relative to the notebook's working directory.
df = pd.read_csv("../data/raw/train_essays.csv")
prompts = pd.read_csv("../data/external/nbroad/prompts.csv")
fb = pd.read_csv("../data/external/fell/train.csv")
cl = pd.read_csv("../data/external/commonlit-read/train.csv")

# PERSUADE 2.0 export; this is the frame the rest of the notebook filters and rewrites.
persuade = pd.read_csv(
    "../data/external/pesuade/persuade_2.0_human_scores_demo_id_github.csv"
)

# Two assignment prompt texts. Together with `train_assignments` they form
# `all_assignments`, which is matched with `isin` against the `assignment`
# column of the PERSUADE frame -- the comparison is exact, so these strings
# must stay character-for-character as they are.
# NOTE(review): the name suggests these prompts are set aside for validation;
# confirm against wherever the train/validation split is actually made.
valid_assignments = [
    'Write an explanatory essay to inform fellow citizens about the advantages of limiting car usage. Your essay must be based on ideas and information that can be found in the passage set. Manage your time carefully so that you can read the passages; plan your response; write your response; and revise and edit your response. Be sure to use evidence from multiple sources; and avoid overly relying on one source. Your response should be in the form of a multiparagraph essay. Write your essay in the space provided.',
    'Write a letter to your state senator in which you argue in favor of keeping the Electoral College or changing to election by popular vote for the president of the United States. Use the information from the texts in your essay. Manage your time carefully so that you can read the passages; plan your response; write your response; and revise and edit your response. Be sure to include a claim; address counterclaims; use evidence from multiple sources; and avoid overly relying on one source. Your response should be in the form of a multiparagraph essay. Write your response in the space provided.',
]

# Five assignment prompt texts. Together with `valid_assignments` they form
# `all_assignments`, which is matched with `isin` against the `assignment`
# column of the PERSUADE frame. The match is exact, so do not "tidy" these
# strings: the curly quotes, the double space and the missing space in
# "landform.Be" presumably mirror the dataset's own text -- verify before editing.
train_assignments = [
       "You have just read the article, 'A Cowboy Who Rode the Waves.' Luke's participation in the Seagoing Cowboys program allowed him to experience adventures and visit many unique places. Using information from the article, write an argument from Luke's point of view convincing others to participate in the Seagoing Cowboys program. Be sure to include: reasons to join the program; details from the article to support Luke's claims; an introduction, a body, and a conclusion to your essay.",
       'In "The Challenge of Exploring Venus," the author suggests studying Venus is a worthy pursuit despite the dangers it presents. Using details from the article, write an essay evaluating how well the author supports this idea. Be sure to include: a claim that evaluates how well the author supports the idea that studying Venus is a worthy pursuit despite the dangers; an explanation of the evidence from the article that supports your claim; an introduction, a body, and a conclusion to your essay.',
       'In the article "Making Mona Lisa Smile," the author describes how a new technology called the Facial Action Coding System enables computers to identify human emotions. Using details from the article, write an essay arguing whether the use of this technology to read the emotional expressions of students in a classroom is valuable.',
       "You have read the article 'Unmasking the Face on Mars.' Imagine you are a scientist at NASA discussing the Face with someone who thinks it was created by aliens. Using information in the article, write an argumentative essay to convince someone that the Face is just a natural landform.Be sure to include: claims to support your argument that the Face is a natural landform; evidence from the article to support your claims; an introduction, a body, and a conclusion to your argumentative essay.",
       'In the article “Driverless Cars are Coming,” the author presents both positive and negative aspects of driverless cars. Using details from the article, create an argument for or against the development of these cars.  Be sure to include: your position on driverless cars; appropriate details from the article that support your position; an introduction, a body, and a conclusion to your argumentative essay.',
]

# Keep only the PERSUADE essays whose prompt is one of the seven texts listed
# above (exact string match on the `assignment` column).
all_assignments = [*train_assignments, *valid_assignments]
is_known_assignment = persuade['assignment'].isin(all_assignments)
persuade = persuade[is_known_assignment]

# persuade = persuade.groupby('prompt_name').sample(100, random_state=42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Drop every essay whose holistic score is 3 or lower, then renumber the rows
# from 0 so positional and label indexing agree again.
persuade = (
    persuade
    .loc[persuade['holistic_essay_score'].gt(3)]
    .reset_index(drop=True)
)

In [3]:
# Display (rows, columns) of `persuade` as it stands after the filters above.
persuade.shape

(3607, 14)

In [7]:
import nltk
from nltk import sent_tokenize
nltk.download('punkt')

# Output directory for this run. `text_<essay_id>.txt` inside it marks an essay
# as already generated, which is what makes the generation loop resumable.
folder = '../data/generated/persuade_gpt_patially_rewritten_05'

essay_ids = persuade.essay_id_comp.values

# Essays that already have an output file from a previous (possibly interrupted) run.
generated_indexes = [
    essay_id
    for essay_id in essay_ids
    if os.path.isfile(f'{folder}/text_{essay_id}.txt')
]

# Membership is tested once per essay, so look it up in a set rather than
# scanning the list each time (the list scan made this step quadratic).
already_generated = set(generated_indexes)
remaining_indexes = sorted(i for i in essay_ids if i not in already_generated)

# Randomise processing order in place; no seed is set, so the order differs per run.
np.random.shuffle(remaining_indexes)

print(len(generated_indexes))
print(len(remaining_indexes))

1837
1770


[nltk_data] Downloading package punkt to /Users/yevhenii/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
import time  # only needed for the back-off between retry passes in this cell

# Fraction of each essay's sentences that is sent to the model for rephrasing.
ratio = 0.5

# Prompt template; `{}` is filled with one original sentence per request.
raw_prompt = """Pretend that you are student of 10th grade. Rephrase following text, keep only content text in your response (i.e. remove anything else that is not in rephrased text): {}"""

# Upper bound on how many times the whole pass is restarted after an error.
max_passes = 50

# Without the directory every write below would fail on the first essay.
os.makedirs(folder, exist_ok=True)


def _rephrase_sentence(sentence):
    """Send one sentence to gpt-3.5-turbo and return the text of its reply."""
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant."
        },
        {
            "role": "user",
            "content": raw_prompt.format(sentence)
        }
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=1000,
    )
    return response['choices'][0]['message']['content']


def _rewrite_essay(essay_id):
    """Rephrase a random `ratio` of one essay's sentences and save both versions.

    Writes `oritext_<essay_id>.txt` (the untouched essay) and
    `text_<essay_id>.txt` (the partially rewritten essay) into `folder`.

    Raises:
        ValueError: if `essay_id` does not match exactly one row of `persuade`.
    """
    row = persuade[persuade['essay_id_comp'] == essay_id]
    if row.shape[0] != 1:
        raise ValueError(
            f'expected exactly one row for essay {essay_id}, found {row.shape[0]}'
        )

    text = row.full_text.values[0]
    sentences = sent_tokenize(text)

    n_sentences = int(len(sentences) * ratio)
    if n_sentences:
        # replace=False: each chosen sentence is distinct. The default samples
        # with replacement, which re-rephrased some sentences and left fewer
        # than `ratio` of the essay actually rewritten.
        random_idxs = sorted(
            np.random.choice(len(sentences), size=n_sentences, replace=False)
        )
    else:
        random_idxs = []

    for rand_idx in random_idxs:
        sentences[rand_idx] = _rephrase_sentence(sentences[rand_idx])

    new_text = ' '.join(sentences)

    # The original is written first and `text_<id>.txt` last: the latter is the
    # "done" marker checked on resume, so it must only exist once both are saved.
    with open(f'{folder}/oritext_{essay_id}.txt', 'w', encoding='utf-8') as file:
        file.write(text)

    with open(f'{folder}/text_{essay_id}.txt', 'w', encoding='utf-8') as file:
        file.write(new_text)


essay_id = None
for retry in range(max_passes):
    try:
        for essay_id in tqdm(remaining_indexes):
            # Finished in an earlier pass or an earlier run of the notebook.
            if os.path.isfile(f'{folder}/text_{essay_id}.txt'):
                continue
            _rewrite_essay(essay_id)
    except Exception as error:
        # Catch Exception, not everything: Ctrl-C / kernel interrupt must still
        # stop the run. Report what failed so that an essay which fails on every
        # pass (and therefore blocks the ones after it) is visible in the output.
        tqdm.write(
            f'pass {retry + 1}/{max_passes} stopped at essay {essay_id}: {error!r}'
        )
        # Give rate limits and transient outages time to clear before restarting.
        time.sleep(min(2 ** retry, 60))
    else:
        # A pass that ran to the end means nothing is left to generate.
        break
    

  0%|          | 0/1770 [00:00<?, ?it/s]

 24%|██▍       | 422/1770 [1:26:51<4:37:28, 12.35s/it] 
 29%|██▊       | 505/1770 [15:41<39:17,  1.86s/it]  
 49%|████▉     | 872/1770 [1:19:57<1:22:20,  5.50s/it]
 52%|█████▏    | 925/1770 [19:24<17:43,  1.26s/it]  
 64%|██████▍   | 1133/1770 [38:04<2:07:31, 12.01s/it]

In [80]:
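# NOTE: archived experiment, kept for reference only -- every line of this cell
# is commented out. It rephrased each whole essay in a single request. The output
# folder is named 'persuade_gpt4_rephrased', but the request below is made with
# model="gpt-3.5-turbo".
# for idx in tqdm(persuade.index.values):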
#     if os.path.isfile(f'persuade_gpt4_rephrased/text{idx}.txt'):
#         continue
    
#     text = persuade.full_text.loc[idx]
#     assignment = persuade.assignment.loc[idx]
    
#     if assignment not in valid_assignments:
#         continue
    
#     prompt = f"""
#     Pretend that you are student of 10th grade. 
#     Rephrase following text, 
#     remove any titles, text block names (like 'Introduction:', 'Main:', 'Conclusion:', etc.).
#     Your response should not contain any numeration or bullet points, write plain, regular text.
#     Keep the original text length in your response.
#     Text: {text}
#     """
    
#     messages = [
#         {
#             "role": "system",
#             "content": "You are a helpful assistant."
#         },
#         {
#             "role": "user",
#             "content": prompt
#         }
#     ]
        
#     response = openai.ChatCompletion.create(
#         model="gpt-3.5-turbo", 
#         messages=messages,
#         max_tokens=3400,
#     )
    
#     generated_text = response['choices'][0]['message']['content']

#     with open(f'persuade_gpt4_rephrased/text{idx}.txt', 'w') as file:
#         file.write(generated_text)
        
#     with open(f'persuade_gpt4_rephrased/original_text{idx}.txt', 'w') as file:
#         file.write(text)
        
#     with open(f'persuade_gpt4_rephrased/assignment_text{idx}.txt', 'w') as file:
#         file.write(assignment)
        
#     # print(1)

100%|██████████| 700/700 [30:54<00:00,  2.65s/it]  
