In [618]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [619]:
import numpy as np
import pandas as pd
import json

from config import upload_folder, allowed_extensions, open_api_mode, jupyter_url, openai_key
from utils import read_code, response, update_prompt, prompt_list_len, read_codev1, read_output_from_nb, write_code
from database import Database

# Lift every pandas display cap so frames are printed in full.
for _display_option in ("display.max_rows", "display.max_columns", "display.width"):
    pd.set_option(_display_option, None)
del _display_option

# Same for numpy: never summarise arrays.
np.set_printoptions(threshold=np.inf)

# Suppress traceback frames; errors show only the exception type and message.
import sys
sys.tracebacklimit = 0

In [625]:
# History of prompts, one slot per step; unused slots stay None.
prompt_list = [None for _ in range(100)]

# Preamble asking the model to continue after the "A: <code>" marker.
instruction1 = (
    'You are the python code generation for a data science problem.\n'
    'Please generate the next code after "A: <code>"\n'
)

# Preamble asking the model to regenerate the complete code every time.
instruction2 = (
    'You are the python code generation for a data science problem.\n'
    'Please always generate the full training code given the previous code and the prompt.\n'
)

# Active settings read by the generation helpers.
method = 1
version = 3
instruction = instruction1

# Fixed fragments placed around the user's request in the prompt.
problem = 'Problem Description:\n'
ans = '\nA:\n<code>\n'

# Filled in by the cell that selects a concrete dataset / notebook.
data_file = ''
nb_file = ''

In [626]:
def prompt_preprocessing(prompt):
    """Strip ``@`` directives from a prompt and build the matching support text.

    Recognised directives (each one is removed from the returned prompt):

    * ``@datafile`` - adds the name of the global ``data_file``.
    * ``@metadata`` - adds the output of ``get_dataset_metadata(data_file)``.
    * ``@data-values/<col>[,<col>...]`` - adds one "Data Values of <col>" section
      per listed column, filled by ``get_dataset_values``. Everything from the
      directive to the end of the prompt is treated as part of the directive.

    Args:
        prompt: The user's request, possibly containing directives.

    Returns:
        A ``(prompt, prompt_support)`` tuple: the request without directives and
        the generated support text (``''`` when no directive was present).

    Raises:
        ValueError: If ``@data-values`` is not followed by ``/`` and at least
            one column name.
    """
    prompt_support = ''

    if "@datafile" in prompt:
        prompt = prompt.replace("@datafile", "").strip()
        prompt_support += 'Data file name: {}.\n'.format(data_file)

    if "@metadata" in prompt:
        prompt = prompt.replace("@metadata", "").strip()
        prompt_support += '\nDataset Information:\n' + get_dataset_metadata(data_file)
        prompt_support += '\n------------\n'

    if "@data-values" in prompt:
        directive = prompt[prompt.index("@data-values"):]
        prompt = prompt.replace(directive, "").strip()

        _, slash, column_spec = directive.partition("/")
        # Strip each name so "a, b" and a trailing newline do not break the lookup.
        columns = [name.strip() for name in column_spec.split(",") if name.strip()]
        if not slash or not columns:
            raise ValueError(
                'Expected "@data-values/<column>[,<column>...]", got {!r}'.format(directive)
            )

        # Look up every column on its own: the comma-joined list is not a valid
        # column key, so it cannot be passed to get_dataset_values as a whole.
        for column in columns:
            prompt_support += '\nData Values of {}:\n'.format(column)
            prompt_support += get_dataset_values(data_file, column)

    return prompt, prompt_support

def get_dataset_metadata(data_file):
    """Return the ``DataFrame.info()`` report of a CSV file as text.

    Args:
        data_file: Path of the CSV file to read.

    Returns:
        The text that ``DataFrame.info()`` would normally print.
    """
    from io import StringIO

    # info() writes to a stream instead of returning a value, so capture it.
    report = StringIO()
    pd.read_csv(data_file).info(buf=report)
    return report.getvalue()

def get_dataset_values(data_file, col_name, max_values=10):
    """Return the largest distinct values of one CSV column, formatted as text.

    Missing values are dropped before sorting: a text column containing NaN
    would otherwise make ``sorted`` fail when it compares ``float`` with ``str``.
    Values are converted to native Python scalars so the text does not depend
    on how the installed numpy version prints its scalar types.

    Args:
        data_file: Path of the CSV file to read.
        col_name: Name of the column to sample.
        max_values: How many of the largest distinct values to keep
            (default 10); ``None`` keeps all of them.

    Returns:
        ``str()`` of the ascending list of kept values.

    Raises:
        KeyError: If ``col_name`` is not a column of the file.
    """
    df = pd.read_csv(data_file)
    distinct = sorted(df[col_name].dropna().drop_duplicates().tolist())
    if max_values is not None:
        # Explicit start index: a plain [-max_values:] keeps everything for 0.
        distinct = distinct[max(len(distinct) - max_values, 0):]
    return str(distinct)


def exec_code(code):
    """Execute a code snippet and capture what it prints.

    The snippet runs in a single namespace seeded with a copy of this module's
    globals. One dict serves as both globals and locals, so functions defined
    by the snippet can see the snippet's own imports and helpers. Names the
    snippet assigns are discarded after the call.

    Warnings are silenced only while the snippet runs; the previous warning
    filters are restored afterwards.

    Args:
        code: Python source to execute (formatted to ``str`` first).

    Returns:
        ``(out_value, error)``. On success ``out_value`` is the captured stdout
        and ``error`` is ``False``. If the snippet raises an ``Exception``,
        ``out_value`` is ``"<ExceptionType>: <message>"`` and ``error`` is ``True``.
    """
    import warnings
    from io import StringIO
    from contextlib import redirect_stdout

    code = '{}'.format(code)

    # SECURITY: this runs arbitrary (model-generated) code with full privileges.
    # Only use it in an environment where that is acceptable.
    namespace = dict(globals())

    captured = StringIO()
    error = False
    with warnings.catch_warnings(), redirect_stdout(captured):
        warnings.simplefilter("ignore")
        try:
            exec(code, namespace)
            out_value = captured.getvalue()
        except Exception as e:
            error = True
            out_value = "{}: {}".format(type(e).__name__, e)

    return out_value, error

def print_output(output):
    """Join output fragments into the text ``print`` would have produced.

    The fragments are joined directly instead of being formatted into Python
    source and executed, so the content of ``output`` is never run as code.

    Args:
        output: An iterable of strings, or the textual form of a Python literal
            for one (e.g. ``"['a', 'b']"``). Text input is parsed with
            ``ast.literal_eval``.

    Returns:
        The concatenated fragments followed by a single newline.

    Raises:
        ValueError, SyntaxError: If ``output`` is a string that is not a valid
            Python literal.
        TypeError: If a fragment is not a string.
    """
    import ast

    if isinstance(output, str):
        # Accept the literal text form, but only literals: no names, no calls.
        output = ast.literal_eval(output)
    return "".join(output) + "\n"

def save_generation(prompt_id, prompt, code, output):
    """Store one generation record for the current kernel and version.

    Args:
        prompt_id: Index of the prompt the record belongs to.
        prompt: The request text.
        code: The generated code.
        output: The output associated with the code.

    Uses the globals ``kernel_id`` and ``version`` to address the record.
    """
    record = dict(prompt_id=prompt_id, prompt=prompt, code=code, output=output)
    Database().insert_or_update_gen(kernel_id, version, record)
    
def save_task_1step(step_id, out):
    """Parse one step description from JSON and store it as a task.

    ``out`` must be a JSON object with a single top-level key (e.g. ``"step"``)
    whose value holds ``name``, ``strategy``, ``reason`` and the column list
    under ``"column names"`` (``"column_names"`` is accepted as an alias).

    A column value that is not a list is wrapped in one. If that value contains
    the stand-alone word "all" (any case, e.g. "All columns") it is normalised
    to ``'All'``. The word must not be embedded in a longer alphabetic word, so
    column names such as "rainfall" or "overall" are kept as they are.

    Args:
        step_id: Step number stored with the task.
        out: JSON text describing the step.

    Raises:
        json.JSONDecodeError: If ``out`` is not valid JSON.
        KeyError: If a required field is missing.

    Uses the globals ``kernel_id`` and ``version`` to address the task.
    """
    import json
    import re

    payload = json.loads(out)
    # The step is wrapped in a single top-level key; unwrap it.
    t = payload[list(payload.keys())[0]]

    if t.get('column_names'):
        t['column names'] = t['column_names']
    column_names = t['column names']
    if not isinstance(column_names, list):
        if re.search(r'(?<![a-z])all(?![a-z])', column_names.lower()):
            column_names = 'All'
        column_names = [column_names]
        t['column names'] = column_names

    task = {
        'Step': step_id,
        'Task Name': t['name'],
        # str() so a non-string entry (e.g. a number) cannot break the join.
        'Column Names': ','.join(str(name) for name in column_names),
        'Method': t['strategy'],
        'Reason': t['reason'],
    }
    Database().insert_or_update_task(kernel_id, version, task)

In [627]:
def generate_code(prompt_content, prompt_id=0):
    """Request the next code snippet from the model and record it.

    Steps: collect the code generated so far, build the prompt from the global
    ``instruction``, the request and the ``ans`` marker, call ``response``,
    save the result with ``save_generation`` and update ``prompt_list``.
    With ``method == 2`` the generated code is also run through ``exec_code``;
    with ``method == 1`` ``write_code`` is called for ``nb_file`` instead.

    Args:
        prompt_content: The request; may contain ``@`` directives understood by
            ``prompt_preprocessing``.
        prompt_id: Slot of ``prompt_list`` this generation belongs to.

    Returns:
        None. The prompt and the model's answer are printed.
    """
    n_prompts = prompt_list_len(prompt_list)

    # Collect the code produced by earlier steps.
    if method == 2:
        code, output = read_code(prompt_list, max(0, n_prompts - 1))
        code += '\n----------\n'
        output += '\n----------\n'
    elif method == 1:
        code = read_codev1(prompt_list, prompt_id - 1)
        output = ''

    # The very first request always carries the data file name.
    if n_prompts == 0:
        prompt_content += "@datafile"
    prompt_content, prompt_support = prompt_preprocessing(prompt_content)

    # Method 2 puts the previous code first; method 1 appends it after the
    # answer marker so the model continues from it.
    if method == 2:
        prompt = code + prompt_support + '\n' + instruction + prompt_content + ans
    elif method == 1:
        prompt = prompt_support + '\n' + instruction + prompt_content + ans + code
    print(prompt)

    generated = response(openai_key, prompt, temperature=0.2)
    generated = generated.replace('<code>', '').strip()
    print('>>', generated)

    # Persist the generation before any output exists for it.
    save_generation(prompt_id, prompt_content, generated, '')

    if method == 2:
        out_value, error = exec_code(generated)
        print(error, out_value)
        update_prompt(prompt_list, prompt_id, instruction, problem, ans, prompt_content, generated, out_value)
    elif method == 1:
        out_value = ''
        # NOTE(review): update_prompt is called twice with identical arguments,
        # as in the original; the second call looks redundant - confirm that
        # update_prompt is idempotent before removing it.
        update_prompt(prompt_list, prompt_id, instruction, problem, ans, prompt_content, generated, out_value)
        update_prompt(prompt_list, prompt_id, instruction, problem, ans, prompt_content, generated, out_value)
        write_code(nb_file, prompt_list, prompt_id)



In [637]:
def generate_steps(prompt_content, show_output=True, show_code=True, step_id=None):
    """Ask the model to propose the next step and store it as a task.

    The prompt is the previous code and previous output (each optional)
    followed by the request. The model's answer is passed to
    ``save_task_1step`` and returned unchanged.

    Args:
        prompt_content: The request; ``@`` directives are stripped by
            ``prompt_preprocessing`` (their support text is not added to the
            prompt).
        show_output: Pass ``False`` to leave the previous output out.
        show_code: Pass ``False`` to leave the previous code out.
        step_id: Step number for the new task. ``None`` (default) uses
            ``prompt_list_len(prompt_list)``. An explicit ``0`` is honoured.

    Returns:
        The raw text returned by ``response``.

    Raises:
        ValueError: If the global ``method`` is neither 1 nor 2.
    """
    # Compare with None, not truthiness: 0 is a valid step id.
    if step_id is None:
        step_id = prompt_list_len(prompt_list)
        print('step_id', step_id)
    prompt_id = max(0, step_id-1)

    if method == 2:
        code, output = read_code(prompt_list, prompt_id)
        code += '\n----------\n'
        output += '\n----------\n'
    elif method == 1:
        output = read_output_from_nb(nb_file, prompt_list, prompt_id)
        # Store the freshly read output with the generation it belongs to.
        save_generation(prompt_id, prompt_list[prompt_id]['prompt'], prompt_list[prompt_id]['generated_code'], output)

        code = read_codev1(prompt_list, prompt_id)
        code += '\n----------\n'
        output += '\n----------\n'
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError('Unsupported method: {!r} (expected 1 or 2)'.format(method))

    # Only the cleaned request is used; the support text is not part of the prompt.
    prompt_content, _ = prompt_preprocessing(prompt_content)

    code = 'Previous Code:\n' + code
    output = 'Previous Output:\n' + output
    if show_code == False:
        code = ''
    if show_output == False:
        output = ''

    prompt = code + output + prompt_content
    print(prompt)

    out = response(openai_key, prompt, temperature=0.7)
    print('>>', out)

    save_task_1step(step_id, out)

    return out


In [629]:
# Select the dataset and the notebook the generated code is written to.
data_file = 'earthquake_data.csv'
nb_file = '../kaggle/kernels_train/adambyrne/earthquake-alert-classifier/workflow.ipynb'

db = Database()
kernel_id = db.get_kernel_by_ref('adambyrne/earthquake-alert-classifier')
print(kernel_id)
if kernel_id == 0:
    # Python 3 cannot raise a plain string (that itself fails with TypeError);
    # raise a real exception. NOTE(review): 0 presumably means "kernel not
    # found" - confirm against Database.get_kernel_by_ref.
    raise RuntimeError("get_kernel_by_ref returned 0 for 'adambyrne/earthquake-alert-classifier'")

# Generate the code for the first step.
prompt = 'Load data from file. Show data information and few rows.'
prompt_id=0
generate_code(prompt, prompt_id)

# Record the matching first task by hand.
step_id=0
out = '''{
  "step": {
    "name": "Load data",
    "column names": ["All"],
    "strategy": "Load data from file. Show data information and few rows.",
    "reason": ""
  }
}'''
save_task_1step(step_id, out)

90
Data file name: earthquake_data.csv.

You are the python code generation for a data science problem.
Please generate the next code after "A: <code>"
Load data from file. Show data information and few rows.
A:
<code>

>> import pandas as pd

# Load data from file
data = pd.read_csv("earthquake_data.csv")

# Show data information
print(data.info())

# Show few rows of data
print(data.head())


In [691]:
# Ask for one new way to fill missing country values; the step id is
# derived from the prompt history (step_id=None).
prompt = (
    "The problem is earthquake alert classification. You are an expert in earthquake domain. "
    "Based on the problem, the dataset, previous code, previous output, "
    "list 1 new method to fill missing value for country in the following json format: "
    "step {name, column names, strategy, reason}."
)
task = generate_steps(prompt, show_output=True, show_code=True, step_id=None)


step_id 4
Previous Code:
import pandas as pd

# Load data from file
data = pd.read_csv("earthquake_data.csv")

# Show data information
print(data.info())

# Show few rows of data
print(data.head())
# Check for missing values in latitude and longitude columns
missing_values = data[['latitude', 'longitude']].isnull().sum()
print("Missing values in latitude and longitude columns:")
print(missing_values)

# Import geopy library for geocoding
from geopy.geocoders import Nominatim

# Create geocoder object
geolocator = Nominatim(user_agent="my_app")

# Function to get country from latitude and longitude
def get_country(lat, lon):
    location = geolocator.reverse((lat, lon), exactly_one=True)
    if location is not None:
        return location.raw['address'].get('country', None)
    else:
        return None

# Impute missing values in country column using latitude and longitude
data['country'] = data.apply(lambda row: get_country(row['latitude'], row['longitude']) if pd.isnull(row['country

In [679]:
import json

# Unwrap the single top-level key of the model's answer.
t = json.loads(task)
key = list(t.keys())[0]
t = t[key]

# Accept "column_names" as an alias and always end up with a list.
if t.get('column_names'):
    t['column names'] = t['column_names']
column_names = t['column names']
if not isinstance(column_names, list):
    column_names = t['column names'] = [column_names]

# Turn the task into the request text for the next code generation.
p = f"{t['name']}: {t['strategy']} for columns {', '.join(t['column names'])}"
p

'Remove Missing Values for Alert: Dropping rows for columns alert'

In [None]:
# Ask for the complete list of data-preparation steps after step 1.
prompt = (
    "The problem is house price prediction. "
    "Based on the problem, previous code, previous output, "
    "list All necessary steps for data preparation in the following json format: "
    "step {name, strategy, reason}"
)
tasks = generate_steps(prompt, show_output=True, show_code=True, step_id=1)


In [None]:
import json

# Print one request line per proposed step.
t = json.loads(tasks)
steps = t['steps']
for s in steps:
    if s.get('column_names'):
        s['column names'] = s['column_names']

    # The request format does not require column names, so they may be absent.
    step_columns = s.get('column names')
    if step_columns is not None and not isinstance(step_columns, list):
        # A bare string would otherwise be joined character by character.
        step_columns = [step_columns]

    if step_columns:
        p = '{}: {} for columns {}'.format(s['name'], s['strategy'], ', '.join(str(c) for c in step_columns))
    else:
        p = '{}: {}'.format(s['name'], s['strategy'])
    print(p)

In [680]:
# Generate code for the request `p` built from the last task; store it in slot 3.
prompt_id = 3
generate_code(p, prompt_id=prompt_id)


You are the python code generation for a data science problem.
Please generate the next code after "A: <code>"
Remove Missing Values for Alert: Dropping rows for columns alert
A:
<code>
import pandas as pd

# Load data from file
data = pd.read_csv("earthquake_data.csv")

# Show data information
print(data.info())

# Show few rows of data
print(data.head())
# Check for missing values in latitude and longitude columns
missing_values = data[['latitude', 'longitude']].isnull().sum()
print("Missing values in latitude and longitude columns:")
print(missing_values)

# Import geopy library for geocoding
from geopy.geocoders import Nominatim

# Create geocoder object
geolocator = Nominatim(user_agent="my_app")

# Function to get country from latitude and longitude
def get_country(lat, lon):
    location = geolocator.reverse((lat, lon), exactly_one=True)
    if location is not None:
        return location.raw['address'].get('country', None)
    else:
        return None

# Impute missing value

In [617]:
# Derive the current step from the prompt history.
step_id = prompt_list_len(prompt_list)
print('step_id', step_id)
prompt_id = max(0, step_id - 1)

if method == 2:
    code, output = read_code(prompt_list, prompt_id)
    code += '\n----------\n'
    output += '\n----------\n'
elif method == 1:
    # Read the output from the notebook and store it with the latest generation.
    output = read_output_from_nb(nb_file, prompt_list, prompt_id)
    save_generation(
        prompt_id,
        prompt_list[prompt_id]['prompt'],
        prompt_list[prompt_id]['generated_code'],
        output,
    )

step_id 8


In [74]:
# Display the record stored in slot 1 of prompt_list.
prompt_list[1]

{'instruction': 'You are the python code generation for a data science problem.\n',
 'problem': 'Problem Description:\n',
 'ans': '\nA:\n<code>\n',
 'prompt': 'Check for missing values',
 'generated_code': '# Check for missing values\nprint(data.isnull().sum())\n',
 'output': '\nYear                          0\nMake                          0\nModel                         0\nCondition                     0\nPrice                         0\nConsumer_Rating               0\nConsumer_Review_#             0\nExterior_Color               11\nInterior_Color               11\nDrivetrain                   11\nMPG                        1485\nFuel_Type                    11\nTransmission                 11\nEngine                       11\nVIN                          11\nStock_#                      11\nMileage                      11\nComfort_Rating              552\nInterior_Design_Rating      552\nPerformance_Rating          552\nValue_For_Money_Rating      552\nExterior_Styling_Rating    