In [1]:
import sys
#Path to Trane for imports
sys.path.append('/Users/Alexander/Documents/Trane__HDI_REPO')
import pandas as pd
import trane
import json

In [2]:
def file_to_table_meta(filepath):
    return trane.TableMeta(json.loads(open(filepath).read()))

In [3]:
#IMPORT DATA
taxi_data_df = pd.read_csv('NYC Taxi/taxi.csv')
taxi_table_meta = file_to_table_meta('NYC Taxi/taxi_meta.json')

yelp_review_data_df = pd.read_csv('Yelp Reviews/yelp_review_sampled.csv')
yelp_table_meta = file_to_table_meta('Yelp Reviews/yelp_review_meta.json')

github_data_df = pd.read_csv('Github/github_archive.csv')
github_table_meta = file_to_table_meta('Github/github_meta.json')

In [4]:
NUM_PROBLEMS_TO_GENERATE = 10

In [12]:
def generate_probs_and_nl(entity_id_column,
                            label_generating_column,
                            time_column,
                            table_meta,
                            filter_column):
    generator = trane.PredictionProblemGenerator(table_meta, entity_id_column, label_generating_column, time_column, filter_column)
    probs = []
    for idx, prob in enumerate(generator.generate()):
        probs.append(prob)
        if idx + 1 == NUM_PROBLEMS_TO_GENERATE:
            break
    prediction_problems_json = trane.prediction_problems_to_json_file(
        probs, table_meta, entity_id_column, label_generating_column, time_column, "prediction_problems.json")

    nl_descrips = trane.generate_nl_description(
        probs, table_meta, entity_id_column, label_generating_column, time_column, trane.ConstantIntegerCutoffTimes(0))
    return probs, nl_descrips

# Taxi Data
Below, is the code to generate prediction problems and natural language descriptions from the taxi dataset.

In [20]:
entity_id_column = 'id'
label_generating_column = 'passenger_count'
time_column = 'pickup_datetime'
filter_column = 'id'
table_meta = taxi_table_meta
probs, nl_descrips = generate_probs_and_nl(entity_id_column, label_generating_column,
                     time_column, table_meta, filter_column)
print(nl_descrips)

['For each id, predict the first passenger_count, after pickup_datetime 0.', 'For each id, predict the first passenger_count, with id greater than 0, after pickup_datetime 0.', 'For each id, predict the first passenger_count, with id equal to 0, after pickup_datetime 0.', 'For each id, predict the first passenger_count, with id not equal to 0, after pickup_datetime 0.', 'For each id, predict the first passenger_count, with id less than 0, after pickup_datetime 0.', 'For each id, predict the first passenger_count is greater than 0, after pickup_datetime 0.', 'For each id, predict the first passenger_count is greater than 0, with id greater than 0, after pickup_datetime 0.', 'For each id, predict the first passenger_count is greater than 0, with id equal to 0, after pickup_datetime 0.', 'For each id, predict the first passenger_count is greater than 0, with id not equal to 0, after pickup_datetime 0.', 'For each id, predict the first passenger_count is greater than 0, with id less than 0

# Yelp Data
Below, is the code to generate prediction problems and natural language descriptions from the yelp dataset.

In [21]:
entity_id_column = 'user_id'
label_generating_column = 'stars'
time_column = 'date'
filter_column = 'user_id'
table_meta = yelp_table_meta
probs, nl_descrips = generate_probs_and_nl(entity_id_column, label_generating_column,
                     time_column, table_meta, filter_column)
print(nl_descrips)

['For each user_id, predict the first stars, after date 0.', 'For each user_id, predict the first stars, with user_id greater than 0, after date 0.', 'For each user_id, predict the first stars, with user_id equal to 0, after date 0.', 'For each user_id, predict the first stars, with user_id not equal to 0, after date 0.', 'For each user_id, predict the first stars, with user_id less than 0, after date 0.', 'For each user_id, predict the first stars is greater than 0, after date 0.', 'For each user_id, predict the first stars is greater than 0, with user_id greater than 0, after date 0.', 'For each user_id, predict the first stars is greater than 0, with user_id equal to 0, after date 0.', 'For each user_id, predict the first stars is greater than 0, with user_id not equal to 0, after date 0.', 'For each user_id, predict the first stars is greater than 0, with user_id less than 0, after date 0.']


# Github Data
Below, is the code to generate prediction problems and natural language descriptions from the github dataset.

In [22]:
entity_id_column = 'actor'
label_generating_column = 'repo'
time_column = 'created_at'
filter_column = 'user_id'
table_meta = github_table_meta
probs, nl_descrips = generate_probs_and_nl(entity_id_column, label_generating_column,
                     time_column, table_meta, filter_column)
print(nl_descrips)

['For each actor, predict the first repo, after created_at 0.', 'For each actor, predict the first repo, with user_id greater than 0, after created_at 0.', 'For each actor, predict the first repo, with user_id equal to 0, after created_at 0.', 'For each actor, predict the first repo, with user_id not equal to 0, after created_at 0.', 'For each actor, predict the first repo, with user_id less than 0, after created_at 0.', 'For each actor, predict the first repo is greater than 0, after created_at 0.', 'For each actor, predict the first repo is greater than 0, with user_id greater than 0, after created_at 0.', 'For each actor, predict the first repo is greater than 0, with user_id equal to 0, after created_at 0.', 'For each actor, predict the first repo is greater than 0, with user_id not equal to 0, after created_at 0.', 'For each actor, predict the first repo is greater than 0, with user_id less than 0, after created_at 0.']
