In [16]:
import pandas as pd
from datasets import Dataset
import torch
import numpy as np
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [17]:
def _load_split(csv_path):
    """Read one dataset split and lower-case its text columns.

    Parameters
    ----------
    csv_path : str
        Path to a CSV file that contains at least the `source` and `label`
        columns; every other column is ignored.

    Returns
    -------
    pandas.DataFrame
        Frame with only `source` and `label`, both lower-cased.
    """
    split_df = pd.read_csv(csv_path, usecols=['source', 'label'])
    for column in ('source', 'label'):
        split_df[column] = split_df[column].str.lower()
    return split_df


train_df = _load_split('./dataset/final/train.csv')
train_ds = Dataset.from_pandas(train_df)

test_df = _load_split('./dataset/final/test.csv')
test_ds = Dataset.from_pandas(test_df)

dev_df = _load_split('./dataset/final/dev.csv')
dev_ds = Dataset.from_pandas(dev_df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each split keeps its own row labels, so labels repeat across splits and
# Dataset.from_pandas stores that index as an extra `__index_level_0__` column.
all_df = pd.concat((train_df, test_df, dev_df), ignore_index=True)
all_ds = Dataset.from_pandas(all_df)

In [22]:
# Display the (rows, columns) size of the training split.
train_df.shape

(12619, 2)

In [18]:
# Restore the fine-tuned seq2seq model and its tokenizer from the same local
# checkpoint directory; local_files_only=True restricts loading to local files.
model = AutoModelForSeq2SeqLM.from_pretrained('./results_seq2seq/', local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained('./results_seq2seq/', local_files_only=True)

In [65]:
from VisAwareTranslation import postprocessing


def _normalize_query(query):
    """Canonicalise a query for comparison: unify quote style and collapse whitespace."""
    return ' '.join(query.replace('"', "'").split())


def _translate(source):
    """Generate the model output for one source string and return it as plain text.

    Uses the `tokenizer` and `model` currently bound in the notebook.
    """
    input_ids = tokenizer(source, return_tensors="pt", max_length=512, padding=True, truncation=True).input_ids
    outputs = model.generate(input_ids)

    # Position 0 is always dropped, as in the original slicing, which treats it
    # as the decoder start token.
    tokens = tokenizer.convert_ids_to_tokens(outputs[0])[1:]
    # Drop the final token only when it really is EOS. If generation stopped at
    # the length limit instead, the last token is genuine output and must stay.
    if tokens and tokens[-1] == tokenizer.eos_token:
        tokens = tokens[:-1]
    return ''.join(tokens).replace('▁', ' ').strip()


nl_template_cnt = 0
nl_template_match = 0

# Score every test example by exact match after normalisation; print the
# mismatches (source, raw model output, gold label) for manual inspection.
for source, label in zip(test_df['source'], test_df['label']):
    decoded = _translate(source)

    # NOTE(review): postprocessing is given the gold label alongside the model
    # output - confirm the score is not helped by label information.
    pred = postprocessing(label, decoded)

    nl_template_cnt += 1

    if _normalize_query(label) == _normalize_query(pred):
        nl_template_match += 1
    else:
        print(source)
        print(decoded)
        print(label)
        print()

# The counters were previously accumulated but never shown; report the result.
if nl_template_cnt:
    print(f'exact match: {nl_template_match}/{nl_template_cnt} = {nl_template_match / nl_template_cnt:.4f}')
else:
    print('exact match: no test examples were evaluated')

10 . 9 . company_id - int , rank - int , company - str , headquarters - str , main_industry - str , sales_billion - float , profits_billion - float , assets_billion - float , market_value - float | how many companies that are not headquartered in the united states for each main industry ? show me a bar chart , and could you display by the total number from high to low ?
mark bar encoding x main_industry y aggregate count main_industry transform filter main_industry != 'united states' sort y desc
mark bar encoding x main_industry y aggregate count main_industry transform filter headquarters != 'usa' sort y desc

19 . 8 . id - int , name - str , headquarters - str , industry - str , sales_billion - float , profits_billion - float , assets_billion - float , market_value_billion - float | , and order by the y axis in ascending .
mark bar encoding x headquarters y aggregate none market_value_billion transform sort y asc
mark bar encoding x industry y aggregate count industry transform sort 

3 . 8 . cust_id - int , cust_name - str , acc_type - str , acc_bal - int , no_of_loans - int , credit_score - int , branch_id - int , state - str | can you compare the account balances of customers with the letter 'a' in their names using a bar graph , and order by the x axis in desc please .
mark bar encoding x acc_type y aggregate none acc_bal transform filter credit_score = 'a' sort x desc
mark bar encoding x cust_name y aggregate none acc_bal transform filter cust_name like '%a%' sort x desc

9 . 6 . people_id - int , sex - str , name - str , date_of_birth - str , height - float , weight - float | show weight from each date of birth , and i want to rank in asc by the y axis please .
mark bar encoding x sex y aggregate none weight transform sort y asc
mark bar encoding x date_of_birth y aggregate none weight transform sort y asc

15 . 8 . student_id - int , date_of_registration - str , date_of_latest_logon - str , login_name - str , password - str , personal_name - str , middle_name

10 . 9 . company_id - int , rank - int , company - str , headquarters - str , main_industry - str , sales_billion - float , profits_billion - float , assets_billion - float , market_value - float | show the number of companies whose headquarters are not from usa fpr each main industry in a bar chart , show x-axis in ascending order please .
mark bar encoding x main_industry y aggregate count main_industry transform filter main_industry != 'usa fpr' sort x asc
mark bar encoding x main_industry y aggregate count main_industry transform filter headquarters != 'usa' sort x asc

58 . 8 . facid - int , lname - str , fname - str , rank - str , sex - str , phone - int , room - str , building - str | show all the ranks and the number of male and female faculty for each rank in a bar chart , and i want to rank x in asc order please .
mark bar encoding x rank y aggregate count rank transform sort x asc
mark bar encoding x rank y aggregate count rank color sex transform sort x asc

58 . 8 . facid 

50 . 4 . id - int , name - str , dept_name - str , salary - float | find the names and average salaries of all departments whose average salary is greater than 42000 , sort total number in desc order .
mark bar encoding x dept_name y aggregate mean salary transform filter dept_name > 42000 sort y desc
mark bar encoding x dept_name y aggregate mean salary transform sort y desc

4 . 12 . team_id - int , school_id - int , team_name - str , acc_regular_season - str , acc_percent - float , acc_home - str , acc_road - str , all_games - str , all_games_percent - float , all_home - str , all_road - str , all_neutral - str | show me about the proportion of all_games_percent and all_games_percent in a pie chart .
mark arc encoding x all_road y aggregate none all_games_percent
mark arc encoding x team_name y aggregate none all_games_percent

15 . 7 . transaction_id - int , investor_id - int , transaction_type_code - str , date_of_transaction - str , amount_of_transaction - float , share_count - f

16 . 8 . season - float , player - str , position - str , country - int , team - int , draft_pick_number - int , draft_class - str , college - str | what are the draft pick numbers and draft classes for players who play the defender position show bar chart , and could you order x-axis in descending order ?
mark bar encoding x draft_pick_number y aggregate none school transform sort x desc
mark bar encoding x draft_class y aggregate none draft_pick_number transform filter position = "defender" sort x desc

16 . 8 . season - float , player - str , position - str , country - int , team - int , draft_pick_number - int , draft_class - str , college - str | show the draft pick numbers and draft classes of players whose positions are defenders in a bar chart , and list y from high to low order .
mark bar encoding x position y aggregate count position transform filter position = "usa" sort y desc
mark bar encoding x draft_class y aggregate none draft_pick_number transform filter position = "de

7 . 4 . shop_id - int , employee_id - int , start_from - int , is_full_time - str | for the average of employee_id , start_from , visualize the trend .
mark line encoding x start_from y aggregate mean employee_id color is_full_time transform bin x by year
mark line encoding x start_from y aggregate mean employee_id transform bin x by year

50 . 4 . id - int , name - str , dept_name - str , salary - float | display a bar chart for what are the names and average salaries for departments with average salary higher than 42000 ? , list x axis in descending order .
mark bar encoding x name y aggregate mean salary transform filter salary > 42000 sort x desc
mark bar encoding x dept_name y aggregate mean salary transform sort x desc

34 . 8 . stuid - int , lname - str , fname - str , age - int , sex - str , major - int , advisor - int , city_code - str | what are the average ages for male and female students plot them as bar chart , and list by the bars from low to high .
mark bar encoding x s

34 . 8 . stuid - int , lname - str , fname - str , age - int , sex - str , major - int , advisor - int , city_code - str | visualize a bar chart for what are the average ages for male and female students ?
mark bar encoding x sex y aggregate mean age transform filter sex = 'm'
mark bar encoding x sex y aggregate mean age transform

4120 . 9 . player_id - str , yearid - int , votedby - str , ballots - float , needed - float , votes - float , inducted - str , category - str , needed_note - str | i want to see trend of the number of yearid by yearid , and i want to show in desc by the x .
mark line encoding x yearid y aggregate count yearid transform sort x desc bin x by yearid
mark line encoding x yearid y aggregate count yearid transform sort x desc bin x by year

15 . 5 . guest_id - int , gender_code - str , guest_first_name - str , guest_last_name - str , date_of_birth - str | return the date of birth for all the guests with gender code "male" , and count them by a line chart , i want

15 . 7 . apt_id - int , building_id - int , apt_type_code - str , apt_number - str , bathroom_count - int , bedroom_count - int , room_count - int | what are the apartment number and the room count of each apartment visualize by bar chart , sort total number in asc order .
mark bar encoding x apt_type_code y aggregate count apt_type_code transform sort y asc
mark bar encoding x apt_number y aggregate none room_count transform sort y asc

5 . 6 . party_id - int , minister - str , took_office - int , left_office - int , region_id - int , party_name - str | bar graph to show the number of took office from different took office
mark bar encoding x took_office y aggregate count took_office transform
mark bar encoding x took_office y aggregate count took_office transform bin x by weekday

15 . 7 . apt_id - int , building_id - int , apt_type_code - str , apt_number - str , bathroom_count - int , bedroom_count - int , room_count - int | give me a bar chart for room_count of each apt number
mar

5 . 3 . document_type_code - str , document_type_name - str , document_type_description - str | how many document type for different document type description ? visualize with a bar chart , display document_type_description from high to low order .
mark bar encoding x document_type_name y aggregate count document_type_name transform sort x desc
mark bar encoding x document_type_description y aggregate count document_type_description transform sort x desc

15 . 8 . student_id - int , date_of_registration - str , date_of_latest_logon - str , login_name - str , password - str , personal_name - str , middle_name - str , family_name - str | what are the dates of the latest logon of the students with family name "jaskolski" or "langosh" , and count them by a bar chart
mark bar encoding x date_of_latest_logon y aggregate count date_of_latest_logon transform filter personal_name like 'jaskolski' or 'langosh' bin x by weekday
mark bar encoding x date_of_latest_logon y aggregate count date_of_la

58 . 8 . facid - int , lname - str , fname - str , rank - str , sex - str , phone - int , room - str , building - str | stacked bar chart of the total number for with each sex in each rank , and show names from low to high order please .
mark bar encoding x rank y aggregate count rank transform sort x asc
mark bar encoding x rank y aggregate count rank color sex transform sort x asc

15 . 8 . student_id - int , date_of_registration - str , date_of_latest_logon - str , login_name - str , password - str , personal_name - str , middle_name - str , family_name - str | what are the number of the dates of the latest logon of the students with family name "jaskolski" or "langosh" ?
mark bar encoding x date_of_latest_logon y aggregate count date_of_latest_logon transform filter personal_name like 'jaskolski' or longosh
mark bar encoding x date_of_latest_logon y aggregate count date_of_latest_logon transform filter family_name = "jaskolski" or family_name = "langosh" bin x by weekday

58 . 8 . 

15 . 7 . apt_id - int , building_id - int , apt_type_code - str , apt_number - str , bathroom_count - int , bedroom_count - int , room_count - int | return a bar chart on what are the apartment number and the room count of each apartment ? , and rank in ascending by the y axis .
mark bar encoding x apt_type_code y aggregate count apt_type_code transform sort y asc
mark bar encoding x apt_number y aggregate none room_count transform sort y asc

15 . 5 . guest_id - int , gender_code - str , guest_first_name - str , guest_last_name - str , date_of_birth - str | return the date of birth for all the guests with gender code "male" , and count them by a bar chart , i want to order the number of date of birth from high to low order .
mark bar encoding x sex_date_of_birth y aggregate count sex_date_of_birth transform filter gender_last_name = 'male' sort y desc bin x by weekday
mark bar encoding x date_of_birth y aggregate count date_of_birth transform filter gender_code = "male" sort y desc bi

15 . 8 . player_id - int , name - str , position - str , club_id - int , apps - float , tries - float , goals - str , points - float | what is the number of players who have points less than 30 for each position ?
mark arc encoding x position y aggregate count position transform filter points < 30
mark bar encoding x position y aggregate count position transform filter points < 30

107 . 11 . employee_id - int , first_name - str , last_name - str , email - str , phone_number - str , hire_date - str , job_id - str , salary - int , commission_pct - float , manager_id - int , department_id - int | return a scatter on what is the average salary of employees who have a commission percentage that is not null ?
mark point encoding x mean salary y aggregate none commission_pct transform filter commission_pct != "null"
mark point encoding x department_id y aggregate mean salary transform filter commission_pct != "null"

16 . 8 . season - float , player - str , position - str , country - int , t

12 . 8 . station_id - int , name - str , annual_entry_exit - float , annual_interchanges - float , total_passengers - float , location - str , main_services - str , number_of_platforms - int | i want to know the proportion of the total number for each location .
mark arc encoding x location y aggregate sum number_of_platforms transform
mark arc encoding x location y aggregate count location transform

5 . 3 . document_type_code - str , document_type_name - str , document_type_description - str | show the number of document type for different document type description in a bar chart , and show y-axis in ascending order .
mark bar encoding x document_type_name y aggregate count document_type_name transform sort y asc
mark bar encoding x document_type_description y aggregate count document_type_description transform sort y asc

50 . 4 . id - int , name - str , dept_name - str , salary - float | what are the names and average salaries for departments with average salary higher than 42000 p

34 . 8 . stuid - int , lname - str , fname - str , age - int , sex - str , major - int , advisor - int , city_code - str | show the average age for male and female students visualize by bar chart , sort by the names from high to low .
mark bar encoding x sex y aggregate mean age transform filter sex = 'm' sort x desc
mark bar encoding x sex y aggregate mean age transform sort x desc

11 . 4 . code - int , name - str , price - int , manufacturer - int | draw a pie chart for what are the names and prices of products that cost at least 180 , sorted by price decreasing and name ascending ?
mark arc encoding x name y aggregate none price transform filter price >= 180 sort y desc name asc
mark arc encoding x name y aggregate none price transform filter price >= 180 name asc

6 . 9 . roller_coaster_id - int , name - str , park - str , country_id - int , length - float , height - float , speed - float , opened - str , status - str | scatter plot to show length on x axis and height on y axis .


15 . 5 . guest_id - int , gender_code - str , guest_first_name - str , guest_last_name - str , date_of_birth - str | what are the number of dates of birth of all the guests whose gender is "male" ? , list by the y-axis in descending .
mark bar encoding x sex_code y aggregate count sex_code transform filter sex_last_name = "male" sort y desc bin x by weekday
mark bar encoding x date_of_birth y aggregate count date_of_birth transform filter gender_code = "male" sort y desc bin x by weekday

6 . 5 . exhibition_id - int , year - int , theme - str , artist_id - int , ticket_price - float | show me how many year by year in a histogram , could you display y axis in asc order ?
mark bar encoding x year y aggregate count year transform sort y asc
mark bar encoding x year y aggregate count year transform sort y asc bin x by weekday

34 . 8 . stuid - int , lname - str , fname - str , age - int , sex - str , major - int , advisor - int , city_code - str | a bar chart about what are the average age

58 . 8 . facid - int , lname - str , fname - str , rank - str , sex - str , phone - int , room - str , building - str | how many faculty members do we have for each rank and gender ? plot them as bar chart , rank by the y-axis in descending please .
mark bar encoding x rank y aggregate count rank transform sort y desc
mark bar encoding x rank y aggregate count rank color sex transform sort y desc

58 . 8 . facid - int , lname - str , fname - str , rank - str , sex - str , phone - int , room - str , building - str | show all the ranks and the number of male and female faculty for each rank in a bar chart , rank by the y in ascending .
mark bar encoding x rank y aggregate count rank transform sort y asc
mark bar encoding x rank y aggregate count rank color sex transform sort y asc

10 . 9 . company_id - int , rank - int , company - str , headquarters - str , main_industry - str , sales_billion - float , profits_billion - float , assets_billion - float , market_value - float | find the ra

10 . 5 . technician_id - float , name - str , team - str , starting_year - float , age - int | a bar chart for what are the number of the names of the technicians by ascending order of age ?
mark bar encoding x name y aggregate count name transform sort y asc
mark bar encoding x name y aggregate count name transform sort age asc

4 . 3 . cname - str , state - str , enr - int | return a bar graph for the name of the school that has the smallest enrollment in each state , could you order by the x axis in asc please ?
mark bar encoding x name y aggregate min enr transform sort x asc
mark bar encoding x cname y aggregate min enr transform sort x asc

15 . 5 . guest_id - int , gender_code - str , guest_first_name - str , guest_last_name - str , date_of_birth - str | return the date of birth for all the guests with gender code "male" , and count them by a bar chart , list in ascending by the y axis .
mark bar encoding x gender_code y aggregate count gender_code transform filter gender_last_n

4 . 3 . cname - str , state - str , enr - int | return a bar graph for the name of the school that has the smallest enrollment in each state , and list total number in asc order .
mark bar encoding x name y aggregate min enr transform sort y asc
mark bar encoding x cname y aggregate min enr transform sort y asc

15 . 7 . transaction_id - int , investor_id - int , transaction_type_code - str , date_of_transaction - str , amount_of_transaction - float , share_count - float , other_details - float | a bar chart for what are the number of the dates of transactions with at least 100 share count or amount bigger than 100 ? , i want to display y-axis in ascending order .
mark bar encoding x date_of_transaction y aggregate count date_of_transaction transform filter share_count > 100 sort y asc bin x by weekday
mark bar encoding x date_of_transaction y aggregate count date_of_transaction transform filter share_count >= 100 or amount_of_transaction >= 100 sort y asc bin x by weekday

16 . 8 . se

50 . 4 . id - int , name - str , dept_name - str , salary - float | display a bar chart for what are the names and average salaries for departments with average salary higher than 42000 ?
mark bar encoding x name y aggregate mean salary transform filter salary > 42000
mark bar encoding x dept_name y aggregate mean salary transform

5 . 3 . document_type_code - str , document_type_name - str , document_type_description - str | compute the total the number of document type description across document type description as a pie chart .
mark arc encoding x document_type_name y aggregate count document_type_name transform
mark arc encoding x document_type_description y aggregate count document_type_description transform

16 . 8 . season - float , player - str , position - str , country - int , team - int , draft_pick_number - int , draft_class - str , college - str | what are the draft pick numbers and draft classes for players who play the defender position . show bar chart .
mark bar encod

15 . 9 . problem_log_id - int , assigned_to_staff_id - int , problem_id - int , problem_category_code - str , problem_status_code - str , log_entry_date - str , log_entry_description - str , log_entry_fix - str , other_log_details - str | find all the ids and dates of the logs for the problem whose id is 10 plot them as bar chart , i want to sort bars in asc order please .
mark bar encoding x log_entry_description y aggregate none log_entry_description transform sort x asc
mark bar encoding x log_entry_date y aggregate none problem_log_id transform filter problem_id = 10 sort x asc

15 . 7 . apt_id - int , building_id - int , apt_type_code - str , apt_number - str , bathroom_count - int , bedroom_count - int , room_count - int | give me a bar chart for room_count of each apt number , and rank in ascending by the y axis .
mark bar encoding x apt_type_code y aggregate none room_count transform sort y asc
mark bar encoding x apt_number y aggregate none room_count transform sort y asc

15 

3 . 8 . cust_id - int , cust_name - str , acc_type - str , acc_bal - int , no_of_loans - int , credit_score - int , branch_id - int , state - str | find the name and account balance of the customer whose name includes the letter ‘a’ . visualize them using a bar chart .
mark bar encoding x acc_type y aggregate none acc_bal transform filter credit_score = "a"
mark bar encoding x cust_name y aggregate none acc_bal transform filter cust_name like '%a%'

34 . 8 . stuid - int , lname - str , fname - str , age - int , sex - str , major - int , advisor - int , city_code - str | visualize a bar chart for what are the average ages for male and female students ? , could you rank from high to low by the names ?
mark bar encoding x sex y aggregate mean age transform filter sex = 'm' sort x desc
mark bar encoding x sex y aggregate mean age transform sort x desc

15 . 5 . guest_id - int , gender_code - str , guest_first_name - str , guest_last_name - str , date_of_birth - str | what are dates of birt

3 . 8 . cust_id - int , cust_name - str , acc_type - str , acc_bal - int , no_of_loans - int , credit_score - int , branch_id - int , state - str | what are the names and account balances of customers with the letter a in their names , show by the acc_bal in ascending .
mark bar encoding x acc_type y aggregate none acc_bal transform filter credit_score = "usa" sort x asc
mark bar encoding x cust_name y aggregate none acc_bal transform filter cust_name like '%a%' sort y asc

4 . 3 . cname - str , state - str , enr - int | return a bar graph for the name of the school that has the smallest enrollment in each state , and i want to rank by the y-axis in desc .
mark bar encoding x name y aggregate min enr transform sort y desc
mark bar encoding x cname y aggregate min enr transform sort y desc

10 . 5 . cinema_id - int , name - str , openning_year - int , capacity - int , location - str | plot sum capacity over openning year in a line chart
mark line encoding x openning_year y aggregate sum

6 . 6 . device_id - int , device - str , carrier - str , package_version - str , applications - str , software_platform - str | what are the different software platforms for devices , and how many devices have each .
mark bar encoding x software_platform y aggregate count software_platform transform
mark arc encoding x software_platform y aggregate count software_platform transform

15 . 13 . dog_id - int , owner_id - int , abandoned_yn - int , breed_code - str , size_code - str , name - str , age - int , date_of_birth - str , gender - int , weight - float , date_arrived - str , date_adopted - str , date_departed - str | create a bar chart showing the number of date departed across date departed , and i want to show in ascending by the y-axis .
mark bar encoding x date_departed y aggregate count date_departed transform sort y asc
mark bar encoding x date_departed y aggregate count date_departed transform sort y asc bin x by weekday

10 . 6 . movie_id - int , title - str , year - int , 

15 . 7 . apt_id - int , building_id - int , apt_type_code - str , apt_number - str , bathroom_count - int , bedroom_count - int , room_count - int | return the apartment number and the number of rooms for each apartment by a bar chart .
mark bar encoding x apt_type_code y aggregate count apt_type_code transform
mark bar encoding x apt_number y aggregate none room_count

15 . 9 . problem_log_id - int , assigned_to_staff_id - int , problem_id - int , problem_category_code - str , problem_status_code - str , log_entry_date - str , log_entry_description - str , log_entry_fix - str , other_log_details - str | find all the ids and dates of the logs for the problem whose id is 10 . plot them as bar chart .
mark bar encoding x log_entry_description y aggregate none log_entry_description transform filter log_entry_description != 10 bin x by weekday
mark bar encoding x log_entry_date y aggregate none problem_log_id transform filter problem_id = 10

34 . 8 . stuid - int , lname - str , fname - st

34 . 8 . stuid - int , lname - str , fname - str , age - int , sex - str , major - int , advisor - int , city_code - str | visualize a bar chart for what are the average ages for male and female students ? , and order y axis in desc order .
mark bar encoding x sex y aggregate mean age transform filter sex = 'm' sort y desc
mark bar encoding x sex y aggregate mean age transform sort y desc

5 . 3 . document_type_code - str , document_type_name - str , document_type_description - str | show the number of document type for different document type description in a bar chart .
mark bar encoding x document_type_name y aggregate count document_type_name transform
mark bar encoding x document_type_description y aggregate count document_type_description transform

15 . 5 . guest_id - int , gender_code - str , guest_first_name - str , guest_last_name - str , date_of_birth - str | what are the number of dates of birth of all the guests whose gender is "male" ? , and sort y in ascending order plea

In [None]:
# Pick the test example whose row label is 100 for a one-off inspection.
source = test_df.loc[100, 'source']

In [None]:
# Run the currently loaded tokenizer/model on the single example in `source`.
input_ids = tokenizer(source, return_tensors="pt", max_length=512, padding=True, truncation=True).input_ids
outputs = model.generate(input_ids)

# Position 0 is always dropped, as in the original slicing, which treats it as
# the decoder start token.
generated_tokens = tokenizer.convert_ids_to_tokens(outputs[0])[1:]
# Drop the final token only when it really is EOS. If generation stopped at the
# length limit instead, the last token is genuine output and must stay.
if generated_tokens and generated_tokens[-1] == tokenizer.eos_token:
    generated_tokens = generated_tokens[:-1]
decoded = ''.join(generated_tokens).replace('▁', ' ').strip()

In [None]:
# Display the input string that was fed to the model.
source

In [None]:
# Display the text decoded from the model output for `source`.
decoded

In [None]:
# Switch to the causal LM checkpoint. This rebinds `model` and `tokenizer`, so
# the seq2seq pair loaded earlier under the same names is no longer reachable;
# reload it before re-running any seq2seq cell.
model = AutoModelForCausalLM.from_pretrained('./results_causal/', local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained('./results_causal/', local_files_only=True)

In [None]:
# Partial query (schema header + unfinished question) whose next token the
# causal LM should suggest.
source = '10 . 9 . company_id - int , rank - int , company - str , headquarters - str , main_industry - str , sales_billion - float , profits_billion - float , assets_billion - float , market_value - float | what is the market value of every'

input_ids = tokenizer(source, return_tensors="pt").input_ids

# Inference only: run the forward pass without autograd so no graph is built
# and the logits do not hold on to intermediate activations.
with torch.no_grad():
    # Keep only the scores at the last prompt position, i.e. the next token.
    logits = model(input_ids).logits[:, -1, :]

# argsort is ascending, so the last five entries are the five highest-scoring
# token ids, ordered from least to most likely.
pred_ids = torch.argsort(logits)[0, -5:]
pred_words = [tokenizer.decode(pred_id) for pred_id in pred_ids]

In [None]:
# Display the decoded candidate next tokens (ascending score order).
pred_words

In [None]:
# Ids of the five highest-scoring next tokens, ordered from least to most likely.
pred_ids = torch.argsort(logits)[0, -5:]
# Normalise over the vocabulary before indexing so `probs` really holds
# probabilities; indexing `logits` directly only returned the raw scores.
probs = torch.softmax(logits, dim=-1)[0][pred_ids]
probs

In [None]:
# Display the raw (unnormalised) scores of the selected candidate tokens.
logits[0][pred_ids]

In [None]:
# Display the `probs` tensor computed in an earlier cell.
probs

In [None]:
# Display the vocabulary ids of the selected candidate tokens.
pred_ids

In [1]:
from nlplot import Nlplot
import pandas as pd

In [2]:
# Build the Nlplot helper from the two local checkpoint directories.
# NOTE(review): presumably the first path is the seq2seq model and the second
# the causal model - confirm against nlplot.Nlplot's signature.
nlplot = Nlplot('./results_seq2seq/', './results_causal/')

In [61]:
# Load the Formula 1 drivers table that the Nlplot examples below operate on.
df = pd.read_csv('./dataset/database/formula_1_drivers.csv')

In [62]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,driverId,driverRef,number,code,forename,surname,dob,nationality,url
0,1,hamilton,44.0,HAM,Lewis,Hamilton,07/01/1985,British,http://en.wikipedia.org/wiki/Lewis_Hamilton
1,2,heidfeld,,HEI,Nick,Heidfeld,10/05/1977,German,http://en.wikipedia.org/wiki/Nick_Heidfeld
2,3,rosberg,6.0,ROS,Nico,Rosberg,27/06/1985,German,http://en.wikipedia.org/wiki/Nico_Rosberg
3,4,alonso,14.0,ALO,Fernando,Alonso,29/07/1981,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso
4,5,kovalainen,,KOV,Heikki,Kovalainen,19/10/1981,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen


In [36]:
# Hand the loaded frame to Nlplot. Re-run this cell whenever `df` is reloaded,
# otherwise later queries keep using whatever frame was passed last.
nlplot.specify_dataset(df)

In [42]:
# Ask the causal model for continuations of a partial query.
# NOTE(review): the query names film columns (gross_in_dollar, Studio) and the
# recorded output shows a film schema, while the frame loaded above is the
# drivers table - this cell appears to have been run against a different `df`;
# re-run the notebook top to bottom to confirm.
nlplot.causal('plot sum of gross_in_dollar by Studio')

13 . 5 . film_id - int , title - str , studio - str , director - str , gross_in_dollar - int | plot how many title by studio


[' in', ' as', ' over', ' across', ' divided']

In [63]:
# Translate a natural-language request into a chart specification.
# NOTE(review): the recorded output still shows the film schema rather than the
# drivers columns, which suggests specify_dataset was not re-run after `df` was
# reloaded - re-run the cells in order before trusting this result.
nlplot.seq2seq('plot sum of number by surname')

13 . 5 . film_id - int , title - str , studio - str , director - str , gross_in_dollar - int | plot sum of number by surname


'mark bar encoding x nominee y aggregate sum number transform'

In [51]:
# Inspect how a full model input (schema header + query) is split into tokens.
# NOTE(review): `tokenizer` is not created in this part of the notebook; this
# relies on a tokenizer loaded by an earlier cell.
tokenizer.tokenize('13 . 5 . film_id - int , title - str , studio - str , director - str , gross_in_dollar - int | plot sum of gross_in_dollar by Studio')

['▁13',
 '▁',
 '.',
 '▁5',
 '▁',
 '.',
 '▁film',
 '_',
 'id',
 '▁-',
 '▁in',
 't',
 '▁',
 ',',
 '▁title',
 '▁-',
 '▁str',
 '▁',
 ',',
 '▁studio',
 '▁-',
 '▁str',
 '▁',
 ',',
 '▁director',
 '▁-',
 '▁str',
 '▁',
 ',',
 '▁gross',
 '_',
 'in',
 '_',
 'd',
 'ol',
 'lar',
 '▁-',
 '▁in',
 't',
 '▁',
 '|',
 '▁plot',
 '▁sum',
 '▁of',
 '▁gross',
 '_',
 'in',
 '_',
 'd',
 'ol',
 'lar',
 '▁by',
 '▁Studi',
 'o']

In [64]:
# Display the training input whose row label is 100 as a format reference.
train_df.loc[100, 'source']

'10 . 5 . cinema_id - int , name - str , openning_year - int , capacity - int , location - str | bar chart of how many openning year from each openning year , i want to show names in ascending order please .'