# Spark and ChatGPT

In [1]:
import findspark
# findspark.init() is called ahead of the pyspark imports; it is what makes a
# local Spark installation importable when pyspark is not already on sys.path.
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Single local session that uses every available core ('local[*]').
# getOrCreate() hands back an already-running session if there is one.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Spark-Example-30-ChatGPT')
    .getOrCreate()
)

You can create an OpenAI API key on the following page: https://platform.openai.com/account/api-keys

In [2]:
# Bring in deps
import os
import warnings

# Install the filter BEFORE importing LangChain so that DeprecationWarnings
# raised while those modules are being imported are silenced as well.
warnings.filterwarnings("ignore", category=DeprecationWarning)

from langchain.llms import OpenAI
from pyspark.sql import SparkSession
from langchain_experimental.agents import create_spark_dataframe_agent

# API key lookup:
#   1. working/apikey.py, if present. Create a file named apikey.py and add the
#      following line: apikey = 'sk-xxxxxxxxxxxxxxxxxxxxxxxx' (where xxxx is
#      your api key). Keep that file out of version control.
#   2. Otherwise, an OPENAI_API_KEY that is already exported in the environment.
try:
    from working.apikey import apikey
except ImportError as exc:
    apikey = os.environ.get('OPENAI_API_KEY')
    if not apikey:
        # Same exception family as the original failure, with an actionable message.
        raise ImportError(
            "No OpenAI API key found: create working/apikey.py containing "
            "apikey = '<your key>' or set the OPENAI_API_KEY environment variable."
        ) from exc

# The OpenAI client reads the key from this environment variable.
os.environ['OPENAI_API_KEY'] = apikey

In [3]:
# Build the relative path with os.path.join instead of a hard-coded "\\"
# separator so the notebook also runs on Linux/macOS; on Windows the resulting
# string is identical to the original literal.
csv_file_path = os.path.join("data", "taxi-data-sorted-verysmall-header.csv")

# The first line of the file holds the column names; inferSchema makes Spark
# scan the data to pick column types instead of reading everything as strings.
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Print the first rows (show() defaults to 20) as a quick visual check.
df.show()

+--------------------+--------------------+-------------------+-------------------+---------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|           medallion|        hack_license|    pickup_datetime|   dropoff_datetime|trip_time|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|total_amount|
+--------------------+--------------------+-------------------+-------------------+---------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|07290D3599E7A0D62...|E7750A37CAB07D0DF...|2013-01-01 00:00:00|2013-01-01 00:02:00|      120|         0.44|      -73.956528|      40.716976|        -73.96244|       40.715008|         CSH|        3.5|      0.5|    0.5|       0.0|   

# Create a Spark DataFrame LangChain agent

In [4]:
from langchain_community.chat_models import ChatOpenAI

# Optional alternative (left disabled): a GPT-4 chat model. Nothing below uses
# `llmc`; the agent is built from `llm`.
# llmc = ChatOpenAI(model_name = "gpt-4-turbo-preview", temperature=0)

# Active model: the OpenAI completion wrapper with its default model and
# temperature fixed at 0.
llm = OpenAI(
    temperature=0,
)


In [5]:
# Display the name of the model that `llm` is configured with.
llm.model_name

'gpt-3.5-turbo-instruct'

In [6]:
# Build a LangChain agent bound to the Spark DataFrame `df`, driven by `llm`.
# verbose=True prints the agent's intermediate Thought/Action/Observation steps.
# NOTE(review): the recorded traces show the agent running model-generated
# Python through a `python_repl_ast` tool — only use it in an environment where
# executing such code is acceptable.
agent = create_spark_dataframe_agent(llm=llm, df=df, verbose=True)

# Run the agent

In [7]:
# Sanity check: ask the agent for the number of rows in `df`.
# NOTE(review): in the recorded run the agent answered via len(df.collect()),
# which pulls every row to the driver; df.count() is the cheaper way to verify.
agent.run("how many rows are there?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to count the number of rows in the dataframe
Action: python_repl_ast
Action Input: len(df.collect())[0m
Observation: [36;1m[1;3m10000[0m
Thought:[32;1m[1;3m I need to print the result
Action: python_repl_ast
Action Input: print(len(df.collect()))[0m
Observation: [36;1m[1;3m10000
[0m
Thought:[32;1m[1;3m I now know the final answer
Final Answer: There are 10000 rows in the dataframe.[0m

[1m> Finished chain.[0m


'There are 10000 rows in the dataframe.'

In [8]:
# Ask for the ten taxis with the most distinct drivers.
# NOTE(review): the recorded run grouped by `medallion` and counted distinct
# `hack_license` values, but its final answer only points at "the output above"
# without listing any rows — confirm the result by showing the DataFrame.
agent.run("the top ten taxis that have had the largest number of drivers")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to group the dataframe by medallion and count the number of unique hack licenses for each medallion. Then I can sort the results in descending order and take the top ten.
Action: python_repl_ast
Action Input: df.groupBy("medallion").agg(countDistinct("hack_license").alias("num_drivers")).orderBy(desc("num_drivers")).limit(10)[0m
Observation: [36;1m[1;3mNameError: name 'countDistinct' is not defined[0m
Thought:[32;1m[1;3m I need to import the necessary functions from pyspark.sql.functions
Action: python_repl_ast
Action Input: from pyspark.sql.functions import countDistinct, desc[0m
Observation: [36;1m[1;3m[0m
Thought:[32;1m[1;3m Now I can try the previous action again
Action: python_repl_ast
Action Input: df.groupBy("medallion").agg(countDistinct("hack_license").alias("num_drivers")).orderBy(desc("num_drivers")).limit(10)[0m
Observation: [36;1m[1;3mDataFrame[medallion: string, num_drivers: bigint

'The top ten taxis that have had the largest number of drivers are the first ten rows of the output above.'

In [9]:
# Ask for the ten drivers with the highest average earnings per minute of trip time.
# NOTE(review): the first recorded attempt averaged `total_amount` and
# `trip_time` separately instead of forming a per-minute ratio — confirm that
# the final ranking really uses money per minute before relying on it.
agent.run("who are the top 10 best drivers are in terms of their average earned money per minute spent carrying a customer")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to calculate the average earned money per minute for each driver and then sort the drivers in descending order to find the top 10.
Action: python_repl_ast
Action Input: df.groupby('hack_license').agg({'total_amount': 'mean', 'trip_time': 'mean'}).sort('total_amount', ascending=False).show(10)[0m
Observation: [36;1m[1;3mAnalysisException: cannot resolve 'total_amount' given input columns: [avg(total_amount), avg(trip_time), hack_license];
'Sort ['total_amount DESC NULLS LAST], true
+- Aggregate [hack_license#17], [hack_license#17, avg(trip_time#20) AS avg(trip_time)#305, avg(total_amount#32) AS avg(total_amount)#306]
   +- Relation [medallion#16,hack_license#17,pickup_datetime#18,dropoff_datetime#19,trip_time#20,trip_distance#21,pickup_longitude#22,pickup_latitude#23,dropoff_longitude#24,dropoff_latitude#25,payment_type#26,fare_amount#27,surcharge#28,mta_tax#29,tip_amount#30,tolls_amount#31,total_amount#32] 

'The top 10 best drivers in terms of their average earned money per minute spent carrying a customer are:\n1. 0CA5DC811EEB2A23F...\n2. 326A602DF40DC0063...\n3. 90253C37D48D6BDCA...\n4. B391B47E5CFAA2498...\n5. 0A25F8C6AB3435095...\n6. 4716E14488BD15256...\n7. B72DEE45763A098CE...\n8. 632F6951F36191B8B...\n9. 0B553AE759E33550B...\n10. 08DB3F9FCF01530D6...'

In [10]:
# Ask for the cash vs. card split of payments, as percentages.
# NOTE(review): in the recorded run every `payment_type` other than 'CSH' was
# counted as a card payment — verify that no other payment types occur.
agent.run("How many percent of taxi customers pay with cash, and how many percent use electronic cards?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find the total number of customers and the number of customers who pay with cash and electronic cards.
Action: python_repl_ast
Action Input: df.count()[0m
Observation: [36;1m[1;3m10000[0m
Thought:[32;1m[1;3m This is the total number of customers.
Action: python_repl_ast
Action Input: df.filter(df.payment_type == 'CSH').count()[0m
Observation: [36;1m[1;3m5998[0m
Thought:[32;1m[1;3m This is the number of customers who pay with cash.
Action: python_repl_ast
Action Input: df.filter(df.payment_type != 'CSH').count()[0m
Observation: [36;1m[1;3m4002[0m
Thought:[32;1m[1;3m This is the number of customers who use electronic cards.
Action: python_repl_ast
Action Input: (df.filter(df.payment_type == 'CSH').count() / df.count()) * 100[0m
Observation: [36;1m[1;3m59.98[0m
Thought:[32;1m[1;3m This is the percentage of customers who pay with cash.
Action: python_repl_ast
Action Input: (df.filter(df.

'59.98% of taxi customers pay with cash, and 40.02% use electronic cards.'

In [11]:
# `F` is not referenced in this cell; the import is kept in case later cells use it.
import pyspark.sql.functions as F

# Ask for the ten drivers with the highest earnings per mile.
# The prompt previously read "taxi divers" and ended with a stray space; the
# wording is corrected so the model receives the intended question.
# NOTE(review): a previously recorded run computed total_amount / trip_distance
# per trip row rather than per driver, so very short trips dominated the list —
# check the agent's steps before trusting the ranking.
agent.run("Find the top-10 most efficient taxi drivers based on earned money per mile.")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to calculate the earned money per mile for each taxi driver and then sort the results to find the top-10.
Action: python_repl_ast
Action Input: df.withColumn('earned_money_per_mile', df['total_amount']/df['trip_distance']).sort('earned_money_per_mile', ascending=False).show(10)[0m
Observation: [36;1m[1;3m+--------------------+--------------------+-------------------+-------------------+---------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+---------------------+
|           medallion|        hack_license|    pickup_datetime|   dropoff_datetime|trip_time|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|total_amount|earned_money_per_mile|
+--------------------+--------------------+---------------

'The top-10 most efficient taxi drivers based on earned money per mile are:\n1. B7C3A1C10F973348B... with an earned money per mile of 2720.0\n2. 80AC2E141CB5B9D8B... with an earned money per mile of 2625.0\n3. 6210101AB5620CDF4... with an earned money per mile of 1850.0\n4. 9786E56B89705E96F... with an earned money per mile of 1125.0\n5. F26AEE7D5BBAFF3B2... with an earned money per mile of 1050.0\n6. D2DDDAC4772125372... with an earned money per mile of 1004.1666666666667\n7. 2E1685CA576A1D035... with an earned money per mile of 979.1666666666667\n8. 96FCD5A1B4C5C1527... with an earned money per mile of 900.0\n9. CF72B'

In [12]:
# Ask for the mean and median of `tip_amount`.
# NOTE(review): the recorded trace shows the agent's describe()/agg() attempts
# failing (e.g. "Undefined function: 'median'") while the final answer still
# reports 0.0 for both — verify the numbers independently, for example with
# df.approxQuantile for the median.
agent.run("What are the mean, median of tip amount?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to use the describe function to get the mean and median of tip amount
Action: python_repl_ast
Action Input: df.describe('tip_amount')[0m
Observation: [36;1m[1;3mDataFrame[summary: string, tip_amount: string][0m
Thought:[32;1m[1;3m I need to select the mean and median from the dataframe
Action: python_repl_ast
Action Input: df.describe('tip_amount').select('mean', 'median')[0m
Observation: [36;1m[1;3mAnalysisException: cannot resolve 'mean' given input columns: [summary, tip_amount];
'Project ['mean, 'median]
+- LocalRelation [summary#1248, tip_amount#1249]
[0m
Thought:[32;1m[1;3m I need to use the agg function to calculate the mean and median
Action: python_repl_ast
Action Input: df.agg({'tip_amount': 'mean', 'tip_amount': 'median'})[0m
Observation: [36;1m[1;3mAnalysisException: Undefined function: 'median'. This function is neither a registered temporary function nor a permanent function regist

'The mean of tip amount is 0.0 and the median is also 0.0.'

In [12]:
# Ask the agent to fit a linear model of total amount on trip duration and to
# predict the amount for a 10-minute ride.
# NOTE(review): the recorded run passed the raw 'trip_time' column as
# featuresCol, hit an IllegalArgumentException on fit, and ended with
# "Agent stopped due to iteration limit or time limit." — no prediction was produced.
agent.run("Create a linear model to predict the total paid amounts of Taxi rides based on duration. What will be the predicted total paid amount if the duration is 10 minutes?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to use a linear regression model to predict the total paid amount based on the duration of the taxi ride.
Action: python_repl_ast
Action Input: from pyspark.ml.regression import LinearRegression[0m
Observation: [36;1m[1;3m[0m
Thought:[32;1m[1;3m I need to create a linear regression object and fit it to my dataframe.
Action: python_repl_ast
Action Input: lr = LinearRegression(featuresCol='trip_time', labelCol='total_amount')[0m
Observation: [36;1m[1;3m[0m
Thought:[32;1m[1;3m I need to split my data into training and testing sets.
Action: python_repl_ast
Action Input: train_data, test_data = df.randomSplit([0.7, 0.3])[0m
Observation: [36;1m[1;3m[0m
Thought:[32;1m[1;3m I need to fit my model to the training data.
Action: python_repl_ast
Action Input: lr_model = lr.fit(train_data)[0m
Observation: [36;1m[1;3mIllegalArgumentException: requirement failed: Column trip_time must be of type class or

'Agent stopped due to iteration limit or time limit.'

In [13]:
# Ask the agent to fit a linear model of total amount on several trip columns.
# NOTE(review): the recorded final answer only describes a next step ("I need
# to create a features column ...") rather than a fitted model — treat this
# cell as not having produced a result.
agent.run("Create a linear model to predict the total paid amounts of Taxi rides based on duration, trip_distance, fare_amount, and tolls_amount.")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to import the necessary libraries and functions for creating a linear model.
Action: python_repl_ast
Action Input: import pyspark[0m
Observation: [36;1m[1;3m[0m
Thought:[32;1m[1;3m I need to create a SparkSession to work with the dataframe.
Action: python_repl_ast
Action Input: from pyspark.sql import SparkSession[0m
Observation: [36;1m[1;3m[0m
Thought:[32;1m[1;3m I need to create a dataframe with the necessary columns for the linear model.
Action: python_repl_ast
Action Input: df_model = df.select("trip_time", "trip_distance", "fare_amount", "tolls_amount", "total_amount")[0m
Observation: [36;1m[1;3m[0m
Thought:[32;1m[1;3m I need to import the necessary functions for creating a linear regression model.
Action: python_repl_ast
Action Input: from pyspark.ml.regression import LinearRegression[0m
Observation: [36;1m[1;3m[0m
Thought:[32;1m[1;3m I need to split the data into training and tes

'I need to create a features column that combines the input features and transform the data before fitting and evaluating the linear regression model.'