In [1]:
!pip install openai awswrangler --quiet


import pandas as pd
import numpy as np
import datetime

import os
import openai

import awswrangler as wr

In [2]:
# Read the OpenAI API key from a file kept outside the notebook so the secret
# never appears in a cell or in saved output.
with open('/home/ec2-user/SageMaker/open_key.txt') as key_file:
    # The context manager closes the handle; strip() drops the trailing newline
    # that would otherwise become part of the credential.
    open_key = key_file.read().strip()

# Expose the key through the environment and on the openai module.
os.environ["OPENAI_API_KEY"] = open_key
openai.api_key = os.environ['OPENAI_API_KEY']

In [3]:
from openai import OpenAI
# Client object used by the chat-completion requests further down.
# NOTE(review): OpenAI() is called with no arguments, so it presumably picks up
# the OPENAI_API_KEY environment variable set earlier in this notebook; confirm
# against the openai client documentation.
client = OpenAI()

In [4]:
# Natural-language request that the model should turn into SQL.
query_string = "Write a SQL query that gets all machines with over 600 speed difference. Return timestamp, machine ID, and speed difference. Sort by highest speed difference. Top 20 results"

# System prompt: the task description followed by the table definition the
# model must write queries against. The definition ends with a closing
# parenthesis so the model is shown well-formed DDL.
system_prompt = (
    "Given the following SQL table, your job is to write queries given a user's request \n"
    " CREATE TABLE sample_extended (\n"
    " timestamp DateTime,\n"
    " speed_desired Int,\n"
    " ambient_temperature Float,\n"
    " ambient_pressure Float,\n"
    " speed Float,\n"
    " temperature Float,\n"
    " pressure Float,\n"
    " machineid String,\n"
    " speed_difference Float\n"
    ");"
)

response = client.chat.completions.create(
    model='gpt-4',
    messages=[
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": query_string,
        }
    ],
    temperature=0.1,
    max_tokens=1000,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
)

# Text of the first choice; a later cell passes it to Athena as the SQL to run.
output_query = response.choices[0].message.content
# Completed in 1 second

In [5]:
# Echo the generated SQL so it can be inspected before it is run.
output_query

'SELECT timestamp, machineid, speed_difference\nFROM sample_extended\nWHERE speed_difference > 600\nORDER BY speed_difference DESC\nLIMIT 20;'

In [6]:
# Run the generated SQL against the 'capstone' Athena database and keep the
# result in `df` for display and for the summarization request.
athena_options = {"database": 'capstone', "ctas_approach": True}
df = wr.athena.read_sql_query(sql=output_query, **athena_options)

In [7]:
# Display the rows returned by the Athena query.
df

Unnamed: 0,timestamp,machineid,speed_difference
0,2017-06-20 00:40:12.000,M_0259,770.75
1,2017-06-09 14:59:33.000,M_0084,770.31
2,2017-06-21 08:10:30.000,M_0014,770.12
3,2017-06-22 03:56:08.000,M_0776,767.65
4,2017-06-21 08:10:29.000,M_0014,743.92
5,2017-06-09 14:59:32.000,M_0084,743.2
6,2017-06-06 16:46:40.000,M_0084,741.44
7,2017-06-20 00:40:11.000,M_0259,738.67
8,2017-06-22 03:56:07.000,M_0776,737.93
9,2017-06-21 08:10:28.000,M_0014,726.27


In [8]:
# System prompt for the summarization request: domain background, the task,
# and the SQL query that produced the rows the model will be shown.
context = (
    "Here is the context: We are looking at industrial factory machine data. "
    "Speed differences of 232 and over are worse as you get higher. "
    "You will be provided a string representation of a dataframe with SQL query results. "
    "Your job is to summarize these results and give recommendations for how to address these issues. "
    "Here is the corresponding SQL query: "
    + output_query
)

In [9]:
# Ask the model to summarize the query results. The system message carries the
# domain context and the SQL; the user message carries the rows themselves.
# to_string() renders every row and column, whereas str(df) follows the pandas
# display limits and can elide part of a larger frame.
results_text = df.to_string()

response = client.chat.completions.create(
    model='gpt-4',
    messages=[
        {
            "role": "system",
            "content": context
        },
        {
            "role": "user",
            # Separate the instruction from the table so the two do not run together.
            "content": "Please give me a summary of these results:\n" + results_text
        }
    ],
    temperature=0.1,
    max_tokens=1000,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
)

# Keep the summary under its own name. Assigning it to `output_query` would
# overwrite the SQL text, so re-running the Athena or context cells afterwards
# would operate on prose instead of a query.
output_summary = response.choices[0].message.content

In [10]:
# print() is used so the line breaks in the model's reply are rendered instead
# of being shown as escape sequences in a string repr.
print(response.choices[0].message.content)

The SQL query results show the top 20 instances where the speed difference exceeded 600 in the sample_extended table. The speed differences range from 703.59 to 770.75. The machine with the highest speed difference of 770.75 is M_0259, recorded on 2017-06-20 at 00:40:12.000. 

Machine M_0084, M_0014, M_0259, and M_0776 appear multiple times in the top 20 list, indicating that these machines have recurring speed difference issues. 

To address these issues, I recommend the following:

1. Investigate the machines M_0084, M_0014, M_0259, and M_0776 for any mechanical issues that could be causing these high speed differences. This could involve checking the machines' components, software, and their operational environment.

2. Review the operational procedures and maintenance schedules for these machines. If the machines are not being properly maintained or are being used in ways they were not designed for, it could lead to these high speed differences.

3. Consider implementing real-time 