# Data Analyst Agent using LLaMA 4 Maverick (Together.ai)

In [1]:
!pip install together openpyxl python-docx pdfplumber pytesseract pillow matplotlib seaborn pandas

Collecting together
  Downloading together-1.5.13-py3-none-any.whl.metadata (15 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting eval-type-backport<0.3.0,>=0.1.3 (from together)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting typer<0.16,>=0.9 (from together)
  Downloading typer-0.15.4-py3-none-any.whl.metadata (15 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     

In [17]:
import os
import pandas as pd
import pdfplumber
import pytesseract
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from docx import Document
import together
import io

In [18]:
# Read the Together.ai key from the environment instead of hard-coding it, so the
# secret is never saved with (or committed alongside) the notebook.
together.api_key = os.environ.get("TOGETHER_API_KEY")
if not together.api_key:
    # Fail here with a clear message rather than later with a 401 from the API.
    raise RuntimeError(
        "TOGETHER_API_KEY is not set. Export it before starting Jupyter "
        "(or set os.environ['TOGETHER_API_KEY'] in an unsaved cell) and re-run."
    )

In [19]:
def extract_text_from_docx(file):
    """Return the text of a Word document as a single string.

    Args:
        file: Path or file-like object passed straight to ``Document``.

    Returns:
        The ``text`` of each entry in ``doc.paragraphs``, joined with newlines.
    """
    paragraphs = Document(file).paragraphs
    lines = [paragraph.text for paragraph in paragraphs]
    return "\n".join(lines)

def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, each page followed by a newline.

    Args:
        file: Path or file-like object passed straight to ``pdfplumber.open``.

    Returns:
        A single string: the extracted text of each page in order, with a
        trailing ``"\\n"`` after every page. Pages that yield no text
        contribute just the newline.
    """
    with pdfplumber.open(file) as pdf:
        # extract_text() can return None for pages without a text layer
        # (e.g. scanned pages, depending on the pdfplumber version); treat
        # that as empty text instead of crashing on `None + "\n"`.
        page_chunks = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    # Join once rather than growing a string page by page.
    return "".join(page_chunks)

def extract_text_from_image(file):
    """Run OCR on an image and return the recognised text.

    Args:
        file: Path or file-like object passed straight to ``Image.open``.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the opened image.
    """
    return pytesseract.image_to_string(Image.open(file))

def extract_text_from_txt(file):
    """Read a plain-text file object and return its content as ``str``.

    Args:
        file: File-like object with a ``read()`` method. Binary streams are
            decoded as UTF-8; text streams are returned unchanged.

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is dropped, and
        bytes that are not valid UTF-8 become U+FFFD instead of raising
        ``UnicodeDecodeError``.
    """
    raw = file.read()
    if isinstance(raw, str):
        # Already decoded (stream opened in text mode): nothing to do.
        return raw
    # 'utf-8-sig' behaves like 'utf-8' but also strips a BOM, which files saved
    # by some Windows editors carry; errors='replace' keeps one stray byte from
    # aborting the whole analysis.
    return raw.decode('utf-8-sig', errors='replace')

def extract_data(file):
    """Load an uploaded file according to its extension.

    Args:
        file: File-like object exposing a ``name`` attribute; the extension of
            that name selects the loader.

    Returns:
        A ``pandas.DataFrame`` for ``.csv`` and ``.xlsx`` files, the extracted
        text (``str``) for ``.txt``, ``.docx``, ``.pdf`` and image files, or
        the string ``"Unsupported file format"`` for any other extension.
    """
    # Compare against a lower-cased name so 'DATA.CSV' or 'Report.PDF' are
    # recognised, consistently with how image extensions were already handled.
    filename = file.name.lower()
    if filename.endswith('.csv'):
        return pd.read_csv(file)
    elif filename.endswith('.xlsx'):
        return pd.read_excel(file)
    elif filename.endswith('.txt'):
        return extract_text_from_txt(file)
    elif filename.endswith('.docx'):
        return extract_text_from_docx(file)
    elif filename.endswith('.pdf'):
        return extract_text_from_pdf(file)
    elif filename.endswith(('.png', '.jpg', '.jpeg')):
        return extract_text_from_image(file)
    else:
        return "Unsupported file format"

In [20]:
def ask_llama(prompt):
    """Send a prompt to the Together completion endpoint and return the reply.

    Args:
        prompt: Text passed as the ``prompt`` argument of
            ``together.Complete.create``.

    Returns:
        The ``text`` of the first choice in the response, with leading and
        trailing whitespace stripped.
    """
    # Sampling settings are kept together so they are easy to spot and tune.
    sampling_options = {
        "max_tokens": 512,
        "temperature": 0.7,
        "top_k": 50,
        "top_p": 0.95,
        "repetition_penalty": 1.1,
    }
    response = together.Complete.create(
        model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        prompt=prompt,
        **sampling_options,
    )
    first_choice = response['output']['choices'][0]
    return first_choice['text'].strip()

In [21]:
def generate_summary_and_suggestions(df):
    """Build a textual summary of a DataFrame and ask the LLM for analysis ideas.

    Args:
        df: The DataFrame to describe.

    Returns:
        A ``(summary, insights)`` tuple: ``summary`` contains the shape, the
        ``head()`` rows and the ``describe()`` output; ``insights`` is the
        string returned by ``ask_llama`` for a prompt built from the
        ``head()`` rows.
    """
    preview = df.head()
    summary = "".join([
        f"Shape: {df.shape}\n\n",
        f"Head:\n{preview}\n\n",
        f"Description:\n{df.describe()}\n",
    ])
    # Only the first rows are sent to the model, not the whole dataset.
    prompt = (
        "You are a data analyst. A dataset has this structure:\n"
        f"{preview.to_string()}\n\n\n"
        "Based on this, what key insights, patterns, or issues can be explored?\n"
        "Suggest relevant visualizations and questions to ask."
    )
    return summary, ask_llama(prompt)

def plot_graph(df, x, y, kind='bar'):
    """Draw a seaborn bar, scatter or line plot of ``y`` against ``x``.

    Args:
        df: Data passed to seaborn as ``data``.
        x: Name used for the x axis.
        y: Name used for the y axis.
        kind: One of ``'bar'``, ``'scatter'`` or ``'line'``.

    Returns:
        None. The figure is shown with ``plt.show()``. For any other ``kind``
        a message is printed and nothing is drawn.
    """
    # Validate before touching matplotlib: creating the figure first would leave
    # an empty, never-shown figure behind for every unsupported request.
    if kind not in ('bar', 'scatter', 'line'):
        print("Unsupported plot type")
        return

    plotters = {
        'bar': sns.barplot,
        'scatter': sns.scatterplot,
        'line': sns.lineplot,
    }
    plt.figure(figsize=(10, 6))
    plotters[kind](x=x, y=y, data=df)
    plt.title(f"{kind.capitalize()} Plot of {y} vs {x}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [22]:
def analyze_file(file):
    """Load an uploaded file and print an LLM-assisted analysis of it.

    Tabular files are summarised and stored in the global ``df`` for later
    cells; text-like files are previewed and summarised by the LLM.

    Args:
        file: File-like object with a ``name`` attribute, as expected by
            ``extract_data``.

    Returns:
        None. All results are printed.
    """
    data = extract_data(file)

    if isinstance(data, pd.DataFrame):
        print("DataFrame Loaded. Here's a quick summary:")
        summary, suggestions = generate_summary_and_suggestions(data)
        print(summary)
        print("\nLLM Suggestions and Insights:")
        print(suggestions)
        # Expose the loaded frame to later notebook cells under the name `df`.
        globals()['df'] = data
        return

    # extract_data reports an unknown extension with this exact string; without
    # the check it would be treated as document text and sent to the LLM.
    if not isinstance(data, str) or data == "Unsupported file format":
        print("Unable to analyze this file.")
        return

    # Nothing to summarise (e.g. no text was extracted): skip the LLM call.
    if not data.strip():
        print("No text could be extracted from this file.")
        return

    print("Text Extracted:")
    print(data[:1000])  # preview only; the full text goes to the model
    response = ask_llama(f"Read this text and provide a summary and key points:\n\n{data}")
    print("\nLLM Summary:")
    print(response)

In [23]:
from IPython.display import display
import ipywidgets as widgets

# Single-file upload widget; accept='' applies no file-type filter in the picker.
# The next cell reads the chosen file from `upload.value`, so pick a file before running it.
upload = widgets.FileUpload(accept='', multiple=False)
display(upload)

FileUpload(value={}, description='Upload')

In [24]:
# FileUpload.value is a dict keyed by filename in ipywidgets 7 and a tuple of
# per-file records in ipywidgets 8; normalise both into a list of records.
uploads = upload.value
upload_entries = list(uploads.values()) if isinstance(uploads, dict) else list(uploads)
if not upload_entries:
    # Clear message instead of a bare IndexError when the cell runs too early.
    raise ValueError(
        "No file uploaded yet: choose a file with the Upload button above, "
        "then re-run this cell."
    )
uploaded_file = upload_entries[0]
# ipywidgets 7 nests the filename under 'metadata'; ipywidgets 8 keeps it at the top level.
if 'metadata' in uploaded_file:
    uploaded_name = uploaded_file['metadata']['name']
else:
    uploaded_name = uploaded_file['name']
# bytes() accepts both the bytes (v7) and memoryview (v8) forms of 'content'.
file_content = io.BytesIO(bytes(uploaded_file['content']))
# extract_data dispatches on the `name` attribute, so attach the original filename.
file_content.name = uploaded_name
analyze_file(file_content)

Text Extracted:
MYNTRA DATA ANALYST INTERVIEW
QUESTIONS
YOE: 2-5
CTC: 25+
Q1. Find the second highest salary without using LIMIT,
OFFSET, or TOP
Input Table: Employees
EmpID Name Salary
1 Alice 60000
2 Bob 80000
3 Charlie 75000
4 David 80000
5 Eve 90000
SQL Query:
SELECT MAX(Salary) AS Second_Highest_Salary
FROM Employees
WHERE Salary < (
SELECT MAX(Salary) FROM Employees
);
Expected Output:
Second_Highest_Salary
80000
Q2. Given a table of orders, write a query to find the running
total (cumulative sum) of revenue for each day
Input Table: Orders
OrderID OrderDate Revenue
101 2024-01-01 100
102 2024-01-01 200
103 2024-01-02 150
104 2024-01-03 300
105 2024-01-03 100
SQL Query:
SELECT
OrderDate,
SUM(Revenue) AS Daily_Revenue,
SUM(SUM(Revenue)) OVER (ORDER BY OrderDate) AS Running_Total_Revenue
FROM Orders
GROUP BY OrderDate
ORDER BY OrderDate;
Expected Output:
OrderDate Daily_Revenue Running_Total_Revenue
2024-01-01 300 300
2024-01-02 150 450
2024-01-03 400 850
Q3. Write an SQL query to 

  response = together.Complete.create(


AuthenticationError: Error code: 401 - {"message": "Invalid API key provided. You can find your API key at https://api.together.xyz/settings/api-keys.", "type_": "invalid_request_error", "code": "invalid_api_key"}

In [25]:
def ask_about_dataframe(question, df):
    """Ask the LLM a free-form question about a DataFrame.

    Args:
        question: The question text inserted into the prompt.
        df: DataFrame whose ``head()`` rows are shown to the model.

    Returns:
        The string returned by ``ask_llama`` for the constructed prompt.
    """
    # Only the first rows are included in the prompt, not the full frame.
    table_preview = df.head().to_string()
    prompt = (
        f"You are given this DataFrame:\n{table_preview}"
        f"\n\nQuestion: {question}\nAnswer:"
    )
    return ask_llama(prompt)