# Vegetable Prices Analysis

Explore and visualize the Kaggle vegetable prices dataset (`ksamiksha19/vegetable-prices`), which records prices for ten vegetables by date.

In [1]:
import kagglehub
import pandas as pd
import zipfile
import os
import tempfile
import shutil

# 1. Download the latest version of the dataset.
#    The returned path is handled below as either a ZIP archive or a directory.
zip_or_dir = kagglehub.dataset_download("ksamiksha19/vegetable-prices")
print("Downloaded to:", zip_or_dir)

# 2. Prepare a temp directory; it is only filled when an archive must be unpacked
workdir = tempfile.mkdtemp(prefix="veg_prices_")

try:
    # 3. If it's a ZIP, extract it into the temp directory; otherwise read the
    #    downloaded directory in place. Copying entries one by one with
    #    shutil.copy fails as soon as the dataset contains a subfolder.
    if zipfile.is_zipfile(zip_or_dir):
        with zipfile.ZipFile(zip_or_dir, "r") as z:
            z.extractall(workdir)
        search_root = workdir
    else:
        search_root = zip_or_dir

    # 4. Find all CSV files, including those in nested folders.
    #    Sorting makes the concatenation order (and so the row order of `df`)
    #    reproducible; directory listing order is otherwise arbitrary.
    csv_files = sorted(
        os.path.join(dirpath, fname)
        for dirpath, subdirs, fnames in os.walk(search_root)
        for fname in fnames
        if fname.lower().endswith(".csv")
    )
    if not csv_files:
        raise FileNotFoundError("No CSV files found in the dataset.")

    # 5. Read and concatenate into one DataFrame
    dfs = [pd.read_csv(f) for f in csv_files]
    df = pd.concat(dfs, ignore_index=True)
    print(f"Loaded {len(dfs)} file(s) into a single DataFrame with shape {df.shape}.")

    # 6. (Optional) quick preview
    print(df.head())

finally:
    # Clean up the temp folder; the data is already in memory in `df`
    shutil.rmtree(workdir)

Downloaded to: /Users/thabangmathebula/.cache/kagglehub/datasets/ksamiksha19/vegetable-prices/versions/3
Loaded 1 file(s) into a single DataFrame with shape (287, 11).
  Price Dates  Bhindi (Ladies finger)  Tomato  Onion  Potato  Brinjal  Garlic  \
0  01-01-2023                    35.0      18   22.0      20       30      50   
1  02-01-2023                    35.0      16   22.0      20       30      55   
2  03-01-2023                    35.0      16   21.0      20       30      55   
3  04-01-2023                    30.0      16   21.0      22       25      55   
4  08-01-2023                    35.0      16   20.0      21       25      55   

   Peas  Methi  Green Chilli  Elephant Yam (Suran)  
0    25      8          45.0                    25  
1    25      7          40.0                    25  
2    25      7          40.0                    25  
3    25      7          40.0                    25  
4    22      6          35.0                    25  


## Preliminary Data Inspection

In [2]:
# Print the frame's schema: column names, non-null counts, dtypes, memory usage
df.info()

# Build the summary-statistics table for the numeric columns and leave it as
# the cell's final expression so the notebook renders it
numeric_summary = df.describe()
numeric_summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Price Dates             287 non-null    object 
 1   Bhindi (Ladies finger)  287 non-null    float64
 2   Tomato                  287 non-null    int64  
 3   Onion                   287 non-null    float64
 4   Potato                  287 non-null    int64  
 5   Brinjal                 287 non-null    int64  
 6   Garlic                  287 non-null    int64  
 7   Peas                    287 non-null    int64  
 8   Methi                   287 non-null    int64  
 9   Green Chilli            287 non-null    float64
 10  Elephant Yam (Suran)    287 non-null    int64  
dtypes: float64(3), int64(7), object(1)
memory usage: 24.8+ KB


Unnamed: 0,Bhindi (Ladies finger),Tomato,Onion,Potato,Brinjal,Garlic,Peas,Methi,Green Chilli,Elephant Yam (Suran)
count,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0,287.0
mean,29.444251,16.006969,20.649826,18.585366,31.655052,133.101045,66.658537,20.383275,44.122404,28.797909
std,8.124815,0.118056,11.711204,2.726238,11.725421,60.078331,33.302415,117.428417,12.79659,6.607973
min,17.0,16.0,8.0,12.0,14.0,50.0,22.0,5.0,0.13,12.0
25%,22.0,16.0,12.0,16.0,25.0,85.0,40.0,8.0,35.0,25.0
50%,27.5,16.0,16.0,20.0,30.0,120.0,60.0,12.0,40.0,30.0
75%,33.0,16.0,25.0,20.0,35.0,165.0,80.0,16.0,50.0,30.0
max,60.0,18.0,57.0,24.0,80.0,290.0,150.0,2000.0,90.0,50.0
