In [1]:
import pandas as pd
import numpy as np
import json
import os

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the notebook configuration (folder paths) from config.json.
# A context manager is used so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open("config.json", encoding="utf-8") as config_file:
    configs = json.load(config_file)
configs

{'row_read_folder_path': 'data/versions/18/csv',
 'row_write_folder_path': 'data/versions/18/tables'}

In [2]:
# List the files available in the write folder.
# os.listdir returns entries in arbitrary order, so sort them to get a
# stable listing; the file names embed a YYYYMMDD-HHMMSS timestamp, so the
# lexicographic order is also the chronological order.
file_names = sorted(os.listdir(configs['row_write_folder_path']))
file_names

['market_prices_20251108-130512.csv',
 'market_prices_20251108-152423.csv',
 'market_prices_20251108-152737.csv']

In [3]:
# Pick the most recent snapshot file and build its full path.
# os.listdir gives no ordering guarantee, so taking [-1] of the raw listing
# may select an arbitrary file. The names carry a YYYYMMDD-HHMMSS suffix,
# so sorting them makes the last element the latest snapshot.
# An empty folder still raises IndexError, as before.
file_name = sorted(os.listdir(configs['row_write_folder_path']))[-1]
full_data_path = os.path.join(configs['row_write_folder_path'], file_name)

In [4]:
# Read the selected snapshot into memory and preview the first rows.
# low_memory=False makes pandas parse the file in a single pass instead of
# in internal chunks (see the pandas read_csv documentation).
df = pd.read_csv(
    full_data_path,
    low_memory=False,
)
df.head(n=5)

Unnamed: 0,State,District,Market,Commodity,Variety,Grade,Arrival_Date,Min_Price,Max_Price,Modal_Price,Commodity_Code
0,Andhra Pradesh,Anantapur,Anantapur,Ground Nut Seed,Ground Nut Seed,Medium,2011-01-01,3800.0,3900.0,3850.0,268
1,Andhra Pradesh,Anantapur,Dharmavaram,Paddy (Dhan)(Common),B P T,Medium,2011-01-01,970.0,1030.0,1000.0,2
2,Andhra Pradesh,Anantapur,Guntakal,Ground Nut Seed,Ground Nut Seed,Medium,2011-01-01,4300.0,4500.0,4400.0,268
3,Andhra Pradesh,Anantapur,Hindupur,Dry Chillies,2nd Sort,Medium,2011-01-01,2400.0,4000.0,3400.0,132
4,Andhra Pradesh,Anantapur,Hindupur,Tamarind Fruit,Flower A/c,Medium,2011-01-01,3600.0,4900.0,4250.0,261


In [5]:
# Show the column labels of the loaded frame.
df.columns

Index(['State', 'District', 'Market', 'Commodity', 'Variety', 'Grade',
       'Arrival_Date', 'Min_Price', 'Max_Price', 'Modal_Price',
       'Commodity_Code'],
      dtype='object')

In [6]:
# Print the frame summary: row count, per-column dtype and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20021522 entries, 0 to 20021521
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   State           object 
 1   District        object 
 2   Market          object 
 3   Commodity       object 
 4   Variety         object 
 5   Grade           object 
 6   Arrival_Date    object 
 7   Min_Price       float64
 8   Max_Price       float64
 9   Modal_Price     float64
 10  Commodity_Code  int64  
dtypes: float64(3), int64(1), object(7)
memory usage: 1.6+ GB


In [7]:
# Number of rows per arrival date, ordered by date.
# Arrival_Date is stored as YYYY-MM-DD strings, so sorting the index
# in ascending (default) order is chronological.
(
    df['Arrival_Date']
    .value_counts()
    .sort_index()
)

Arrival_Date
2011-01-01    2671
2011-01-02    1345
2011-01-03    2976
2011-01-04    2871
2011-01-05    3056
              ... 
2025-11-02    1734
2025-11-03    2384
2025-11-04    4114
2025-11-05     700
2025-11-06       5
Name: count, Length: 5424, dtype: int64

In [8]:
# Split the column labels by dtype: object columns are treated as
# categorical, everything else as numeric.
# NOTE(review): with this rule 'Arrival_Date' (a date string) ends up in
# cat_cols and 'Commodity_Code' (an integer code) ends up in num_cols;
# confirm that is intended before imputing/scaling/encoding them.
cat_cols = df.select_dtypes(include='object').columns
num_cols = df.select_dtypes(exclude='object').columns
for label, cols in (("Cat Columns: ", cat_cols), ("Num Columns: ", num_cols)):
    print(label, cols)

Cat Columns:  Index(['State', 'District', 'Market', 'Commodity', 'Variety', 'Grade',
       'Arrival_Date'],
      dtype='object')
Num Columns:  Index(['Min_Price', 'Max_Price', 'Modal_Price', 'Commodity_Code'], dtype='object')


In [18]:
from src.data_processing import EncodelData, ScaleData, Imputer
from sklearn.pipeline import Pipeline


# Assemble the preprocessing pipeline; steps run in the order listed:
# imputation first, then numeric scaling, then categorical encoding.
pipeline_steps = [
    (
        "imputer",
        Imputer(cat_cols=cat_cols, num_cols=num_cols, num_method="mean"),
    ),
    (
        "scaler",
        ScaleData(num_cols=num_cols, method="minmax"),
    ),
    (
        "encoder",
        EncodelData(cat_cols=cat_cols, method="label"),
    ),
]
processing_pipeline_ = Pipeline(steps=pipeline_steps)
processing_pipeline_

0,1,2
,steps,"[('imputer', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False


In [19]:
# Fit the pipeline on the first 100,000 rows and preview the result.
# This cell previously failed with
#   FileNotFoundError: 'processing_metrics/scaler.pkl'
# NOTE(review): presumably the scaler step persists its fitted state to that
# path and the 'processing_metrics' folder did not exist; confirm against
# src.data_processing. Creating the folder up front is harmless if it
# already exists.
os.makedirs("processing_metrics", exist_ok=True)
processed_df = processing_pipeline_.fit_transform(df.head(100000))
processed_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'processing_metrics/scaler.pkl'

In [None]:
# Take the first ten rows (by position) as a small sample and display it.
test = df.iloc[:10]
test