In [6]:
import pandas as pd
import numpy as np
import json
import os

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the notebook's folder configuration. A context manager is used so the
# file handle is closed deterministically instead of being left to the GC,
# and the encoding is pinned because JSON files are UTF-8 by convention.
with open("config.json", encoding="utf-8") as config_file:
    configs = json.load(config_file)
configs

{'row_read_folder_path': 'data/versions/18/csv',
 'row_write_folder_path': 'data/versions/18/tables',
 'processed_data_path': 'data/versions/18/tables/processed'}

In [7]:
# os.listdir() returns entries in arbitrary order; sort so the listing is
# deterministic (the timestamped file names then appear oldest to newest).
file_names = sorted(os.listdir(configs['row_write_folder_path']))
file_names

['market_prices_20251108-130512.csv',
 'market_prices_20251108-152423.csv',
 'market_prices_20251108-152737.csv',
 'market_prices_20251109-103741.csv',
 'market_prices_20251109-111758.csv']

In [8]:
# Pick the most recent export. os.listdir() gives no ordering guarantee, so
# the names are sorted first (the YYYYMMDD-HHMMSS suffix sorts chronologically).
# Only .csv entries are considered so that a sub-folder such as the one named
# by configs['processed_data_path'] can never be selected by mistake.
csv_file_names = sorted(
    name
    for name in os.listdir(configs['row_write_folder_path'])
    if name.endswith('.csv')
)
if not csv_file_names:
    raise FileNotFoundError(
        f"No .csv files found in {configs['row_write_folder_path']!r}"
    )
file_name = csv_file_names[-1]
full_data_path = os.path.join(configs['row_write_folder_path'], file_name)

In [9]:
# Read the selected CSV export into memory. low_memory=False makes pandas
# parse the file in a single pass rather than in internal chunks, which
# avoids mixed-dtype inference on large files at the cost of more memory.
df = pd.read_csv(full_data_path, low_memory=False)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,State,District,Market,Commodity,Variety,Grade,Arrival_Date,Min_Price,Max_Price,Modal_Price,Commodity_Code
0,Andhra Pradesh,Chittor,Chittoor,Gur (Jaggery),NO 1,FAQ,2022-01-01,4000.0,4400.0,4000.0,74
1,Andhra Pradesh,Chittor,Palamaner,Beetroot,Beetroot,FAQ,2022-01-01,3000.0,5000.0,4000.0,157
2,Andhra Pradesh,Chittor,Palamaner,Bottle gourd,Bottle Gourd,FAQ,2022-01-01,900.0,1500.0,1200.0,82
3,Andhra Pradesh,Chittor,Palamaner,Cauliflower,Cauliflower,FAQ,2022-01-01,1800.0,3000.0,2400.0,34
4,Andhra Pradesh,Chittor,Palamaner,Green Chilli,Green Chilly,FAQ,2022-01-01,4000.0,6000.0,5000.0,87


In [10]:
# List the column labels of the loaded frame.
df.columns

Index(['State', 'District', 'Market', 'Commodity', 'Variety', 'Grade',
       'Arrival_Date', 'Min_Price', 'Max_Price', 'Modal_Price',
       'Commodity_Code'],
      dtype='object')

In [11]:
# Print the row count, per-column dtypes and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6697382 entries, 0 to 6697381
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   State           object 
 1   District        object 
 2   Market          object 
 3   Commodity       object 
 4   Variety         object 
 5   Grade           object 
 6   Arrival_Date    object 
 7   Min_Price       float64
 8   Max_Price       float64
 9   Modal_Price     float64
 10  Commodity_Code  int64  
dtypes: float64(3), int64(1), object(7)
memory usage: 562.1+ MB


In [12]:
# Number of rows recorded per arrival date, ordered by date (ascending is
# the default sort direction of sort_index).
(
    df['Arrival_Date']
    .value_counts()
    .sort_index()
)

Arrival_Date
2022-01-01    4000
2022-01-02    1938
2022-01-03    4857
2022-01-04    4736
2022-01-05    4624
              ... 
2025-11-02    1734
2025-11-03    2384
2025-11-04    4114
2025-11-05     700
2025-11-06       5
Name: count, Length: 1406, dtype: int64

In [13]:
# Split the column labels by dtype: object-typed columns are treated as
# categorical, everything else as numeric.
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(exclude=['object']).columns
for label, columns in (("Cat Columns: ", cat_cols), ("Num Columns: ", num_cols)):
    print(label, columns)

Cat Columns:  Index(['State', 'District', 'Market', 'Commodity', 'Variety', 'Grade',
       'Arrival_Date'],
      dtype='object')
Num Columns:  Index(['Min_Price', 'Max_Price', 'Modal_Price', 'Commodity_Code'], dtype='object')


In [14]:
from src.data_processing import EncodelData, ScaleData, Imputer
from sklearn.pipeline import Pipeline


# Build the three processing stages individually, then chain them in order:
# impute -> scale -> encode.
imputer_step = Imputer(cat_cols=cat_cols, num_cols=num_cols, num_method="mean")
scaler_step = ScaleData(num_cols=num_cols, method="minmax")
encoder_step = EncodelData(cat_cols=cat_cols, method="label")

pipeline_steps = [
    ("imputer", imputer_step),
    ("scaler", scaler_step),
    ("encoder", encoder_step),
]
processing_pipeline_ = Pipeline(steps=pipeline_steps)
processing_pipeline_

0,1,2
,steps,"[('imputer', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False


In [15]:
# Fit and apply the pipeline on the first rows of the frame only.
fit_row_limit = 100000
processed_df = processing_pipeline_.fit_transform(df.iloc[:fit_row_limit])
processed_df.head()

Unnamed: 0,State,District,Market,Commodity,Variety,Grade,Arrival_Date,Min_Price,Max_Price,Modal_Price,Commodity_Code
0,1,96,446,113,363,0,0,0.005,0.0055,0.004999,0.176755
1,1,96,1462,24,63,0,0,0.00375,0.00625,0.004999,0.377724
2,1,96,1462,34,88,0,0,0.001125,0.001875,0.001499,0.196126
3,1,96,1462,48,104,0,0,0.00225,0.00375,0.002999,0.079903
4,1,96,1462,101,211,0,0,0.005,0.0075,0.006249,0.208232


In [22]:
import joblib
import os
# Load the artifact paths (saved encoders / scaler) used by the inverse
# transforms. The context manager closes the file handle deterministically.
with open("src/processing_config.json", encoding="utf-8") as processing_config_file:
    processing_configs = json.load(processing_config_file)

class InverseDataProcessing:
    """
    Undo the transformations applied by the data-processing pipeline.

    - Encoder inversion (``'label'`` or ``'onehot'``)
    - Scaler inversion (whichever fitted scaler is stored at ``scaler_file_path``)

    The fitted encoder/scaler objects are loaded with ``joblib`` from the paths
    listed in the module-level ``processing_configs`` mapping.
    NOTE(review): ``joblib.load`` unpickles arbitrary objects; only point the
    config at artifacts produced by this project.

    Args:
        cat_cols: Labels of the categorical (encoded) columns.
        num_cols: Labels of the numeric (scaled) columns.
        scale_method: Name of the scaling method. Stored on the instance but
            not read by any method of this class.
        encoder_method: ``'label'`` or ``'onehot'``; selects the branch taken
            by ``inverseEncoder``.
    """

    def __init__(self, cat_cols:list, num_cols:list, scale_method:str, encoder_method:str):
        self.cat_cols = cat_cols
        self.num_cols = num_cols
        self.encoder_method = encoder_method
        self.scale_method = scale_method


    def inverseEncoder(self, X:pd.DataFrame):
        """
        Args:
            X: pd.DataFrame holding encoded categorical columns.
        Description: Inverse the categorical encoding. ``X`` is not modified.

        Returns:
            For ``'label'``: a copy of ``X`` with every column that has a saved
            ``<column>.pkl`` encoder decoded back to its original labels.
            For ``'onehot'``: whatever the saved encoder's ``inverse_transform``
            returns for ``X[self.cat_cols]``.

        Raises:
            ValueError: if ``encoder_method`` is neither ``'label'`` nor ``'onehot'``.
        """
        X_copy = X.copy()
        if self.encoder_method == 'label':
            encoder_dir = processing_configs['label_encoder_folder_path']
            for file_name in sorted(os.listdir(encoder_dir)):
                # One encoder per column, stored as "<column>.pkl". splitext strips
                # only the trailing extension (str.replace would also remove ".pkl"
                # occurring elsewhere in the name).
                column, extension = os.path.splitext(file_name)
                # Ignore stray files and encoders for columns that are not part of
                # this frame instead of failing with a KeyError.
                if extension != ".pkl" or column not in X_copy.columns:
                    continue
                encoder = joblib.load(os.path.join(encoder_dir, file_name))
                X_copy[column] = encoder.inverse_transform(X_copy[column].astype(int))

            return X_copy

        elif self.encoder_method == 'onehot':
            encoder = joblib.load(processing_configs['one_hot_encoder_file_path'])
            # NOTE(review): this selects the original categorical column labels from
            # the encoded frame and returns the encoder's raw output rather than a
            # DataFrame. Confirm how the one-hot columns are named by the
            # encoding step before relying on this branch.
            X_copy = encoder.inverse_transform(X_copy[self.cat_cols])
            return X_copy

        else:
            raise ValueError("Choose either 'onehot' or 'label'")


    def inverseScale(self, X:pd.DataFrame):
        """
        Args:
            X: pd.DataFrame containing the scaled numeric columns
        Description: Inverse the scaled data. Only the columns listed in
            ``num_cols`` are passed to the scaler; all other columns (for example
            already-decoded categorical columns) are carried over unchanged.
            ``X`` is not modified.

        Returns:
            pd.DataFrame
        """
        X_copy = X.copy()
        numeric_columns = list(self.num_cols)
        if not numeric_columns:
            # Nothing to un-scale.
            return X_copy

        scaler = joblib.load(processing_configs['scaler_file_path'])
        # Assumes the scaler was fitted on exactly ``num_cols`` in this order
        # (TODO confirm against the scaling step). Writing the result back into the
        # frame keeps the return type a DataFrame, as documented, instead of the
        # bare ndarray that ``inverse_transform`` produces.
        X_copy[numeric_columns] = scaler.inverse_transform(X_copy[numeric_columns])
        return X_copy





In [None]:
# Round-trip check on a five-row sample: decode the categorical columns
# first, then undo the numeric scaling.
inverse = InverseDataProcessing(
    cat_cols=cat_cols,
    num_cols=num_cols,
    scale_method='minmax',
    encoder_method='label',
)
inverse_encoded_data = inverse.inverseEncoder(processed_df.head(5))
inverse_data = inverse.inverseScale(inverse_encoded_data)
inverse_data.head()

In [18]:
# Show the first rows of the original, unprocessed frame
# (presumably for comparison with the inverse-transformed sample above).
df.head()

Unnamed: 0,State,District,Market,Commodity,Variety,Grade,Arrival_Date,Min_Price,Max_Price,Modal_Price,Commodity_Code
0,Andhra Pradesh,Chittor,Chittoor,Gur (Jaggery),NO 1,FAQ,2022-01-01,4000.0,4400.0,4000.0,74
1,Andhra Pradesh,Chittor,Palamaner,Beetroot,Beetroot,FAQ,2022-01-01,3000.0,5000.0,4000.0,157
2,Andhra Pradesh,Chittor,Palamaner,Bottle gourd,Bottle Gourd,FAQ,2022-01-01,900.0,1500.0,1200.0,82
3,Andhra Pradesh,Chittor,Palamaner,Cauliflower,Cauliflower,FAQ,2022-01-01,1800.0,3000.0,2400.0,34
4,Andhra Pradesh,Chittor,Palamaner,Green Chilli,Green Chilly,FAQ,2022-01-01,4000.0,6000.0,5000.0,87
