#### Importing the Necessary Libraries

In [1]:
# Import libraries for data analysis and manipulation.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import zscore
sns.set_theme(color_codes=True)
pd.set_option('display.max_columns', None)

# Import necessary libraries for model selection.
from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, KFold
#from sklearn.ensemble import StackingRegressor, BaggingRegressor, AdaBoostRegressor, VotingRegressor
#from sklearn.tree import DecisionTreeRegressor
#from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
import warnings

# Filter the FutureWarning.
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

#### Loading the Dataset

In [2]:
#Load the dataset and display the first five records.
dairy_df = pd.read_csv('dairy_dataset.csv')
dairy_df.head()

Unnamed: 0,Location,Total Land Area (acres),Number of Cows,Farm Size,Date,Product ID,Product Name,Brand,Quantity (liters/kg),Price per Unit,Total Value,Shelf Life (days),Storage Condition,Production Date,Expiration Date,Quantity Sold (liters/kg),Price per Unit (sold),Approx. Total Revenue(INR),Customer Location,Sales Channel,Quantity in Stock (liters/kg),Minimum Stock Threshold (liters/kg),Reorder Quantity (liters/kg)
0,Telangana,310.84,96,Medium,2022-02-17,5,Ice Cream,Dodla Dairy,222.4,85.72,19064.128,25,Frozen,2021-12-27,2022-01-21,7,82.24,575.68,Madhya Pradesh,Wholesale,215,19.55,64.03
1,Uttar Pradesh,19.19,44,Large,2021-12-01,1,Milk,Amul,687.48,42.61,29293.5228,22,Tetra Pack,2021-10-03,2021-10-25,558,39.24,21895.92,Kerala,Wholesale,129,43.17,181.1
2,Tamil Nadu,581.69,24,Medium,2022-02-28,4,Yogurt,Dodla Dairy,503.48,36.5,18377.02,30,Refrigerated,2022-01-14,2022-02-13,256,33.81,8655.36,Madhya Pradesh,Online,247,15.1,140.83
3,Telangana,908.0,89,Small,2019-06-09,3,Cheese,Britannia Industries,823.36,26.52,21835.5072,72,Frozen,2019-05-15,2019-07-26,601,28.92,17380.92,Rajasthan,Online,222,74.5,57.68
4,Maharashtra,861.95,21,Medium,2020-12-14,8,Buttermilk,Mother Dairy,147.77,83.85,12390.5145,11,Refrigerated,2020-10-17,2020-10-28,145,83.07,12045.15,Jharkhand,Retail,2,76.02,33.4


In [3]:
dairy_df.drop_duplicates()

Unnamed: 0,Location,Total Land Area (acres),Number of Cows,Farm Size,Date,Product ID,Product Name,Brand,Quantity (liters/kg),Price per Unit,Total Value,Shelf Life (days),Storage Condition,Production Date,Expiration Date,Quantity Sold (liters/kg),Price per Unit (sold),Approx. Total Revenue(INR),Customer Location,Sales Channel,Quantity in Stock (liters/kg),Minimum Stock Threshold (liters/kg),Reorder Quantity (liters/kg)
0,Telangana,310.84,96,Medium,2022-02-17,5,Ice Cream,Dodla Dairy,222.40,85.72,19064.1280,25,Frozen,2021-12-27,2022-01-21,7,82.24,575.68,Madhya Pradesh,Wholesale,215,19.55,64.03
1,Uttar Pradesh,19.19,44,Large,2021-12-01,1,Milk,Amul,687.48,42.61,29293.5228,22,Tetra Pack,2021-10-03,2021-10-25,558,39.24,21895.92,Kerala,Wholesale,129,43.17,181.10
2,Tamil Nadu,581.69,24,Medium,2022-02-28,4,Yogurt,Dodla Dairy,503.48,36.50,18377.0200,30,Refrigerated,2022-01-14,2022-02-13,256,33.81,8655.36,Madhya Pradesh,Online,247,15.10,140.83
3,Telangana,908.00,89,Small,2019-06-09,3,Cheese,Britannia Industries,823.36,26.52,21835.5072,72,Frozen,2019-05-15,2019-07-26,601,28.92,17380.92,Rajasthan,Online,222,74.50,57.68
4,Maharashtra,861.95,21,Medium,2020-12-14,8,Buttermilk,Mother Dairy,147.77,83.85,12390.5145,11,Refrigerated,2020-10-17,2020-10-28,145,83.07,12045.15,Jharkhand,Retail,2,76.02,33.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4320,Delhi,748.71,89,Medium,2022-02-24,6,Curd,Mother Dairy,554.90,88.45,49080.9050,5,Refrigerated,2022-02-16,2022-02-21,352,87.20,30694.40,Uttar Pradesh,Online,202,98.07,33.53
4321,Jharkhand,385.91,29,Large,2022-05-14,4,Yogurt,Palle2patnam,818.33,55.35,45294.5655,23,Refrigerated,2022-03-22,2022-04-14,68,58.39,3970.52,Kerala,Retail,750,87.41,114.37
4322,Chandigarh,311.54,65,Small,2020-01-05,6,Curd,Mother Dairy,583.56,92.61,54043.4916,7,Refrigerated,2020-01-04,2020-01-11,141,89.46,12613.86,Haryana,Retail,442,33.47,153.66
4323,Maharashtra,890.55,90,Small,2022-10-25,6,Curd,Raj,3.10,15.30,47.4300,7,Refrigerated,2022-10-02,2022-10-09,2,10.56,21.12,Jharkhand,Wholesale,1,58.25,160.84


In [4]:
processed_data = dairy_df.fillna(0).replace([np.inf, -np.inf], 0)
print(processed_data)

           Location  Total Land Area (acres)  Number of Cows Farm Size  \
0         Telangana                   310.84              96    Medium   
1     Uttar Pradesh                    19.19              44     Large   
2        Tamil Nadu                   581.69              24    Medium   
3         Telangana                   908.00              89     Small   
4       Maharashtra                   861.95              21    Medium   
...             ...                      ...             ...       ...   
4320          Delhi                   748.71              89    Medium   
4321      Jharkhand                   385.91              29     Large   
4322     Chandigarh                   311.54              65     Small   
4323    Maharashtra                   890.55              90     Small   
4324      Rajasthan                   492.86              58     Large   

            Date  Product ID Product Name                 Brand  \
0     2022-02-17           5    Ice Cream   

In [5]:
processed_data.drop(['Product ID','Sales Channel','Price per Unit','Total Value','Price per Unit (sold)','Approx. Total Revenue(INR)'], axis=1, inplace=True)
print(processed_data)

           Location  Total Land Area (acres)  Number of Cows Farm Size  \
0         Telangana                   310.84              96    Medium   
1     Uttar Pradesh                    19.19              44     Large   
2        Tamil Nadu                   581.69              24    Medium   
3         Telangana                   908.00              89     Small   
4       Maharashtra                   861.95              21    Medium   
...             ...                      ...             ...       ...   
4320          Delhi                   748.71              89    Medium   
4321      Jharkhand                   385.91              29     Large   
4322     Chandigarh                   311.54              65     Small   
4323    Maharashtra                   890.55              90     Small   
4324      Rajasthan                   492.86              58     Large   

            Date Product Name                 Brand  Quantity (liters/kg)  \
0     2022-02-17    Ice Cream     

In [10]:
# Identify outliers for each numeric variable using z-score
z_scores = zscore(processed_data[['Total Land Area (acres)', 'Shelf Life (days)','Quantity Sold (liters/kg)','Quantity in Stock (liters/kg)','Minimum Stock Threshold (liters/kg)','Reorder Quantity (liters/kg)']])
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
processed_data = processed_data[filtered_entries]

outliers_removed = len(processed_data) - len(processed_data[filtered_entries])
proportion_removed = outliers_removed / len(processed_data)
print(outliers_removed)
print(proportion_removed)

# Replace missing values with the mode for categorical variables
processed_data['Location'].fillna(processed_data['Location'].mode()[0], inplace=True)
processed_data['Farm Size'].fillna(processed_data['Farm Size'].mode()[0], inplace=True)
processed_data['Product Name'].fillna(processed_data['Product Name'].mode()[0], inplace=True)
processed_data['Brand'].fillna(processed_data['Brand'].mode()[0], inplace=True)
processed_data['Storage Condition'].fillna(processed_data['Storage Condition'].mode()[0], inplace=True)
processed_data['Customer Location'].fillna(processed_data['Customer Location'].mode()[0], inplace=True)

# Drop any remaining rows with missing values
processed_data.dropna(inplace=True)

# Replace income values with binary values (0 for '<=50K', 1 for '>50K')
# processed_data['income'] = processed_data['income'].replace({'<=50K': 0, '>50K': 1})

# Print the updated dataset
print(processed_data.head())

0
0.0
        Location  Total Land Area (acres)  Number of Cows Farm Size  \
0      Telangana                   310.84              96    Medium   
1  Uttar Pradesh                    19.19              44     Large   
2     Tamil Nadu                   581.69              24    Medium   
3      Telangana                   908.00              89     Small   
4    Maharashtra                   861.95              21    Medium   

         Date Product Name                 Brand  Quantity (liters/kg)  \
0  2022-02-17    Ice Cream           Dodla Dairy                222.40   
1  2021-12-01         Milk                  Amul                687.48   
2  2022-02-28       Yogurt           Dodla Dairy                503.48   
3  2019-06-09       Cheese  Britannia Industries                823.36   
4  2020-12-14   Buttermilk          Mother Dairy                147.77   

   Shelf Life (days) Storage Condition Production Date Expiration Date  \
0                 25            Frozen      2021

  outliers_removed = len(processed_data) - len(processed_data[filtered_entries])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data['Location'].fillna(processed_data['Location'].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data['Farm Size'].fillna(processed_data['Farm Size'].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data['Product Name'].fillna(processed_data['Product Name'].mode()[0], inplace=True)
A value is trying to be set

In [11]:
# Merge encoded categorical data with scaled numeric data
numeric_features = ['Total Land Area (acres)', 'Shelf Life (days)','Quantity Sold (liters/kg)','Quantity in Stock (liters/kg)','Minimum Stock Threshold (liters/kg)','Reorder Quantity (liters/kg)']
scaler = StandardScaler()
scaled_data = scaler.fit_transform(processed_data[numeric_features])

In [13]:
categorical_features = ['Location', 'Farm Size','Product Name', 'Brand', 'Storage Condition', 'Customer Location']
processed_data = pd.concat([processed_data[categorical_features], pd.DataFrame(scaled_data, columns=numeric_features)], axis=1)
print(processed_data)

           Location Farm Size Product Name                 Brand  \
0         Telangana    Medium    Ice Cream           Dodla Dairy   
1     Uttar Pradesh     Large         Milk                  Amul   
2        Tamil Nadu    Medium       Yogurt           Dodla Dairy   
3         Telangana     Small       Cheese  Britannia Industries   
4       Maharashtra    Medium   Buttermilk          Mother Dairy   
...             ...       ...          ...                   ...   
3904            NaN       NaN          NaN                   NaN   
3936            NaN       NaN          NaN                   NaN   
3962            NaN       NaN          NaN                   NaN   
3984            NaN       NaN          NaN                   NaN   
3988            NaN       NaN          NaN                   NaN   

     Storage Condition Customer Location  Total Land Area (acres)  \
0               Frozen    Madhya Pradesh                -0.679499   
1           Tetra Pack            Kerala     

In [15]:
# Perform PCA

pca = PCA(n_components=2)
pca_data = processed_data[numeric_features]
pca_data = pca_data.fillna(pca_data.mean()) # Fill missing values with mean
pca_data = pca.fit_transform(pca_data)
for i in range(pca.n_components_):
    processed_data[f'pca_{i+1}'] = pca_data[:, i]

In [18]:
# Encode categorical features
for feature in categorical_features:
    le = LabelEncoder()
    processed_data[feature] = le.fit_transform(processed_data[feature])