In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Preprocess the raw crop-production table and write a model-ready CSV:
#   1. load the raw CSV and drop rows containing any missing value,
#   2. label-encode the categorical columns (one fitted encoder per column),
#   3. min-max scale the numerical columns,
#   4. save the result next to the notebook.

# Load dataset and drop rows with missing values. Chaining (instead of
# `inplace=True`) keeps the cell idempotent; the index is reset so `data`
# is a fresh frame rather than a filtered view of the raw table.
data = (
    pd.read_csv('../dataset/crop_production.csv')
    .dropna()
    .reset_index(drop=True)
)

# Encode categorical columns as integer codes.
# A separate LabelEncoder is kept per column: re-fitting one shared instance
# would discard the mapping of every column except the last, making it
# impossible to inverse-transform those codes or encode new rows consistently.
categorical_cols = ['State_Name', 'District_Name', 'Season', 'Crop']
encoders = {}

for col in categorical_cols:
    col_encoder = LabelEncoder()
    data[col] = col_encoder.fit_transform(data[col])
    encoders[col] = col_encoder

# Backward compatibility: `encoder` used to end up fitted on the last
# categorical column, so keep that name bound to the same fitted object.
encoder = encoders[categorical_cols[-1]]

# Scale numerical columns to [0, 1]; MinMaxScaler scales each column
# independently, and `scaler` is needed later to map values back to the
# original units.
# NOTE(review): the scaler is fit on the full dataset, i.e. before any
# train/test split, so test-set min/max leak into the training data. Consider
# fitting on the training rows only.
scaler = MinMaxScaler()
numerical_cols = ['Area', 'Production']
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Save preprocessed dataset (without the pandas index column)
data.to_csv('./crop_production_processed.csv', index=False)


In [6]:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
import numpy as np
import pandas as pd

# Build sliding-window batch generators for training and validation from the
# preprocessed CSV.

# Load preprocessed dataset
data = pd.read_csv('./crop_production_processed.csv')

# Define features (every column except the target) and target
features = data.drop(columns=['Production']).values
target = data['Production'].values

# Windowing hyper-parameters, defined once and reused below.
sequence_length = 10   # number of consecutive rows in each input window
batch_size = 32        # training batch size
# The validation generator previously relied on the Keras default batch size
# (128); it is spelled out here so the difference from `batch_size` is visible.
# Set this to `batch_size` if identical batching is wanted.
val_batch_size = 128

# Define an 80-20 train-test split index. The split is positional (no
# shuffling): the first 80% of rows train, the remaining 20% validate.
split_index = int(len(features) * 0.8)

# Split into training and testing sets
train_features = features[:split_index]
train_target = target[:split_index]

test_features = features[split_index:]
test_target = target[split_index:]

# Generate sequences for training and validation sets. Each sample is a window
# of `sequence_length` consecutive rows paired with the target of the row that
# immediately follows the window.
# NOTE(review): windows are formed over consecutive CSV rows; confirm that the
# row order represents a meaningful sequence for this dataset.
train_generator = TimeseriesGenerator(
    train_features,
    train_target,
    length=sequence_length,
    batch_size=batch_size,
)
val_generator = TimeseriesGenerator(
    test_features,
    test_target,
    length=sequence_length,
    batch_size=val_batch_size,
)

In [7]:
# Display the training feature matrix: the first 80% of rows of the
# preprocessed table with the 'Production' target column removed.
# NumPy abbreviates the repr of large arrays, so only the head and tail show.
train_features

array([[0.00000000e+00, 4.27000000e+02, 2.00000000e+03, 1.00000000e+00,
        2.00000000e+00, 1.46140490e-04],
       [0.00000000e+00, 4.27000000e+02, 2.00000000e+03, 1.00000000e+00,
        7.40000000e+01, 2.21442643e-07],
       [0.00000000e+00, 4.27000000e+02, 2.00000000e+03, 1.00000000e+00,
        9.50000000e+01, 1.18763186e-05],
       ...,
       [2.90000000e+01, 4.33000000e+02, 2.00700000e+03, 1.00000000e+00,
        6.30000000e+01, 1.07108310e-05],
       [2.90000000e+01, 4.33000000e+02, 2.00700000e+03, 1.00000000e+00,
        7.40000000e+01, 5.41835183e-05],
       [2.90000000e+01, 4.33000000e+02, 2.00700000e+03, 1.00000000e+00,
        9.50000000e+01, 4.88338137e-03]], shape=(193888, 6))

In [12]:
# Sanity check on the training generator's batch count. With 193888 training
# rows, a window length of 10 and a batch size of 32, the expected value is
# ceil((193888 - 10) / 32) = 6059.
train_generator.num_batches

6059