In [13]:
import torch
import torch.nn as nn
import numpy as np
from pandas import read_csv
import matplotlib.pyplot as plt
import pandas as pd
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler

In [5]:
file_path = r'D:\project-rice-yield-prediction\datasets and notes\BD-rice(ok).xlsx'
df= pd.read_excel(file_path,sheet_name='Aman',skiprows=1)

In [6]:
df

Unnamed: 0,rice,year,district,weather district,division,yield,precip_1,precip_2,precip_5,precip_6,temp_1,temp_2,temp_5,temp_6
0,Aman,2006,Bagerhat,Khulna,Khulna,0.728570,16.838710,11.741935,0.033333,0.000000,28.9,28.7,24.1,20.2
1,Aman,2007,Bagerhat,Khulna,Khulna,0.522331,19.064516,5.161290,3.766667,0.000000,28.6,29.3,24.2,19.1
2,Aman,2008,Bagerhat,Khulna,Khulna,0.663455,9.709677,6.516129,0.000000,0.000000,28.5,29.0,24.1,20.6
3,Aman,2009,Bagerhat,Khulna,Khulna,0.769729,11.193548,18.322581,0.666667,0.000000,29.0,29.0,24.5,19.3
4,Aman,2010,Bagerhat,Khulna,Khulna,0.710355,5.806452,6.612903,0.000000,0.419355,29.6,29.7,25.1,19.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1318,Aman,2017,Thakurgaon,Dinajpur,Rangpur,1.138931,6.290323,21.129032,0.000000,0.000000,29.3,29.6,22.1,19.2
1319,Aman,2018,Thakurgaon,Dinajpur,Rangpur,1.159884,5.322581,5.290323,0.000000,0.322581,29.9,30.1,21.8,17.3
1320,Aman,2019,Thakurgaon,Dinajpur,Rangpur,1.170860,21.935484,10.580645,0.000000,0.000000,29.4,30.4,23.2,16.5
1321,Aman,2020,Thakurgaon,Dinajpur,Rangpur,1.083696,19.935484,10.258065,0.100000,0.000000,28.9,30.2,22.0,17.7


In [7]:
# Filter out the yield data
data = df[df['yield'].notnull()]

In [8]:
# Dropping the specified columns
data_dropped = data.drop(columns=['rice', 'weather district', 'division'])

# Display the remaining columns
print(data_dropped.columns)


Index(['year', 'district', 'yield', 'precip_1', 'precip_2', 'precip_5',
       'precip_6', 'temp_1', 'temp_2', 'temp_5', 'temp_6'],
      dtype='object')


In [9]:
df = pd.DataFrame(data_dropped)

# Workflow 1: Backward Padding for districts starting from 2006
# Filter districts starting from 2006
start_2006 = df[df['year'] >= 2006].groupby('district')

# Create missing rows for years 1993-2005 for each district starting in 2006
for district, group in start_2006:
    missing_years = pd.DataFrame({
        'district': [district] * (2006 - 1993),
        'year': list(range(1993, 2006)),
        'yield': [group['yield'].iloc[0]] * (2006 - 1993),  # Backward fill from 2006
        'precip_1': [group['precip_1'].iloc[0]] * (2006 - 1993),
        'precip_2': [group['precip_2'].iloc[0]] * (2006 - 1993),
        'precip_5': [group['precip_5'].iloc[0]] * (2006 - 1993),
        'precip_6': [group['precip_6'].iloc[0]] * (2006 - 1993),
        'temp_1': [group['temp_1'].iloc[0]] * (2006 - 1993),
        'temp_2': [group['temp_2'].iloc[0]] * (2006 - 1993),
        'temp_5': [group['temp_5'].iloc[0]] * (2006 - 1993),
        'temp_6': [group['temp_6'].iloc[0]] * (2006 - 1993)
    })
    # Append the missing years back to the original dataframe
    df = pd.concat([df, missing_years], ignore_index=True)

# Sort the dataframe by district and year for clear order
df = df.sort_values(by=['district', 'year']).reset_index(drop=True)


df.head()


Unnamed: 0,year,district,yield,precip_1,precip_2,precip_5,precip_6,temp_1,temp_2,temp_5,temp_6
0,1993,Bagerhat,0.72857,16.83871,11.741935,0.033333,0.0,28.9,28.7,24.1,20.2
1,1994,Bagerhat,0.72857,16.83871,11.741935,0.033333,0.0,28.9,28.7,24.1,20.2
2,1995,Bagerhat,0.72857,16.83871,11.741935,0.033333,0.0,28.9,28.7,24.1,20.2
3,1996,Bagerhat,0.72857,16.83871,11.741935,0.033333,0.0,28.9,28.7,24.1,20.2
4,1997,Bagerhat,0.72857,16.83871,11.741935,0.033333,0.0,28.9,28.7,24.1,20.2


In [10]:
# Set 'district' and 'year' as the index and keep the necessary columns
yield_data = df.set_index(['district', 'year'])[['yield','precip_1', 'precip_2', 'precip_5', 'precip_6', 
                                               'temp_1', 'temp_2', 'temp_5', 'temp_6',]]

# Display the reshaped DataFrame
yield_data.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,yield,precip_1,precip_2,precip_5,precip_6,temp_1,temp_2,temp_5,temp_6
district,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bagerhat,1993,0.72857,16.83871,11.741935,0.033333,0.0,28.9,28.7,24.1,20.2
Bagerhat,1994,0.72857,16.83871,11.741935,0.033333,0.0,28.9,28.7,24.1,20.2
Bagerhat,1995,0.72857,16.83871,11.741935,0.033333,0.0,28.9,28.7,24.1,20.2
Bagerhat,1996,0.72857,16.83871,11.741935,0.033333,0.0,28.9,28.7,24.1,20.2
Bagerhat,1997,0.72857,16.83871,11.741935,0.033333,0.0,28.9,28.7,24.1,20.2


In [11]:
yield_data.shape

(2155, 9)

In [14]:
# Normalize the data: Drop the target variable 'yield' and categorical columns
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(yield_data.drop(columns=['yield']))

# Create a DataFrame for the scaled data
scaled_df = pd.DataFrame(data=scaled, columns=yield_data.drop(columns=['yield']).columns)

In [15]:
# Check the normalized data
scaled_df.head()

Unnamed: 0,precip_1,precip_2,precip_5,precip_6,temp_1,temp_2,temp_5,temp_6
0,0.293917,0.245844,0.00274,0.0,0.617647,0.484848,0.681818,0.571429
1,0.293917,0.245844,0.00274,0.0,0.617647,0.484848,0.681818,0.571429
2,0.293917,0.245844,0.00274,0.0,0.617647,0.484848,0.681818,0.571429
3,0.293917,0.245844,0.00274,0.0,0.617647,0.484848,0.681818,0.571429
4,0.293917,0.245844,0.00274,0.0,0.617647,0.484848,0.681818,0.571429


In [16]:
# Create a copy of the original DataFrame
yield_data_scaled = yield_data.copy()

# Replace the relevant columns with the scaled values
yield_data_scaled[scaled_df.columns] = np.array(scaled_df)

# Set the index of the DataFrame to 'district' and 'year'
yield_data_scaled = yield_data_scaled.reset_index().set_index(['district', 'year'])

# Display the first few rows of the updated DataFrame
yield_data_scaled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,yield,precip_1,precip_2,precip_5,precip_6,temp_1,temp_2,temp_5,temp_6
district,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bagerhat,1993,0.72857,0.293917,0.245844,0.00274,0.0,0.617647,0.484848,0.681818,0.571429
Bagerhat,1994,0.72857,0.293917,0.245844,0.00274,0.0,0.617647,0.484848,0.681818,0.571429
Bagerhat,1995,0.72857,0.293917,0.245844,0.00274,0.0,0.617647,0.484848,0.681818,0.571429
Bagerhat,1996,0.72857,0.293917,0.245844,0.00274,0.0,0.617647,0.484848,0.681818,0.571429
Bagerhat,1997,0.72857,0.293917,0.245844,0.00274,0.0,0.617647,0.484848,0.681818,0.571429


In [17]:

# Function to split the data into train and test sets
def train_test_split(data):
    # Calculate the size of the training set (80% of the data)
    size = int(len(data) * 0.8)

    # For train data, collect 80% of the data, dropping the target variable 'yield'
    x_train = data.drop(columns=['yield']).iloc[0:size].values.astype('float32')
    
    # For test data, collect the remaining 20% of the data
    x_test = data.drop(columns=['yield']).iloc[size:].values.astype('float32')

    # Define the target variable for training and testing
    y_train = data['yield'].iloc[0:size].values.astype('float32')
    y_test = data['yield'].iloc[size:].values.astype('float32')

    # Convert the numpy arrays to PyTorch tensors
    x_train_tensor = torch.tensor(x_train)
    x_test_tensor = torch.tensor(x_test)
    y_train_tensor = torch.tensor(y_train)
    y_test_tensor = torch.tensor(y_test)

    return x_train_tensor, x_test_tensor, y_train_tensor, y_test_tensor




In [18]:
yield_data_scaled.shape

(2155, 9)

In [19]:
import pandas as pd
import torch

# Function to split the data into train and test sets
def train_test_split(data):
    # Calculate the size of the training set (80% of the data)
    size = int(len(data) * 0.8)

    # For train data, collect 80% of the data, dropping the target variable 'yield'
    x_train = data.drop(columns=['yield']).iloc[0:size].values.astype('float32')
    
    # For test data, collect the remaining 20% of the data
    x_test = data.drop(columns=['yield']).iloc[size:].values.astype('float32')

    # Define the target variable for training and testing
    y_train = data['yield'].iloc[0:size].values.astype('float32')
    y_test = data['yield'].iloc[size:].values.astype('float32')

    # Convert the numpy arrays to PyTorch tensors
    x_train_tensor = torch.tensor(x_train)
    x_test_tensor = torch.tensor(x_test)
    y_train_tensor = torch.tensor(y_train)
    y_test_tensor = torch.tensor(y_test)

    return x_train_tensor, x_test_tensor, y_train_tensor, y_test_tensor

# Example MultiIndex DataFrame (replace with your actual data)
# yield_data_scaled = pd.read_csv('your_dataset.csv', index_col=['district', 'year'])
# Get unique districts
districts = yield_data_scaled.index.levels[0]  # Get the unique districts from the first level of the index

# Initialize empty lists for training and testing data
X_train = []
X_test = []
Y_train = []
Y_test = []

# Iterate over each unique district
for district in districts:
    data = yield_data_scaled.xs(district, level='district')  # Get data for the current district
    
    # Optionally, reset the index and add the district back
    data_reset = data.reset_index()  # This will make 'year' a column
    data_reset['district'] = district  # Add the district back as a column
    data_reset = data_reset.set_index(['district', 'year'])  # Set the index back to district and year

    # Apply the function to split the data
    x_train, x_test, y_train, y_test = train_test_split(data_reset) 
    X_train.append(x_train)
    X_test.append(x_test)
    Y_train.append(y_train)
    Y_test.append(y_test)

# Convert the lists of tensors to single tensors for each set
X_train_tensor = torch.cat(X_train, dim=0)
X_test_tensor = torch.cat(X_test, dim=0)
Y_train_tensor = torch.cat(Y_train, dim=0)
Y_test_tensor = torch.cat(Y_test, dim=0)

# Example output (shape of the tensors)
print(f'X_train shape: {X_train_tensor.shape}')
print(f'X_test shape: {X_test_tensor.shape}')
print(f'Y_train shape: {Y_train_tensor.shape}')
print(f'Y_test shape: {Y_test_tensor.shape}')


X_train shape: torch.Size([1702, 8])
X_test shape: torch.Size([453, 8])
Y_train shape: torch.Size([1702])
Y_test shape: torch.Size([453])


In [21]:
# Assuming X_train[0] is a PyTorch tensor
X_train_numpy = X_train[0].numpy()  # Convert to NumPy array
print(X_train_numpy[:5])  # Print the first 5 rows


[[0.2939166  0.24584427 0.00273973 0.         0.61764705 0.4848485
  0.6818182  0.5714286 ]
 [0.2939166  0.24584427 0.00273973 0.         0.61764705 0.4848485
  0.6818182  0.5714286 ]
 [0.2939166  0.24584427 0.00273973 0.         0.61764705 0.4848485
  0.6818182  0.5714286 ]
 [0.2939166  0.24584427 0.00273973 0.         0.61764705 0.4848485
  0.6818182  0.5714286 ]
 [0.2939166  0.24584427 0.00273973 0.         0.61764705 0.4848485
  0.6818182  0.5714286 ]]


In [23]:


# Assuming X_train, Y_train, X_test, and Y_test are lists of PyTorch tensors
# Concatenate each train dataset in X_train list and Y_train list respectively
X_train_tensor = torch.cat(X_train, dim=0)  # Concatenate along the first dimension (rows)
Y_train_tensor = torch.cat(Y_train, dim=0).view(-1, 1)  # Concatenate and reshape if needed (e.g., for a single target variable)

# Concatenate each test dataset in X_test list and Y_test list respectively
X_test_tensor = torch.cat(X_test, dim=0)  # Concatenate along the first dimension (rows)
Y_test_tensor = torch.cat(Y_test, dim=0).view(-1, 1)  # Concatenate and reshape if needed

# Example of checking the shape of the concatenated tensors
print("Shape of concatenated X_train:", X_train_tensor.shape)
print("Shape of concatenated Y_train:", Y_train_tensor.shape)
print("Shape of concatenated X_test:", X_test_tensor.shape)
print("Shape of concatenated Y_test:", Y_test_tensor.shape)


Shape of concatenated X_train: torch.Size([1702, 8])
Shape of concatenated Y_train: torch.Size([1702, 1])
Shape of concatenated X_test: torch.Size([453, 8])
Shape of concatenated Y_test: torch.Size([453, 1])


NameError: name 'Sequential' is not defined