In [1]:
import pandas as pd

# Read the weather table and cast the calendar columns to integers in one step
df = pd.read_csv("final_weather.csv").astype({'year': int, 'month': int})

# Assign season and season_year (separating Winter and Rabi)
# month -> (season label, offset added to the row's year to obtain season_year)
_SEASON_BY_MONTH = {
    11: ('Winter', 0),
    12: ('Winter', 0),
    1: ('Winter', -1),  # January is filed under the previous year's Winter
    2: ('Rabi', 0),
    3: ('Rabi', 0),
    4: ('Summer', 0),
    5: ('Summer', 0),
    6: ('Kharif', 0),
    7: ('Kharif', 0),
    8: ('Kharif', 0),
    9: ('Autumn', 0),
    10: ('Autumn', 0),
}


def assign_season(row):
    """Return the season label and season_year for one row.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['year']`` and ``row['month']``.

    Returns
    -------
    pd.Series
        Two entries, ``season`` and ``season_year``. November and December
        are 'Winter' of the same year, January is 'Winter' of ``year - 1``,
        Feb-Mar 'Rabi', Apr-May 'Summer', Jun-Aug 'Kharif', Sep-Oct 'Autumn'.
        Any other month value yields ``None`` for both entries.
    """
    entry = _SEASON_BY_MONTH.get(row['month'])
    if entry is None:
        return pd.Series({'season': None, 'season_year': None})

    label, year_offset = entry
    return pd.Series({'season': label, 'season_year': row['year'] + year_offset})

# Run assign_season on every row; the two fields it returns become the
# 'season' and 'season_year' columns.
df[['season', 'season_year']] = df.apply(assign_season, axis='columns')

# Whole Year bucket: February of `year` through January of the following year.
# Months 2-12 keep their calendar year; every other month value (January) is
# pushed back to year - 1, the same shift assign_season applies to January's
# Winter rows, so a whole_year value spans the months of the seasons that
# carry the same season_year.
# NOTE(review): this comment previously read "Nov of previous year to Oct of
# current year", which is not what the expression computes. Confirm which
# window the crop data's "Whole Year" is meant to match.
# Vectorised instead of a per-row apply: same values, and it also works on an
# empty frame.
df['whole_year'] = df['year'].where(df['month'].between(2, 12), df['year'] - 1)

# One row per (state, district, whole_year): average the four weather
# measures, keep the first latitude/longitude seen, then rename the key to
# season_year and tag the rows as the 'Whole Year' season.
whole_df = (
    df.groupby(['state', 'district', 'whole_year'])
    .agg(
        temperature_2m_mean=('temperature_2m_mean', 'mean'),
        precipitation_sum=('precipitation_sum', 'mean'),
        relative_humidity_2m_mean=('relative_humidity_2m_mean', 'mean'),
        wind_speed_10m_mean=('wind_speed_10m_mean', 'mean'),
        latitude=('latitude', 'first'),
        longitude=('longitude', 'first'),
    )
    .reset_index()
    .rename(columns={'whole_year': 'season_year'})
    .assign(season='Whole Year')
)

# Per-season table: one row per (state, district, season_year, season) with
# the weather measures averaged and the first latitude/longitude kept.
seasonal_df = (
    df.groupby(['state', 'district', 'season_year', 'season'])
    .agg({
        **{measure: 'mean' for measure in (
            'temperature_2m_mean',
            'precipitation_sum',
            'relative_humidity_2m_mean',
            'wind_speed_10m_mean',
        )},
        'latitude': 'first',
        'longitude': 'first',
    })
    .reset_index()
)

# Stack the seasonal rows on top of the whole-year rows, then order the result
# by state, district, season_year and season with a fresh 0..n-1 index.
final_df = (
    pd.concat([seasonal_df, whole_df], ignore_index=True)
    .sort_values(by=['state', 'district', 'season_year', 'season'])
    .reset_index(drop=True)
)

# Show the combined table
final_df


Unnamed: 0,state,district,season_year,season,temperature_2m_mean,precipitation_sum,relative_humidity_2m_mean,wind_speed_10m_mean,latitude,longitude
0,Andhra Pradesh,Anantapur,2010,Whole Year,23.990323,0.022581,50.258065,10.270968,14.724220,77.430674
1,Andhra Pradesh,Anantapur,2010,Winter,23.990323,0.022581,50.258065,10.270968,14.724220,77.430674
2,Andhra Pradesh,Anantapur,2011,Autumn,26.495968,1.685484,69.622043,11.964624,14.724220,77.430674
3,Andhra Pradesh,Anantapur,2011,Kharif,26.372581,3.022760,71.812545,19.778638,14.724220,77.430674
4,Andhra Pradesh,Anantapur,2011,Rabi,27.880645,0.007143,37.172235,10.267684,14.724220,77.430674
...,...,...,...,...,...,...,...,...,...,...
42879,West Bengal,Uttar Dinajpur,2021,Kharif,28.091792,12.664301,87.018638,9.174516,25.872143,87.961986
42880,West Bengal,Uttar Dinajpur,2021,Rabi,22.617396,0.051959,58.408986,8.314804,25.872143,87.961986
42881,West Bengal,Uttar Dinajpur,2021,Summer,27.483602,4.345699,68.403763,10.042849,25.872143,87.961986
42882,West Bengal,Uttar Dinajpur,2021,Whole Year,25.295705,5.534100,75.406619,8.547511,25.872143,87.961986


In [2]:
# Load the crop area/production table
agriculture = pd.read_csv('updated_crop.csv')

In [3]:
# Display the crop table read from 'updated_crop.csv'
agriculture

Unnamed: 0,year,state_name,district_name,season,crop_name,area,production
0,2010,Andhra Pradesh,Anantapur,Whole Year,Arecanut,409.0,182.0
1,2010,Andhra Pradesh,Krishna,Whole Year,Arecanut,9.0,4.0
2,2010,Andhra Pradesh,West Godavari,Whole Year,Arecanut,20.0,9.0
3,2010,Telangana,Adilabad,Kharif,Arhar/Tur,47884.0,26193.0
4,2010,Telangana,Adilabad,Rabi,Arhar/Tur,1648.0,901.0
...,...,...,...,...,...,...,...
162976,2020,Uttarakhand,Pithoragarh,Rabi,Wheat,16850.0,28415.0
162977,2020,Uttarakhand,Rudra Prayag,Rabi,Wheat,9155.0,8709.0
162978,2020,Uttarakhand,Tehri Garhwal,Rabi,Wheat,15451.0,28096.0
162979,2020,Uttarakhand,Udam Singh Nagar,Rabi,Wheat,105961.0,471000.0


In [4]:
# Number of missing values in each column of the crop table
agriculture.isna().sum()

year             0
state_name       0
district_name    0
season           0
crop_name        0
area             0
production       0
dtype: int64

In [5]:
# Column names of the crop table (needed to pick the merge keys)
agriculture.columns

Index(['year', 'state_name', 'district_name', 'season', 'crop_name', 'area',
       'production'],
      dtype='object')

In [6]:
# Column names of the aggregated weather table (needed to pick the merge keys)
final_df.columns

Index(['state', 'district', 'season_year', 'season', 'temperature_2m_mean',
       'precipitation_sum', 'relative_humidity_2m_mean', 'wind_speed_10m_mean',
       'latitude', 'longitude'],
      dtype='object')

In [8]:
import pandas as pd



# Put the text join keys of both tables into one canonical form
# (surrounding whitespace removed, lower case) before matching them.
agriculture[['state_name', 'district_name', 'season']] = (
    agriculture[['state_name', 'district_name', 'season']]
    .apply(lambda col: col.str.strip().str.lower())
)
final_df[['state', 'district', 'season']] = (
    final_df[['state', 'district', 'season']]
    .apply(lambda col: col.str.strip().str.lower())
)

# Left join: every crop row is kept and receives the weather aggregates of the
# same state / district / year / season when such a weather row exists.
# The right-hand key columns are redundant afterwards and are dropped.
merged_df = agriculture.merge(
    final_df,
    how='left',
    left_on=['state_name', 'district_name', 'year', 'season'],
    right_on=['state', 'district', 'season_year', 'season'],
).drop(columns=['state', 'district', 'season_year'])

# Show the resulting column layout, then the merged table itself
print(merged_df.columns)
merged_df


Index(['year', 'state_name', 'district_name', 'season', 'crop_name', 'area',
       'production', 'temperature_2m_mean', 'precipitation_sum',
       'relative_humidity_2m_mean', 'wind_speed_10m_mean', 'latitude',
       'longitude'],
      dtype='object')


Unnamed: 0,year,state_name,district_name,season,crop_name,area,production,temperature_2m_mean,precipitation_sum,relative_humidity_2m_mean,wind_speed_10m_mean,latitude,longitude
0,2010,andhra pradesh,anantapur,whole year,Arecanut,409.0,182.0,23.990323,0.022581,50.258065,10.270968,14.724220,77.430674
1,2010,andhra pradesh,krishna,whole year,Arecanut,9.0,4.0,23.129032,0.051613,78.645161,7.929032,16.212770,81.030696
2,2010,andhra pradesh,west godavari,whole year,Arecanut,20.0,9.0,23.587097,0.000000,74.548387,7.635484,16.644303,81.588525
3,2010,telangana,adilabad,kharif,Arhar/Tur,47884.0,26193.0,,,,,,
4,2010,telangana,adilabad,rabi,Arhar/Tur,1648.0,901.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
162976,2020,uttarakhand,pithoragarh,rabi,Wheat,16850.0,28415.0,11.081924,3.072747,71.819244,3.548943,29.585871,80.215167
162977,2020,uttarakhand,rudra prayag,rabi,Wheat,9155.0,8709.0,6.031479,6.103337,50.872080,3.980590,30.608720,79.065170
162978,2020,uttarakhand,tehri garhwal,rabi,Wheat,15451.0,28096.0,8.545439,2.390879,74.800890,5.159622,30.465818,78.483735
162979,2020,uttarakhand,udam singh nagar,rabi,Wheat,105961.0,471000.0,18.012514,1.905673,72.485539,9.747052,29.000000,79.416700


In [9]:
# Discard every row that has at least one missing value
# (e.g. crop rows that found no weather match in the left join).
merged_df = merged_df.dropna(how='any')

In [10]:
# Display the merged table after rows with missing values were dropped
merged_df

Unnamed: 0,year,state_name,district_name,season,crop_name,area,production,temperature_2m_mean,precipitation_sum,relative_humidity_2m_mean,wind_speed_10m_mean,latitude,longitude
0,2010,andhra pradesh,anantapur,whole year,Arecanut,409.0,182.0,23.990323,0.022581,50.258065,10.270968,14.724220,77.430674
1,2010,andhra pradesh,krishna,whole year,Arecanut,9.0,4.0,23.129032,0.051613,78.645161,7.929032,16.212770,81.030696
2,2010,andhra pradesh,west godavari,whole year,Arecanut,20.0,9.0,23.587097,0.000000,74.548387,7.635484,16.644303,81.588525
72,2010,telangana,adilabad,whole year,Banana,6.0,196.0,22.319355,0.000000,40.483871,6.738710,19.675945,78.533990
73,2010,andhra pradesh,anantapur,whole year,Banana,4416.0,223825.0,23.990323,0.022581,50.258065,10.270968,14.724220,77.430674
...,...,...,...,...,...,...,...,...,...,...,...,...,...
162976,2020,uttarakhand,pithoragarh,rabi,Wheat,16850.0,28415.0,11.081924,3.072747,71.819244,3.548943,29.585871,80.215167
162977,2020,uttarakhand,rudra prayag,rabi,Wheat,9155.0,8709.0,6.031479,6.103337,50.872080,3.980590,30.608720,79.065170
162978,2020,uttarakhand,tehri garhwal,rabi,Wheat,15451.0,28096.0,8.545439,2.390879,74.800890,5.159622,30.465818,78.483735
162979,2020,uttarakhand,udam singh nagar,rabi,Wheat,105961.0,471000.0,18.012514,1.905673,72.485539,9.747052,29.000000,79.416700


In [11]:
# Summarise row count, per-column non-null counts and dtypes of the merged table
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 149540 entries, 0 to 162980
Data columns (total 13 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   year                       149540 non-null  int64  
 1   state_name                 149540 non-null  object 
 2   district_name              149540 non-null  object 
 3   season                     149540 non-null  object 
 4   crop_name                  149540 non-null  object 
 5   area                       149540 non-null  float64
 6   production                 149540 non-null  float64
 7   temperature_2m_mean        149540 non-null  float64
 8   precipitation_sum          149540 non-null  float64
 9   relative_humidity_2m_mean  149540 non-null  float64
 10  wind_speed_10m_mean        149540 non-null  float64
 11  latitude                   149540 non-null  float64
 12  longitude                  149540 non-null  float64
dtypes: float64(8), int64(1), object(4)

In [12]:
# Number of distinct district names left in the merged table
merged_df['district_name'].nunique(dropna=False)

623

In [13]:
# Keep only the rows whose crop year is 2011 or later
merged_df = merged_df.loc[merged_df['year'].ge(2011)]

merged_df

Unnamed: 0,year,state_name,district_name,season,crop_name,area,production,temperature_2m_mean,precipitation_sum,relative_humidity_2m_mean,wind_speed_10m_mean,latitude,longitude
14564,2011,andhra pradesh,anantapur,whole year,Arecanut,541.0,267.0,26.831398,1.345787,58.374674,13.130421,14.724220,77.430674
14565,2011,andhra pradesh,east godavari,whole year,Arecanut,20.0,8.0,27.594816,2.444082,72.726210,8.447894,16.995664,81.715438
14566,2011,andhra pradesh,vizianagaram,whole year,Arecanut,1.0,8.0,26.460210,1.814801,76.999968,11.643457,18.114126,83.411439
14567,2011,telangana,adilabad,kharif,Arhar/Tur,38832.0,23260.0,28.177563,5.219892,72.377061,11.180717,19.675945,78.533990
14568,2011,telangana,adilabad,rabi,Arhar/Tur,591.0,354.0,27.838422,0.271141,36.776498,6.911118,19.675945,78.533990
...,...,...,...,...,...,...,...,...,...,...,...,...,...
162976,2020,uttarakhand,pithoragarh,rabi,Wheat,16850.0,28415.0,11.081924,3.072747,71.819244,3.548943,29.585871,80.215167
162977,2020,uttarakhand,rudra prayag,rabi,Wheat,9155.0,8709.0,6.031479,6.103337,50.872080,3.980590,30.608720,79.065170
162978,2020,uttarakhand,tehri garhwal,rabi,Wheat,15451.0,28096.0,8.545439,2.390879,74.800890,5.159622,30.465818,78.483735
162979,2020,uttarakhand,udam singh nagar,rabi,Wheat,105961.0,471000.0,18.012514,1.905673,72.485539,9.747052,29.000000,79.416700


In [14]:
# Write the merged table to CSV; index=False leaves the row index out of the file
merged_df.to_csv('data_set_of_2011-20.csv',index=False)