# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw yield records from the Datasets directory one level up.
raw_csv_path = '../Datasets/original_yield_dataset.csv'
df_original = pd.read_csv(raw_csv_path)

In [3]:
# (rows, columns) of the freshly loaded frame.
df_original.shape

(74975, 7)

In [4]:
# Preview the first rows to check how the columns were parsed.
df_original.head()

Unnamed: 0,State,Year,Season,Crop,Area,Rainfall,Production
0,Andaman and Nicobar Islands,2000,Kharif,Arecanut,1254.0,2763.2,2000.0
1,Andaman and Nicobar Islands,2000,Kharif,Other Kharif pulses,2.0,2763.2,1.0
2,Andaman and Nicobar Islands,2000,Kharif,Rice,102.0,2763.2,321.0
3,Andaman and Nicobar Islands,2000,Whole Year,Banana,176.0,2763.2,641.0
4,Andaman and Nicobar Islands,2000,Whole Year,Cashewnut,720.0,2763.2,165.0


In [5]:
# Column dtypes and non-null counts of the raw frame.
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74975 entries, 0 to 74974
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   State       74975 non-null  object 
 1   Year        74975 non-null  int64  
 2   Season      74975 non-null  object 
 3   Crop        74975 non-null  object 
 4   Area        74975 non-null  float64
 5   Rainfall    74975 non-null  float64
 6   Production  74975 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 4.0+ MB


##### Many values contain stray whitespace that causes errors, so let's strip it

In [6]:
def _strip_if_text(column):
    """Return `column` with surrounding whitespace removed when it holds text.

    The dtype check accepts both object-dtype columns and pandas' dedicated
    string dtype, so text columns are not skipped when they are not stored
    as plain ``object``. Non-text columns are returned unchanged.
    """
    if pd.api.types.is_string_dtype(column.dtype):
        return column.str.strip()
    return column


# Strip leading/trailing whitespace from every text column so that labels
# differing only by padding compare equal in the later steps.
df_original = df_original.apply(_strip_if_text)

In [7]:
# Count the distinct crop labels.
crop_labels = df_original["Crop"].unique()
print(len(crop_labels))

86


In [8]:
# List the distinct crop labels alphabetically to spot duplicates and near-duplicates.
unique_crops = df_original["Crop"].unique()
sorted_crops = np.sort(unique_crops)
print(sorted_crops)

['Arecanut' 'Arhar/Tur' 'Bajra' 'Banana' 'Barley'
 'Beans & Mutter(Vegetable)' 'Bhindi' 'Black pepper' 'Blackgram' 'Brinjal'
 'Cabbage' 'Cardamom' 'Carrot' 'Cashewnut' 'Cashewnut Raw' 'Castor seed'
 'Citrus Fruit' 'Coconut' 'Coffee' 'Cond-spcs other' 'Coriander'
 'Cotton(lint)' 'Drum Stick' 'Dry chillies' 'Dry ginger' 'Garlic' 'Ginger'
 'Gram' 'Grapes' 'Groundnut' 'Guar seed' 'Horse-gram' 'Jack Fruit' 'Jowar'
 'Jute' 'Khesari' 'Korra' 'Lentil' 'Linseed' 'Maize' 'Mango' 'Masoor'
 'Mesta' 'Moong(Green Gram)' 'Moth' 'Niger seed' 'Oilseeds total' 'Onion'
 'Orange' 'Other  Rabi pulses' 'Other Cereals & Millets'
 'Other Kharif pulses' 'Other Vegetables' 'Paddy' 'Papaya'
 'Peas & beans (Pulses)' 'Pineapple' 'Pome Fruit' 'Potato' 'Pulses total'
 'Ragi' 'Rapeseed &Mustard' 'Redish' 'Rice' 'Rubber' 'Safflower' 'Samai'
 'Sannhamp' 'Sesamum' 'Small millets' 'Soyabean' 'Sugarcane' 'Sunflower'
 'Sweet potato' 'Tapioca' 'Tea' 'Tobacco' 'Tomato' 'Total foodgrain'
 'Turmeric' 'Turnip' 'Urad' 'Varagu' '

In [9]:
# Raise the display limit so the full per-crop frequency table is shown.
pd.set_option("display.max_rows", 120)
df_original.Crop.value_counts()

Rice                         5792
Maize                        4827
Moong(Green Gram)            3532
Urad                         3171
Sesamum                      2808
Wheat                        2584
Sugarcane                    2576
Rapeseed &Mustard            2450
Potato                       2436
Groundnut                    2346
Ragi                         2183
Arhar/Tur                    2174
Horse-gram                   1987
Gram                         1920
Dry chillies                 1644
Onion                        1637
Small millets                1579
Turmeric                     1563
Sunflower                    1425
Dry ginger                   1387
Masoor                       1363
Sweet potato                 1334
Peas & beans (Pulses)        1331
Barley                       1258
Banana                       1164
Linseed                      1132
Coriander                    1123
Other Kharif pulses          1111
Garlic                       1072
Jowar         

In [10]:
# List the distinct season labels alphabetically.
unique_seasons = df_original["Season"].unique()
sorted_seasons = np.sort(unique_seasons)
print(sorted_seasons)

['Autumn' 'Kharif' 'Rabi' 'Summer' 'Whole Year' 'Winter']


In [11]:
# Row count per season label before any relabelling.
df_original['Season'].value_counts()

Kharif        24375
Whole Year    20644
Rabi          17064
Winter         4808
Summer         4525
Autumn         3559
Name: Season, dtype: int64

##### Replace Paddy -> Rice, and Summer -> Kharif, Autumn -> Rabi, Winter -> Rabi

In [12]:
# Harmonise labels: 'Paddy' is relabelled as 'Rice', and the minor seasons are
# folded into the two main ones (Summer -> Kharif; Autumn and Winter -> Rabi).
# The mapping is nested as {column: {old: new}}; pandas expects this form to be
# passed without a `value` argument. Rebinding the result (instead of
# inplace=True) keeps the cell safe to re-run.
LABEL_REPLACEMENTS = {
    'Crop': {'Paddy': 'Rice'},
    'Season': {'Summer': 'Kharif', 'Autumn': 'Rabi', 'Winter': 'Rabi'},
}
df_original = df_original.replace(LABEL_REPLACEMENTS)

In [13]:
# Check which season labels remain after the relabelling.
df_original.Season.value_counts()

Kharif        28900
Rabi          25431
Whole Year    20644
Name: Season, dtype: int64

In [14]:
# Re-check crop frequencies after relabelling 'Paddy' as 'Rice'.
df_original.Crop.value_counts()

Rice                         5882
Maize                        4827
Moong(Green Gram)            3532
Urad                         3171
Sesamum                      2808
Wheat                        2584
Sugarcane                    2576
Rapeseed &Mustard            2450
Potato                       2436
Groundnut                    2346
Ragi                         2183
Arhar/Tur                    2174
Horse-gram                   1987
Gram                         1920
Dry chillies                 1644
Onion                        1637
Small millets                1579
Turmeric                     1563
Sunflower                    1425
Dry ginger                   1387
Masoor                       1363
Sweet potato                 1334
Peas & beans (Pulses)        1331
Barley                       1258
Banana                       1164
Linseed                      1132
Coriander                    1123
Other Kharif pulses          1111
Garlic                       1072
Jowar         

##### Remove crops whose frequency is less than 805. First, let's get a list of them

In [15]:
# Collect the labels of crops that occur in fewer than MIN_CROP_FREQUENCY rows.
# Filtering on the count itself (rather than taking a fixed number of entries
# from the tail of the frequency table) states the rule explicitly and keeps
# the cell idempotent: re-running it after the rare crops have been dropped
# yields an empty list instead of selecting crops that should be kept.
MIN_CROP_FREQUENCY = 805
crop_counts = df_original["Crop"].value_counts()
last_crops = crop_counts[crop_counts < MIN_CROP_FREQUENCY].index.tolist()

In [16]:
# Show the crop labels selected for removal.
print(last_crops)

['Sannhamp', 'Mesta', 'Tapioca', 'Cashewnut', 'Tobacco', 'Coconut', 'Soyabean', 'Arecanut', 'Jute', 'Black pepper', 'Niger seed', 'Safflower', 'Oilseeds total', 'Other Cereals & Millets', 'Cardamom', 'other oilseeds', 'Pulses total', 'Guar seed', 'Mango', 'Total foodgrain', 'Jack Fruit', 'Ginger', 'Drum Stick', 'Papaya', 'Pineapple', 'Pome Fruit', 'Brinjal', 'Bhindi', 'Tomato', 'Citrus Fruit', 'Korra', 'Moth', 'Varagu', 'Orange', 'Samai', 'Blackgram', 'Rubber', 'Other Vegetables', 'Grapes', 'Cabbage', 'Cond-spcs other', 'Tea', 'Cashewnut Raw', 'Lentil', 'Coffee', 'Turnip', 'Beans & Mutter(Vegetable)', 'Redish', 'Carrot', 'other misc. pulses']


In [17]:
# Sanity check on the container type of last_crops.
type(last_crops)

list

In [18]:
# Remove every row whose crop is listed in last_crops. A single `isin` mask
# filters all labels in one pass rather than rescanning the frame once per
# label; row index labels of the surviving rows are left unchanged.
# .copy() makes the result an independent frame so later edits do not
# trigger chained-assignment warnings.
df_original = df_original[~df_original["Crop"].isin(last_crops)].copy()

In [19]:
# Sanity check that df_original is still a DataFrame after the drop.
type(df_original)

pandas.core.frame.DataFrame

In [20]:
# Crop frequencies after dropping the labels in last_crops.
df_original.Crop.value_counts()

Rice                     5882
Maize                    4827
Moong(Green Gram)        3532
Urad                     3171
Sesamum                  2808
Wheat                    2584
Sugarcane                2576
Rapeseed &Mustard        2450
Potato                   2436
Groundnut                2346
Ragi                     2183
Arhar/Tur                2174
Horse-gram               1987
Gram                     1920
Dry chillies             1644
Onion                    1637
Small millets            1579
Turmeric                 1563
Sunflower                1425
Dry ginger               1387
Masoor                   1363
Sweet potato             1334
Peas & beans (Pulses)    1331
Barley                   1258
Banana                   1164
Linseed                  1132
Coriander                1123
Other Kharif pulses      1111
Garlic                   1072
Jowar                    1070
Bajra                     930
Other  Rabi pulses        863
Khesari                   838
Cotton(lin

In [21]:
# Season distribution of the rows that remain after the crop filter.
df_original.Season.value_counts()

Kharif        26281
Rabi          24645
Whole Year    15408
Name: Season, dtype: int64

In [22]:
# Number of distinct crops left after the frequency filter.
remaining_counts = df_original["Crop"].value_counts()
print(len(remaining_counts))

35


##### There are two insignificant crop labels left; let's drop them

In [23]:
# Drop the two "Other ... pulses" labels, which this analysis treats as
# insignificant. NOTE: 'Other  Rabi pulses' is spelled with two spaces in the
# data, so the literal must be kept verbatim for the match to succeed.
waste_crops = ["Other Kharif pulses", "Other  Rabi pulses"]
# One `isin` mask removes both labels in a single pass; .copy() yields an
# independent frame rather than a view-like slice of the previous one.
df_original = df_original[~df_original["Crop"].isin(waste_crops)].copy()

In [24]:
# Number of distinct crop labels left after removing waste_crops.
len(df_original.Crop.unique())

33

In [25]:
# Final per-crop row counts for the cleaned data.
df_original.Crop.value_counts()

Rice                     5882
Maize                    4827
Moong(Green Gram)        3532
Urad                     3171
Sesamum                  2808
Wheat                    2584
Sugarcane                2576
Rapeseed &Mustard        2450
Potato                   2436
Groundnut                2346
Ragi                     2183
Arhar/Tur                2174
Horse-gram               1987
Gram                     1920
Dry chillies             1644
Onion                    1637
Small millets            1579
Turmeric                 1563
Sunflower                1425
Dry ginger               1387
Masoor                   1363
Sweet potato             1334
Peas & beans (Pulses)    1331
Barley                   1258
Banana                   1164
Linseed                  1132
Coriander                1123
Garlic                   1072
Jowar                    1070
Bajra                     930
Khesari                   838
Cotton(lint)              829
Castor seed               805
Name: Crop

# Let's Save It

In [26]:
# Persist the filtered frame. index=False keeps the (now gapped) row labels
# out of the file, so the reloaded data gets a fresh 0..n-1 index.
cleaned_csv_path = '../Datasets/cleaned_yield_dataset.csv'
df_original.to_csv(cleaned_csv_path, index=False)

In [27]:
# Read the saved file back to verify the round trip.
df_cleaned = pd.read_csv('../Datasets/cleaned_yield_dataset.csv')

In [28]:
# Preview the first rows of the reloaded data.
df_cleaned.head()

Unnamed: 0,State,Year,Season,Crop,Area,Rainfall,Production
0,Andaman and Nicobar Islands,2000,Kharif,Rice,102.0,2763.2,321.0
1,Andaman and Nicobar Islands,2000,Whole Year,Banana,176.0,2763.2,641.0
2,Andaman and Nicobar Islands,2000,Whole Year,Dry ginger,36.0,2763.2,100.0
3,Andaman and Nicobar Islands,2000,Whole Year,Sugarcane,1.0,2763.2,2.0
4,Andaman and Nicobar Islands,2000,Whole Year,Sweet potato,5.0,2763.2,15.0


In [29]:
# (rows, columns) of the reloaded frame.
df_cleaned.shape

(64360, 7)

In [30]:
# Column dtypes and non-null counts of the reloaded frame.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64360 entries, 0 to 64359
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   State       64360 non-null  object 
 1   Year        64360 non-null  int64  
 2   Season      64360 non-null  object 
 3   Crop        64360 non-null  object 
 4   Area        64360 non-null  float64
 5   Rainfall    64360 non-null  float64
 6   Production  64360 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 3.4+ MB
