# Notebook 01: Data Collection and Loading

# Cell 1: Import necessary libraries

In [18]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Print the notebook title framed above and below by an 80-character rule.
banner_rule = "=" * 80
for banner_line in (banner_rule, "Notebook 01:Data Collection and Loading ", banner_rule):
    print(banner_line)

Notebook 01:Data Collection and Loading


In [20]:
# pandas display settings applied for the rest of the session:
# no column/row truncation, auto-detected width, floats shown with two decimals.
display_options = {
    "display.max_columns": None,
    "display.max_rows": None,
    "display.width": None,
    "display.float_format": "{:.2f}".format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)
print("\n Display options configured successfully!")


 Display options configured successfully!


Loading primary dataset

In [23]:
# Locate the raw-data directory. The CROP_DATA_DIR environment variable lets the
# notebook run on any machine; when it is unset we fall back to the original
# local path so existing setups keep working unchanged.
DATA_DIR = Path(
    os.environ.get(
        "CROP_DATA_DIR",
        r"D:\AI-Powered Crop Recommendation System with Explainable AI and Economic Analysis\data\raw",
    )
)
CROP_CSV_PATH = DATA_DIR / "Crop_recommendation.csv"

# Load the primary crop-recommendation dataset into memory.
df_crop = pd.read_csv(CROP_CSV_PATH)
print(f"Shape:{df_crop.shape}")
# deep=True also counts the Python string objects; bytes are converted to KB.
print(f"Memory Usage:{df_crop.memory_usage(deep=True).sum()/1024:.2f} KB")

Shape:(2200, 8)
Memory Usage:241.05 KB


# Initial Data Exploration

In [29]:
# Section 1: list every column together with the dtype pandas assigned on load.
print("\n1. COLUMN NAMES AND DATA TYPES", "-" * 40, sep="\n")
print(df_crop.dtypes)


1. COLUMN NAMES AND DATA TYPES
----------------------------------------
N                int64
P                int64
K                int64
temperature    float64
humidity       float64
ph             float64
rainfall       float64
label           object
dtype: object


In [30]:
# Section 2: preview the first ten records to sanity-check the parsed values.
first_rows = df_crop.head(10)
section_rule = "-" * 40
print("\n2. FIRST 10 ROWS")
print(section_rule)
print(first_rows)



2. FIRST 10 ROWS
----------------------------------------
    N   P   K  temperature  humidity   ph  rainfall label
0  90  42  43        20.88     82.00 6.50    202.94  rice
1  85  58  41        21.77     80.32 7.04    226.66  rice
2  60  55  44        23.00     82.32 7.84    263.96  rice
3  74  35  40        26.49     80.16 6.98    242.86  rice
4  78  42  42        20.13     81.60 7.63    262.72  rice
5  69  37  42        23.06     83.37 7.07    251.05  rice
6  69  55  38        22.71     82.64 5.70    271.32  rice
7  94  53  40        20.28     82.89 5.72    241.97  rice
8  89  54  38        24.52     83.54 6.69    230.45  rice
9  68  58  38        23.22     83.03 6.34    221.21  rice


In [31]:
# Section 3: preview the final five records (5 is tail()'s default, spelled out here).
print("\n3. LAST 5 ROWS", "-" * 40, sep="\n")
print(df_crop.tail(5))


3. LAST 5 ROWS
----------------------------------------
        N   P   K  temperature  humidity   ph  rainfall   label
2195  107  34  32        26.77     66.41 6.78    177.77  coffee
2196   99  15  27        27.42     56.64 6.09    127.92  coffee
2197  118  33  30        24.13     67.23 6.36    173.32  coffee
2198  117  32  34        26.27     52.13 6.76    127.18  coffee
2199  104  18  30        23.60     60.40 6.78    140.94  coffee


In [32]:
# Section 4: draw five rows at random; the fixed seed keeps the sample reproducible.
random_rows = df_crop.sample(n=5, random_state=42)
print("\n4. RANDOM SAMPLE (5 rows)")
print("-" * 40)
print(random_rows)


4. RANDOM SAMPLE (5 rows)
----------------------------------------
        N    P    K  temperature  humidity   ph  rainfall       label
1451  101   17   47        29.49     94.73 6.19     26.31   muskmelon
1334   98    8   51        26.18     86.52 6.26     49.43  watermelon
1761   59   62   49        43.36     93.35 6.94    114.78      papaya
1735   44   60   55        34.28     90.56 6.83     98.54      papaya
1576   30  137  200        22.91     90.70 5.60    118.60       apple


# Dataset Structure Analysis


In [33]:
# Print the frame's structural summary: index range, per-column non-null counts,
# dtypes and approximate memory usage.
df_crop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB


In [34]:
# Row count, formatted with a thousands separator.
print(f"Total rows:{len(df_crop):,}")

Total rows:2,200


In [35]:
# Column count, formatted with a thousands separator.
print(f"Total columns:{len(df_crop.columns):,}")

Total columns:8


In [37]:
# Number of cells in the frame (rows x columns), via DataFrame.size.
print(f"Total Data Points:{df_crop.size:,}")

Total Data Points:17,600


In [54]:
# One aligned line per column: name, dtype, non-null count and distinct-value count.
print("Column Details")
print("-"*40)

for column_name in df_crop.columns:
    column_values = df_crop[column_name]
    dtype_text = str(column_values.dtype)
    non_null_count = column_values.count()
    distinct_count = column_values.nunique()
    print(
        f"{column_name:15} | Type: {dtype_text:10} |"
        f"Non-Null: {non_null_count:6} |"
        f"Unique :{distinct_count:6}"
    )

Column Details
----------------------------------------
N               | Type: int64      |Non-Null:   2200 |Unique :   137
P               | Type: int64      |Non-Null:   2200 |Unique :   117
K               | Type: int64      |Non-Null:   2200 |Unique :    73
temperature     | Type: float64    |Non-Null:   2200 |Unique :  2200
humidity        | Type: float64    |Non-Null:   2200 |Unique :  2200
ph              | Type: float64    |Non-Null:   2200 |Unique :  2200
rainfall        | Type: float64    |Non-Null:   2200 |Unique :  2200
label           | Type: object     |Non-Null:   2200 |Unique :    22


# Missing Values Analysis

In [56]:
# Per-column count of null cells.
missing_values = df_crop.isna().sum()
# The same counts expressed as a percentage of the total number of rows.
missing_percentage = missing_values.div(len(df_crop)).mul(100)

In [57]:
# Assemble a tidy summary table: one row per column with its missing count and share.
missing_summary_columns = {
    "Column": df_crop.columns,
    "Missing_Count": missing_values.to_numpy(),
    "Missing Percentage": missing_percentage.to_numpy(),
}
missing_df = pd.DataFrame(missing_summary_columns)

In [63]:
# Heading for the table rendered below.
print("Missing Values Summary")
# Bare trailing expression: the notebook displays the frame as the cell's output.
missing_df

Missing Values Summary


Unnamed: 0,Column,Missing_Count,Missing Percentage
0,N,0,0.0
1,P,0,0.0
2,K,0,0.0
3,temperature,0,0.0
4,humidity,0,0.0
5,ph,0,0.0
6,rainfall,0,0.0
7,label,0,0.0


In [64]:
# Sum the per-column null counts into a single dataset-wide total.
total_missing = missing_values.sum()

In [65]:
# Report either a clean bill of health or the overall number of missing cells.
missing_report = (
    "\nNo missing values detected in the dataset."
    if total_missing == 0
    else f"\nTotal Missing Values in Dataset: {total_missing}"
)
print(missing_report)


No missing values detected in the dataset.


The dataset contains no null values, so no missing-value handling (imputation or row removal) is required.

# Duplicate Record Analysis

In [69]:
# Flag rows that exactly repeat an earlier row, then count them.
duplicate_mask = df_crop.duplicated()
duplicates = duplicate_mask.sum()
# Share of duplicated rows relative to the whole dataset, in percent.
duplicate_percentage = duplicates / len(df_crop) * 100

In [70]:
# Show the duplicate count and its percentage on consecutive lines.
print(
    f"Total Duplicates: {duplicates}",
    f"Duplicate Percentage: {duplicate_percentage:.2f}%",
    sep="\n",
)

Total Duplicates: 0
Duplicate Percentage: 0.00%


In [72]:
# When duplicates exist, show the count and a preview of every row involved
# (keep=False marks all members of each duplicate group, not just the repeats).
if duplicates != 0:
    duplicated_rows = df_crop[df_crop.duplicated(keep=False)]
    print(f"Total duplicate records found: {duplicates}")
    print(duplicated_rows.head())
else:
    print("No duplicate records found in the dataset.")

No duplicate records found in the dataset.


# Target Variable Analysis (Crop Labels)

In [78]:
# Number of samples recorded for each crop label.
label_column = df_crop["label"]
crop_counts = label_column.value_counts()
# Each crop's count as a share of all rows, in percent.
crop_percentage = crop_counts.div(len(df_crop)).mul(100)

In [83]:
# Display the per-label sample counts as the cell's output.
crop_counts

label
rice           100
maize          100
chickpea       100
kidneybeans    100
pigeonpeas     100
mothbeans      100
mungbean       100
blackgram      100
lentil         100
pomegranate    100
banana         100
mango          100
grapes         100
watermelon     100
muskmelon      100
apple          100
orange         100
papaya         100
coconut        100
cotton         100
jute           100
coffee         100
Name: count, dtype: int64

In [81]:
# Tabulate per-crop sample counts next to their percentage share of the dataset.
crop_distribution = pd.DataFrame(
    {
        "Crop": crop_counts.index,
        "Count": crop_counts.to_numpy(),
        "Percentage": crop_percentage.to_numpy(),
    }
)

crop_distribution

Unnamed: 0,Crop,Count,Percentage
0,rice,100,4.55
1,maize,100,4.55
2,chickpea,100,4.55
3,kidneybeans,100,4.55
4,pigeonpeas,100,4.55
5,mothbeans,100,4.55
6,mungbean,100,4.55
7,blackgram,100,4.55
8,lentil,100,4.55
9,pomegranate,100,4.55


In [85]:
# crop_counts comes from value_counts(), which orders labels by descending
# frequency: the first entry is the most frequent crop and the last the least.
# NOTE: when several crops share the same count, which of them lands first/last
# carries no meaning.
print(f"Total unique crops:{df_crop['label'].nunique()}")
# Both lines use the same "<crop> (<n> samples)" format so the count is unambiguous.
print(f"Most common crop:{crop_counts.index[0]} ({crop_counts.values[0]} samples)")
print(f"Least common crop:{crop_counts.index[-1]} ({crop_counts.values[-1]} samples)")

Total unique crops:22
Most common crop:rice 100
Least common crop:coffee (100 samples)


Check class balance

In [87]:
# Every crop has the same number of samples exactly when the counts
# collapse to a single distinct value.
is_balanced = crop_counts.nunique() == 1
balance_message = (
    "Perfectly balanced dataset. all crops have equal representation."
    if is_balanced
    else "Dataset is imbalanced. Some crops have significantly more samples than others."
)
print(balance_message)

Perfectly balanced dataset. all crops have equal representation.


In [91]:
# Enumerate the distinct crop labels alphabetically, numbered from 1.
print("List of all crops")
all_crops = list(df_crop["label"].unique())
all_crops.sort()
for rank, crop_name in enumerate(all_crops, start=1):
    print(f"{rank:2}. {crop_name}")

List of all crops
 1. apple
 2. banana
 3. blackgram
 4. chickpea
 5. coconut
 6. coffee
 7. cotton
 8. grapes
 9. jute
10. kidneybeans
11. lentil
12. maize
13. mango
14. mothbeans
15. mungbean
16. muskmelon
17. orange
18. papaya
19. pigeonpeas
20. pomegranate
21. rice
22. watermelon


# Numerical Features Statistical Summary

In [92]:
# Descriptive statistics (count, mean, std, min, quartiles, max) from describe().
numeric_summary = df_crop.describe()
print(numeric_summary)

            N       P       K  temperature  humidity      ph  rainfall
count 2200.00 2200.00 2200.00      2200.00   2200.00 2200.00   2200.00
mean    50.55   53.36   48.15        25.62     71.48    6.47    103.46
std     36.92   32.99   50.65         5.06     22.26    0.77     54.96
min      0.00    5.00    5.00         8.83     14.26    3.50     20.21
25%     21.00   28.00   20.00        22.77     60.26    5.97     64.55
50%     37.00   51.00   32.00        25.60     80.47    6.43     94.87
75%     84.25   68.00   49.00        28.56     89.95    6.92    124.27
max    140.00  145.00  205.00        43.68     99.98    9.94    298.56


In [95]:
# Names of all numeric columns ("number" is pandas' alias for np.number).
numerical_cols = df_crop.select_dtypes(include="number").columns

In [102]:
# Display the selected numeric column names as the cell's output.
numerical_cols

Index(['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall'], dtype='object')

In [114]:
# Statistic name -> function applied to each numeric column (as a Series).
# Insertion order defines the column order of the resulting table.
summary_functions = {
    "Mean": pd.Series.mean,
    "Median": pd.Series.median,
    "Std": pd.Series.std,
    "Min": pd.Series.min,
    "Max": pd.Series.max,
    "Range": lambda values: values.max() - values.min(),
    "Variance": pd.Series.var,
    "Skewness": pd.Series.skew,
    "Kurtosis": pd.Series.kurtosis,
}

# One row per numeric feature, one column per statistic.
stats_df = pd.DataFrame({
    "Feature": numerical_cols,
    **{
        stat_name: [stat_fn(df_crop[feature]) for feature in numerical_cols]
        for stat_name, stat_fn in summary_functions.items()
    },
})

In [116]:
# Print the per-feature statistics table as plain text.
print(stats_df)

       Feature   Mean  Median   Std   Min    Max  Range  Variance  Skewness  \
0            N  50.55   37.00 36.92  0.00 140.00 140.00   1362.89      0.51   
1            P  53.36   51.00 32.99  5.00 145.00 140.00   1088.07      1.01   
2            K  48.15   32.00 50.65  5.00 205.00 200.00   2565.21      2.38   
3  temperature  25.62   25.60  5.06  8.83  43.68  34.85     25.64      0.18   
4     humidity  71.48   80.47 22.26 14.26  99.98  85.72    495.68     -1.09   
5           ph   6.47    6.43  0.77  3.50   9.94   6.43      0.60      0.28   
6     rainfall 103.46   94.87 54.96 20.21 298.56 278.35   3020.42      0.97   

   Kurtosis  
0     -1.06  
1      0.86  
2      4.45  
3      1.23  
4      0.30  
5      1.66  
6      0.61  


# Feature Range Analysis